LLVM 20.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanUtils.h"
65#include "VPlanVerifier.h"
66#include "llvm/ADT/APInt.h"
67#include "llvm/ADT/ArrayRef.h"
68#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/Hashing.h"
71#include "llvm/ADT/MapVector.h"
72#include "llvm/ADT/STLExtras.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/TypeSwitch.h"
83#include "llvm/Analysis/CFG.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/Verifier.h"
128#include "llvm/Support/Casting.h"
130#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cstdint>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169 "llvm.loop.vectorize.followup_vectorized";
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
189 "epilogue-vectorization-minimum-VF", cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks"));
204
205// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206// that predication is preferred, and this lists all options. I.e., the
207// vectorizer will try to fold the tail-loop (epilogue) into the vector body
208// and predicate the instructions accordingly. If tail-folding fails, there are
209// different fallback strategies depending on these values:
211 enum Option {
215 };
216} // namespace PreferPredicateTy
217
219 "prefer-predicate-over-epilogue",
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
225 "scalar-epilogue",
226 "Don't tail-predicate loops, create scalar epilogue"),
228 "predicate-else-scalar-epilogue",
229 "prefer tail-folding, create scalar epilogue if tail "
230 "folding fails."),
232 "predicate-dont-vectorize",
233 "prefers tail-folding, don't attempt vectorization if "
234 "tail-folding fails.")));
235
237 "force-tail-folding-style", cl::desc("Force the tail folding style"),
238 cl::init(TailFoldingStyle::None),
240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
242 TailFoldingStyle::Data, "data",
243 "Create lane mask for data only, using active.lane.mask intrinsic"),
244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245 "data-without-lane-mask",
246 "Create lane mask with compare/stepvector"),
247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248 "Create lane mask using active.lane.mask intrinsic, and use "
249 "it for both data and control flow"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251 "data-and-control-without-rt-check",
252 "Similar to data-and-control, but remove the runtime check"),
253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
256
258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259 cl::desc("Maximize bandwidth when selecting vectorization factor which "
260 "will be determined by the smallest type in loop."));
261
263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265
266/// An interleave-group may need masking if it resides in a block that needs
267/// predication, or in order to mask away gaps.
269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271
273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
277 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
291 "force-target-instruction-cost", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298 cl::desc(
299 "Pretend that scalable vectors are supported, even if the target does "
300 "not support them. This flag should only be used for testing."));
301
303 "small-loop-cost", cl::init(20), cl::Hidden,
304 cl::desc(
305 "The cost of a loop that is considered 'small' by the interleaver."));
306
308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309 cl::desc("Enable the use of the block frequency analysis to access PGO "
310 "heuristics minimizing code growth in cold regions and being more "
311 "aggressive in hot regions."));
312
313// Runtime interleave loops for load/store throughput.
315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316 cl::desc(
317 "Enable runtime interleaving until load/store ports are saturated"));
318
319/// The number of stores in a loop that are allowed to need predication.
321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322 cl::desc("Max number of stores to be predicated behind an if."));
323
325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326 cl::desc("Count the induction variable only once when interleaving"));
327
329 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330 cl::desc("Enable if predication of stores during vectorization."));
331
333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334 cl::desc("The maximum interleave count to use when interleaving a scalar "
335 "reduction in a nested loop."));
336
337static cl::opt<bool>
338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
340 cl::desc("Prefer in-loop vector reductions, "
341 "overriding the targets preference."));
342
344 "force-ordered-reductions", cl::init(false), cl::Hidden,
345 cl::desc("Enable the vectorisation of loops with in-order (strict) "
346 "FP reductions"));
347
349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350 cl::desc(
351 "Prefer predicating a reduction operation over an after loop select."));
352
353namespace llvm {
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358} // namespace llvm
359
360// This flag enables the stress testing of the VPlan H-CFG construction in the
361// VPlan-native vectorization path. It must be used in conjunction with
362// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363// verification of the H-CFGs built.
365 "vplan-build-stress-test", cl::init(false), cl::Hidden,
366 cl::desc(
367 "Build VPlan for every supported loop nest in the function and bail "
368 "out right after the build (stress test the VPlan H-CFG construction "
369 "in the VPlan-native vectorization path)."));
370
372 "interleave-loops", cl::init(true), cl::Hidden,
373 cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 "vectorize-loops", cl::init(true), cl::Hidden,
376 cl::desc("Run the Loop vectorization passes"));
377
379 "force-widen-divrem-via-safe-divisor", cl::Hidden,
380 cl::desc(
381 "Override cost based safe divisor widening for div/rem instructions"));
382
384 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
386 cl::desc("Try wider VFs if they enable the use of vector variants"));
387
389 "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390 cl::desc(
391 "Enable vectorization of early exit loops with uncountable exits."));
392
// NOTE(review): each pair below is presumably a {bypass, no-bypass} weight
// pair attached to the corresponding check branch — confirm against the
// emit* helpers named in each comment.
393// Likelihood of bypassing the vectorized loop because assumptions about SCEV
394// variables not overflowing do not hold. See `emitSCEVChecks`.
395static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396// Likelihood of bypassing the vectorized loop because pointers overlap. See
397// `emitMemRuntimeChecks`.
398static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399// Likelihood of bypassing the vectorized loop because there are zero trips left
400// after prolog. See `emitIterationCountCheck`.
401static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
402
403/// A helper function that returns true if the given type is irregular. The
404/// type is irregular if its allocated size doesn't equal the store size of an
405/// element of the corresponding vector type.
406static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407 // Determine if an array of N elements of type Ty is "bitcast compatible"
408 // with a <N x Ty> vector.
409 // This is only true if there is no padding between the array elements.
410 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411}
412
413/// Returns "best known" trip count for the specified loop \p L as defined by
414/// the following procedure:
415/// 1) Returns exact trip count if it is known.
416/// 2) Returns expected trip count according to profile data if any.
417/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418/// 4) Returns std::nullopt if all of the above failed.
419static std::optional<unsigned>
421 bool CanUseConstantMax = true) {
422 // Check if exact trip count is known.
423 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424 return ExpectedTC;
425
426 // Check if there is an expected trip count available from profile data.
428 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429 return *EstimatedTC;
430
431 if (!CanUseConstantMax)
432 return std::nullopt;
433
434 // Check if upper bound estimate is known.
435 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436 return ExpectedTC;
437
438 return std::nullopt;
439}
440
441namespace {
442// Forward declare GeneratedRTChecks.
443class GeneratedRTChecks;
444
// Map keyed by SCEV expression pointer, yielding an IR Value pointer. It is the
// type of the ExpandedSCEVs parameters used during skeleton creation.
445using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446} // namespace
447
448namespace llvm {
449
451
452/// InnerLoopVectorizer vectorizes loops which contain only one basic
453/// block to a specified vectorization factor (VF).
454/// This class performs the widening of scalars into vectors, or multiple
455/// scalars. This class also implements the following features:
456/// * It inserts an epilogue loop for handling loops that don't have iteration
457/// counts that are known to be a multiple of the vectorization factor.
458/// * It handles the code generation for reduction variables.
459/// * Scalarization (implementation using scalars) of un-vectorizable
460/// instructions.
461/// InnerLoopVectorizer does not perform any vectorization-legality
462/// checks, and relies on the caller to check for the different legality
463/// aspects. The InnerLoopVectorizer relies on the
464/// LoopVectorizationLegality class to provide information about the induction
465/// and reduction variables that were found for a given vectorization factor.
467public:
470 const TargetLibraryInfo *TLI,
474 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
476 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477 VPlan &Plan)
478 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479 AC(AC), ORE(ORE), VF(VecWidth),
481 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483 // Query this against the original loop and save it here because the profile
484 // of the original loop header may change as the transformation happens.
487 }
488
489 virtual ~InnerLoopVectorizer() = default;
490
491 /// Create a new empty loop that will contain vectorized instructions later
492 /// on, while the old loop will be used as the scalar remainder. Control flow
493 /// is generated around the vectorized (and scalar epilogue) loops consisting
494 /// of various checks and bypasses. Return the pre-header block of the new
495 /// loop. In the case of epilogue vectorization, this function is overridden to
496 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
497 /// used to look up SCEV expansions for expressions needed during skeleton
498 /// creation.
499 virtual BasicBlock *
500 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
501
502 /// Fix the vectorized code, taking care of header phi's, and more.
504
505 // Return true if any runtime check is added.
507
508 /// A helper function to scalarize a single Instruction in the innermost loop.
509 /// Generates a sequence of scalar instances for each lane between \p MinLane
510 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
511 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
512 /// Instr's operands.
513 void scalarizeInstruction(const Instruction *Instr,
514 VPReplicateRecipe *RepRecipe, const VPLane &Lane,
515 VPTransformState &State);
516
517 /// Fix the non-induction PHIs in \p Plan.
519
520 /// Create a ResumePHI VPInstruction for the induction \p InductionPhiIRI to
521 /// resume iteration count in the scalar epilogue from where the vectorized
522 /// loop left off, and add it to the scalar preheader of VPlan. Also creates
523 /// the induction resume value, and the value for the bypass block, if needed.
524 /// \p Step is the SCEV-expanded induction step to use. In cases where the
525 /// loop skeleton is more complicated (i.e., epilogue vectorization) and the
526 /// resume values can come from an additional bypass block,
527 /// \p MainVectorTripCount provides the trip count of the main vector loop,
528 /// used to compute the resume value reaching the scalar loop preheader
529 /// directly from this additional bypass block.
530 void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI,
531 const InductionDescriptor &ID, Value *Step,
532 ArrayRef<BasicBlock *> BypassBlocks,
533 VPBuilder &ScalarPHBuilder,
534 Value *MainVectorTripCount = nullptr);
535
536 /// Returns the original loop trip count.
537 Value *getTripCount() const { return TripCount; }
538
539 /// Used to set the trip count after ILV's construction and after the
540 /// preheader block has been executed. Note that this always holds the trip
541 /// count of the original loop for both main loop and epilogue vectorization.
542 void setTripCount(Value *TC) { TripCount = TC; }
543
544 /// Retrieve the additional bypass value associated with an original
545 /// induction header phi.
547 return Induction2AdditionalBypassValue.at(OrigPhi);
548 }
549
550 /// Return the additional bypass block which targets the scalar loop by
551 /// skipping the epilogue loop after completing the main loop.
554 "Trying to access AdditionalBypassBlock but it has not been set");
556 }
557
558protected:
560
561 /// Set up the values of the IVs correctly when exiting the vector loop.
562 virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
563 Value *VectorTripCount, BasicBlock *MiddleBlock,
564 VPTransformState &State);
565
566 /// Iteratively sink the scalarized operands of a predicated instruction into
567 /// the block that was created for it.
568 void sinkScalarOperands(Instruction *PredInst);
569
570 /// Returns (and creates if needed) the trip count of the widened loop.
572
573 /// Emit a bypass check to see if the vector trip count is zero, including if
574 /// it overflows.
576
577 /// Emit a bypass check to see if all of the SCEV assumptions we've
578 /// had to make are correct. Returns the block containing the checks or
579 /// nullptr if no checks have been added.
581
582 /// Emit bypass checks to check any memory assumptions we may have made.
583 /// Returns the block containing the checks or nullptr if no checks have been
584 /// added.
586
587 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
588 /// vector loop preheader, middle block and scalar preheader.
590
591 /// Create new phi nodes for the induction variables to resume iteration count
592 /// in the scalar epilogue, from where the vectorized loop left off.
593 /// In cases where the loop skeleton is more complicated (i.e. epilogue
594 /// vectorization), \p MainVectorTripCount provides the trip count of the main
595 /// loop, used to compute these resume values. If \p IVSubset is provided, it
596 /// contains the phi nodes for which resume values are needed, because they
597 /// will generate wide induction phis in the epilogue loop.
598 void
599 createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs,
600 Value *MainVectorTripCount = nullptr,
601 SmallPtrSetImpl<PHINode *> *IVSubset = nullptr);
602
603 /// Allow subclasses to override and print debug traces before/after vplan
604 /// execution, when trace information is requested.
605 virtual void printDebugTracesAtStart() {}
606 virtual void printDebugTracesAtEnd() {}
607
608 /// The original loop.
610
611 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
612 /// dynamic knowledge to simplify SCEV expressions and converts them to a
613 /// more usable form.
615
616 /// Loop Info.
618
619 /// Dominator Tree.
621
622 /// Target Library Info.
624
625 /// Target Transform Info.
627
628 /// Assumption Cache.
630
631 /// Interface to emit optimization remarks.
633
634 /// The vectorization SIMD factor to use. Each vector will have this many
635 /// vector elements.
637
639
640 /// The vectorization unroll factor to use. Each scalar is vectorized to this
641 /// many different vector instructions.
642 unsigned UF;
643
644 /// The builder that we use
646
647 // --- Vectorization state ---
648
649 /// The vector-loop preheader.
651
652 /// The scalar-loop preheader.
654
655 /// Middle Block between the vector and the scalar.
657
658 /// A list of all bypass blocks. The first block is the entry of the loop.
660
661 /// Store instructions that were predicated.
663
664 /// Trip count of the original loop.
665 Value *TripCount = nullptr;
666
667 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
669
670 /// The legality analysis.
672
673 /// The profitability analysis.
675
676 // Record whether runtime checks are added.
677 bool AddedSafetyChecks = false;
678
679 /// BFI and PSI are used to check for profile guided size optimizations.
682
683 // Whether this loop should be optimized for size based on profile guided size
684 // optimizations.
686
687 /// Structure to hold information about generated runtime checks, responsible
688 /// for cleaning the checks, if vectorization turns out unprofitable.
689 GeneratedRTChecks &RTChecks;
690
691 /// Mapping of induction phis to their additional bypass values. They
692 /// need to be added as operands to phi nodes in the scalar loop preheader
693 /// after the epilogue skeleton has been created.
695
696 /// The additional bypass block which conditionally skips over the epilogue
697 /// loop after executing the main loop. Needed to resume inductions and
698 /// reductions during epilogue vectorization.
700
702};
703
704/// Encapsulate information regarding vectorization of a loop and its epilogue.
705/// This information is meant to be updated and used across two stages of
706/// epilogue vectorization.
709 unsigned MainLoopUF = 0;
711 unsigned EpilogueUF = 0;
716 Value *TripCount = nullptr;
719
721 ElementCount EVF, unsigned EUF,
723 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
725 assert(EUF == 1 &&
726 "A high UF for the epilogue loop is likely not beneficial.");
727 }
728};
729
730/// An extension of the inner loop vectorizer that creates a skeleton for a
731/// vectorized loop that has its epilogue (residual) also vectorized.
732/// The idea is to run the vplan on a given loop twice, firstly to setup the
733/// skeleton and vectorize the main loop, and secondly to complete the skeleton
734/// from the first step and vectorize the epilogue. This is achieved by
735/// deriving two concrete strategy classes from this base class and invoking
736/// them in succession from the loop vectorizer planner.
738public:
746 GeneratedRTChecks &Checks, VPlan &Plan)
748 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
749 CM, BFI, PSI, Checks, Plan),
750 EPI(EPI) {}
751
752 // Override this function to handle the more complex control flow around the
753 // three loops.
754 BasicBlock *
755 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
756 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
757 }
758
759 /// The interface for creating a vectorized skeleton using one of two
760 /// different strategies, each corresponding to one execution of the vplan
761 /// as described above.
762 virtual BasicBlock *
763 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
764
765 /// Holds and updates state information required to vectorize the main loop
766 /// and its epilogue in two separate passes. This setup helps us avoid
767 /// regenerating and recomputing runtime safety checks. It also helps us to
768 /// shorten the iteration-count-check path length for the cases where the
769 /// iteration count of the loop is so small that the main vector loop is
770 /// completely skipped.
772};
773
774/// A specialized derived class of inner loop vectorizer that performs
775/// vectorization of *main* loops in the process of vectorizing loops and their
776/// epilogues.
778public:
786 GeneratedRTChecks &Check, VPlan &Plan)
788 EPI, LVL, CM, BFI, PSI, Check, Plan) {}
789 /// Implements the interface for creating a vectorized skeleton using the
790 /// *main loop* strategy (ie the first pass of vplan execution).
791 BasicBlock *
792 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
793
794protected:
795 /// Emits an iteration count bypass check once for the main loop (when \p
796 /// ForEpilogue is false) and once for the epilogue loop (when \p
797 /// ForEpilogue is true).
798 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
799 void printDebugTracesAtStart() override;
800 void printDebugTracesAtEnd() override;
801
803 Value *VectorTripCount, BasicBlock *MiddleBlock,
804 VPTransformState &State) override {};
805};
806
807// A specialized derived class of inner loop vectorizer that performs
808// vectorization of *epilogue* loops in the process of vectorizing loops and
809// their epilogues.
811public:
819 GeneratedRTChecks &Checks, VPlan &Plan)
821 EPI, LVL, CM, BFI, PSI, Checks, Plan) {
823 }
824 /// Implements the interface for creating a vectorized skeleton using the
825 /// *epilogue loop* strategy (ie the second pass of vplan execution).
826 BasicBlock *
827 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
828
829protected:
830 /// Emits an iteration count bypass check after the main vector loop has
831 /// finished to see if there are any iterations left to execute by either
832 /// the vector epilogue or the scalar epilogue.
834 BasicBlock *Bypass,
835 BasicBlock *Insert);
836 void printDebugTracesAtStart() override;
837 void printDebugTracesAtEnd() override;
838};
839} // end namespace llvm
840
841/// Look for a meaningful debug location on the instruction or its operands.
843 if (!I)
844 return DebugLoc();
845
846 DebugLoc Empty;
847 if (I->getDebugLoc() != Empty)
848 return I->getDebugLoc();
849
850 for (Use &Op : I->operands()) {
851 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
852 if (OpInst->getDebugLoc() != Empty)
853 return OpInst->getDebugLoc();
854 }
855
856 return I->getDebugLoc();
857}
858
859/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
860/// is passed, the message relates to that particular instruction.
861#ifndef NDEBUG
862static void debugVectorizationMessage(const StringRef Prefix,
863 const StringRef DebugMsg,
864 Instruction *I) {
865 dbgs() << "LV: " << Prefix << DebugMsg;
866 if (I != nullptr)
867 dbgs() << " " << *I;
868 else
869 dbgs() << '.';
870 dbgs() << '\n';
871}
872#endif
873
874/// Create an analysis remark that explains why vectorization failed
875///
876/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
877/// RemarkName is the identifier for the remark. If \p I is passed it is an
878/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
879/// the location of the remark. If \p DL is passed, use it as debug location for
880/// the remark. \return the remark object that can be streamed to.
882createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
883 Instruction *I, DebugLoc DL = {}) {
884 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
885 // If debug location is attached to the instruction, use it. Otherwise if DL
886 // was not provided, use the loop's.
887 if (I && I->getDebugLoc())
888 DL = I->getDebugLoc();
889 else if (!DL)
890 DL = TheLoop->getStartLoc();
891
892 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
893}
894
895namespace llvm {
896
897/// Return a value for Step multiplied by VF.
899 int64_t Step) {
900 assert(Ty->isIntegerTy() && "Expected an integer step");
901 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
902}
903
904/// Return the runtime value for VF.
906 return B.CreateElementCount(Ty, VF);
907}
908
910 const StringRef OREMsg, const StringRef ORETag,
911 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
912 Instruction *I) {
913 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
914 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
915 ORE->emit(
916 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
917 << "loop not vectorized: " << OREMsg);
918}
919
920/// Reports an informative message: print \p Msg for debugging purposes as well
921/// as an optimization remark. Uses either \p I as location of the remark, or
922/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
923/// remark.
924static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
926 Loop *TheLoop, Instruction *I = nullptr,
927 DebugLoc DL = {}) {
929 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
930 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
931 I, DL)
932 << Msg);
933}
934
935/// Report successful vectorization of the loop. In case an outer loop is
936/// vectorized, prepend "outer" to the vectorization remark.
938 VectorizationFactor VF, unsigned IC) {
940 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
941 nullptr));
942 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
943 ORE->emit([&]() {
944 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
945 TheLoop->getHeader())
946 << "vectorized " << LoopType << "loop (vectorization width: "
947 << ore::NV("VectorizationFactor", VF.Width)
948 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
949 });
950}
951
952} // end namespace llvm
953
954namespace llvm {
955
956// Loop vectorization cost-model hints how the scalar epilogue loop should be
957// lowered.
959
960 // The default: allowing scalar epilogues.
962
963 // Vectorization with OptForSize: don't allow epilogues.
965
966 // A special case of vectorisation with OptForSize: loops with a very small
967 // trip count are considered for vectorization under OptForSize, thereby
968 // making sure the cost of their loop body is dominant, free of runtime
969 // guards and scalar iteration overheads.
971
972 // Loop hint predicate indicating an epilogue is undesired.
974
975 // Directive indicating we must either tail fold or not vectorize
978
979using InstructionVFPair = std::pair<Instruction *, ElementCount>;
980
981/// LoopVectorizationCostModel - estimates the expected speedups due to
982/// vectorization.
983/// In many cases vectorization is not profitable. This can happen because of
984/// a number of reasons. In this class we mainly attempt to predict the
985/// expected speedup/slowdowns due to the supported instruction set. We use the
986/// TargetTransformInfo to query the different backends for the cost of
987/// different operations.
990
991public:
1001 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1002 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1003 Hints(Hints), InterleaveInfo(IAI) {}
1004
1005 /// \return An upper bound for the vectorization factors (both fixed and
1006 /// scalable). If the factors are 0, vectorization and interleaving should be
1007 /// avoided up front.
1008 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1009
1010 /// \return True if runtime checks are required for vectorization, and false
1011 /// otherwise.
1012 bool runtimeChecksRequired();
1013
1014 /// Setup cost-based decisions for user vectorization factor.
1015 /// \return true if the UserVF is a feasible VF to be chosen.
1019 return expectedCost(UserVF).isValid();
1020 }
1021
1022 /// \return The size (in bits) of the smallest and widest types in the code
1023 /// that needs to be vectorized. We ignore values that remain scalar such as
1024 /// 64 bit loop indices.
1025 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1026
1027 /// \return The desired interleave count.
1028 /// If interleave count has been specified by metadata it will be returned.
1029 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1030 /// are the selected vectorization factor and the cost of the selected VF.
1031 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1032
1033 /// Memory access instruction may be vectorized in more than one way.
1034 /// Form of instruction after vectorization depends on cost.
1035 /// This function takes cost-based decisions for Load/Store instructions
1036 /// and collects them in a map. This decisions map is used for building
1037 /// the lists of loop-uniform and loop-scalar instructions.
1038 /// The calculated cost is saved with widening decision in order to
1039 /// avoid redundant calculations.
1041
1042 /// A call may be vectorized in different ways depending on whether we have
1043 /// vectorized variants available and whether the target supports masking.
1044 /// This function analyzes all calls in the function at the supplied VF,
1045 /// makes a decision based on the costs of available options, and stores that
1046 /// decision in a map for use in planning and plan execution.
1048
1049 /// A struct that represents some properties of the register usage
1050 /// of a loop.
1052 /// Holds the number of loop invariant values that are used in the loop.
1053 /// The key is ClassID of target-provided register class.
1055 /// Holds the maximum number of concurrent live intervals in the loop.
1056 /// The key is ClassID of target-provided register class.
1058 };
1059
1060 /// \return Returns information about the register usages of the loop for the
1061 /// given vectorization factors.
1064
1065 /// Collect values we want to ignore in the cost model.
1066 void collectValuesToIgnore();
1067
1068 /// Collect all element types in the loop for which widening is needed.
1070
1071 /// Split reductions into those that happen in the loop, and those that happen
1072 /// outside. In loop reductions are collected into InLoopReductions.
1074
1075 /// Returns true if we should use strict in-order reductions for the given
1076 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1077 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1078 /// of FP operations.
1079 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1080 return !Hints->allowReordering() && RdxDesc.isOrdered();
1081 }
1082
1083 /// \returns The smallest bitwidth each instruction can be represented with.
1084 /// The vector equivalents of these instructions should be truncated to this
1085 /// type.
1087 return MinBWs;
1088 }
1089
1090 /// \returns True if it is more profitable to scalarize instruction \p I for
1091 /// vectorization factor \p VF.
1093 assert(VF.isVector() &&
1094 "Profitable to scalarize relevant only for VF > 1.");
1095 assert(
1096 TheLoop->isInnermost() &&
1097 "cost-model should not be used for outer loops (in VPlan-native path)");
1098
1099 auto Scalars = InstsToScalarize.find(VF);
1100 assert(Scalars != InstsToScalarize.end() &&
1101 "VF not yet analyzed for scalarization profitability");
1102 return Scalars->second.contains(I);
1103 }
1104
1105 /// Returns true if \p I is known to be uniform after vectorization.
1107 assert(
1108 TheLoop->isInnermost() &&
1109 "cost-model should not be used for outer loops (in VPlan-native path)");
1110 // Pseudo probe needs to be duplicated for each unrolled iteration and
1111 // vector lane so that profiled loop trip count can be accurately
1112 // accumulated instead of being under counted.
1113 if (isa<PseudoProbeInst>(I))
1114 return false;
1115
1116 if (VF.isScalar())
1117 return true;
1118
1119 auto UniformsPerVF = Uniforms.find(VF);
1120 assert(UniformsPerVF != Uniforms.end() &&
1121 "VF not yet analyzed for uniformity");
1122 return UniformsPerVF->second.count(I);
1123 }
1124
1125 /// Returns true if \p I is known to be scalar after vectorization.
1127 assert(
1128 TheLoop->isInnermost() &&
1129 "cost-model should not be used for outer loops (in VPlan-native path)");
1130 if (VF.isScalar())
1131 return true;
1132
1133 auto ScalarsPerVF = Scalars.find(VF);
1134 assert(ScalarsPerVF != Scalars.end() &&
1135 "Scalar values are not calculated for VF");
1136 return ScalarsPerVF->second.count(I);
1137 }
1138
1139 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1140 /// for vectorization factor \p VF.
1142 return VF.isVector() && MinBWs.contains(I) &&
1143 !isProfitableToScalarize(I, VF) &&
1145 }
1146
1147 /// Decision that was taken during cost calculation for memory instruction.
1150 CM_Widen, // For consecutive accesses with stride +1.
1151 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1158
1159 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1160 /// instruction \p I and vector width \p VF.
1163 assert(VF.isVector() && "Expected VF >=2");
1164 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1165 }
1166
1167 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1168 /// interleaving group \p Grp and vector width \p VF.
1172 assert(VF.isVector() && "Expected VF >=2");
1173 /// Broadcast this decision to all instructions inside the group.
1174 /// When interleaving, the cost will only be assigned one instruction, the
1175 /// insert position. For other cases, add the appropriate fraction of the
1176 /// total cost to each instruction. This ensures accurate costs are used,
1177 /// even if the insert position instruction is not used.
1178 InstructionCost InsertPosCost = Cost;
1179 InstructionCost OtherMemberCost = 0;
1180 if (W != CM_Interleave)
1181 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1182 ;
1183 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1184 if (auto *I = Grp->getMember(Idx)) {
1185 if (Grp->getInsertPos() == I)
1186 WideningDecisions[std::make_pair(I, VF)] =
1187 std::make_pair(W, InsertPosCost);
1188 else
1189 WideningDecisions[std::make_pair(I, VF)] =
1190 std::make_pair(W, OtherMemberCost);
1191 }
1192 }
1193 }
1194
1195 /// Return the cost model decision for the given instruction \p I and vector
1196 /// width \p VF. Return CM_Unknown if this instruction did not pass
1197 /// through the cost modeling.
1199 assert(VF.isVector() && "Expected VF to be a vector VF");
1200 assert(
1201 TheLoop->isInnermost() &&
1202 "cost-model should not be used for outer loops (in VPlan-native path)");
1203
1204 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1205 auto Itr = WideningDecisions.find(InstOnVF);
1206 if (Itr == WideningDecisions.end())
1207 return CM_Unknown;
1208 return Itr->second.first;
1209 }
1210
1211 /// Return the vectorization cost for the given instruction \p I and vector
1212 /// width \p VF.
1214 assert(VF.isVector() && "Expected VF >=2");
1215 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1216 assert(WideningDecisions.contains(InstOnVF) &&
1217 "The cost is not calculated");
1218 return WideningDecisions[InstOnVF].second;
1219 }
1220
1225 std::optional<unsigned> MaskPos;
1227 };
1228
1230 Function *Variant, Intrinsic::ID IID,
1231 std::optional<unsigned> MaskPos,
1233 assert(!VF.isScalar() && "Expected vector VF");
1234 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1235 MaskPos, Cost};
1236 }
1237
1239 ElementCount VF) const {
1240 assert(!VF.isScalar() && "Expected vector VF");
1241 return CallWideningDecisions.at(std::make_pair(CI, VF));
1242 }
1243
1244 /// Return True if instruction \p I is an optimizable truncate whose operand
1245 /// is an induction variable. Such a truncate will be removed by adding a new
1246 /// induction variable with the destination type.
1248 // If the instruction is not a truncate, return false.
1249 auto *Trunc = dyn_cast<TruncInst>(I);
1250 if (!Trunc)
1251 return false;
1252
1253 // Get the source and destination types of the truncate.
1254 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1255 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1256
1257 // If the truncate is free for the given types, return false. Replacing a
1258 // free truncate with an induction variable would add an induction variable
1259 // update instruction to each iteration of the loop. We exclude from this
1260 // check the primary induction variable since it will need an update
1261 // instruction regardless.
1262 Value *Op = Trunc->getOperand(0);
1263 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1264 return false;
1265
1266 // If the truncated value is not an induction variable, return false.
1267 return Legal->isInductionPhi(Op);
1268 }
1269
1270 /// Collects the instructions to scalarize for each predicated instruction in
1271 /// the loop.
1273
1274 /// Collect Uniform and Scalar values for the given \p VF.
1275 /// The sets depend on CM decision for Load/Store instructions
1276 /// that may be vectorized as interleave, gather-scatter or scalarized.
1277 /// Also make a decision on what to do about call instructions in the loop
1278 /// at that VF -- scalarize, call a known vector routine, or call a
1279 /// vector intrinsic.
1281 // Do the analysis once.
1282 if (VF.isScalar() || Uniforms.contains(VF))
1283 return;
1285 collectLoopUniforms(VF);
1287 collectLoopScalars(VF);
1288 }
1289
1290 /// Returns true if the target machine supports masked store operation
1291 /// for the given \p DataType and kind of access to \p Ptr.
1292 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1293 return Legal->isConsecutivePtr(DataType, Ptr) &&
1294 TTI.isLegalMaskedStore(DataType, Alignment);
1295 }
1296
1297 /// Returns true if the target machine supports masked load operation
1298 /// for the given \p DataType and kind of access to \p Ptr.
1299 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1300 return Legal->isConsecutivePtr(DataType, Ptr) &&
1301 TTI.isLegalMaskedLoad(DataType, Alignment);
1302 }
1303
1304 /// Returns true if the target machine can represent \p V as a masked gather
1305 /// or scatter operation.
1307 bool LI = isa<LoadInst>(V);
1308 bool SI = isa<StoreInst>(V);
1309 if (!LI && !SI)
1310 return false;
1311 auto *Ty = getLoadStoreType(V);
1313 if (VF.isVector())
1314 Ty = VectorType::get(Ty, VF);
1315 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1316 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1317 }
1318
1319 /// Returns true if the target machine supports all of the reduction
1320 /// variables found for the given VF.
1322 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1323 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1324 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1325 }));
1326 }
1327
1328 /// Given costs for both strategies, return true if the scalar predication
1329 /// lowering should be used for div/rem. This incorporates an override
1330 /// option so it is not simply a cost comparison.
1332 InstructionCost SafeDivisorCost) const {
1333 switch (ForceSafeDivisor) {
1334 case cl::BOU_UNSET:
1335 return ScalarCost < SafeDivisorCost;
1336 case cl::BOU_TRUE:
1337 return false;
1338 case cl::BOU_FALSE:
1339 return true;
1340 }
1341 llvm_unreachable("impossible case value");
1342 }
1343
1344 /// Returns true if \p I is an instruction which requires predication and
1345 /// for which our chosen predication strategy is scalarization (i.e. we
1346 /// don't have an alternate strategy such as masking available).
1347 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1349
1350 /// Returns true if \p I is an instruction that needs to be predicated
1351 /// at runtime. The result is independent of the predication mechanism.
1352 /// Superset of instructions that return true for isScalarWithPredication.
1353 bool isPredicatedInst(Instruction *I) const;
1354
1355 /// Return the costs for our two available strategies for lowering a
1356 /// div/rem operation which requires speculating at least one lane.
1357 /// First result is for scalarization (will be invalid for scalable
1358 /// vectors); second is for the safe-divisor strategy.
1359 std::pair<InstructionCost, InstructionCost>
1361 ElementCount VF) const;
1362
1363 /// Returns true if \p I is a memory instruction with consecutive memory
1364 /// access that can be widened.
1366
1367 /// Returns true if \p I is a memory instruction in an interleaved-group
1368 /// of memory accesses that can be vectorized with wide vector loads/stores
1369 /// and shuffles.
1371
1372 /// Check if \p Instr belongs to any interleaved access group.
1374 return InterleaveInfo.isInterleaved(Instr);
1375 }
1376
1377 /// Get the interleaved access group that \p Instr belongs to.
1380 return InterleaveInfo.getInterleaveGroup(Instr);
1381 }
1382
1383 /// Returns true if we're required to use a scalar epilogue for at least
1384 /// the final iteration of the original loop.
1385 bool requiresScalarEpilogue(bool IsVectorizing) const {
1386 if (!isScalarEpilogueAllowed()) {
1387 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1388 return false;
1389 }
1390 // If we might exit from anywhere but the latch and early exit vectorization
1391 // is disabled, we must run the exiting iteration in scalar form.
1394 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1395 "from latch block\n");
1396 return true;
1397 }
1398 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1399 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1400 "interleaved group requires scalar epilogue\n");
1401 return true;
1402 }
1403 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1404 return false;
1405 }
1406
1407 /// Returns true if we're required to use a scalar epilogue for at least
1408 /// the final iteration of the original loop for all VFs in \p Range.
1409 /// A scalar epilogue must either be required for all VFs in \p Range or for
1410 /// none.
1412 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1413 return requiresScalarEpilogue(VF.isVector());
1414 };
1415 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1416 assert(
1417 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1418 "all VFs in range must agree on whether a scalar epilogue is required");
1419 return IsRequired;
1420 }
1421
1422 /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited by
1423 /// optsize or a loop hint annotation.
1425 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1426 }
1427
1428 /// Returns the TailFoldingStyle that is best for the current loop.
1429 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1430 if (!ChosenTailFoldingStyle)
1432 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1433 : ChosenTailFoldingStyle->second;
1434 }
1435
1436 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1437 /// overflow or not.
1438 /// \param IsScalableVF true if scalable vector factors enabled.
1439 /// \param UserIC User specific interleave count.
1440 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1441 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1442 if (!Legal->canFoldTailByMasking()) {
1443 ChosenTailFoldingStyle =
1445 return;
1446 }
1447
1448 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1449 ChosenTailFoldingStyle = std::make_pair(
1450 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1451 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1452 return;
1453 }
1454
1455 // Set styles when forced.
1456 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1457 ForceTailFoldingStyle.getValue());
1459 return;
1460 // Override forced styles if needed.
1461 // FIXME: use actual opcode/data type for analysis here.
1462 // FIXME: Investigate opportunity for fixed vector factor.
1463 bool EVLIsLegal = UserIC <= 1 &&
1464 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1466 if (!EVLIsLegal) {
1467 // If for some reason EVL mode is unsupported, fallback to
1468 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1469 // in a generic way.
1470 ChosenTailFoldingStyle =
1473 LLVM_DEBUG(
1474 dbgs()
1475 << "LV: Preference for VP intrinsics indicated. Will "
1476 "not try to generate VP Intrinsics "
1477 << (UserIC > 1
1478 ? "since interleave count specified is greater than 1.\n"
1479 : "due to non-interleaving reasons.\n"));
1480 }
1481 }
1482
1483 /// Returns true if all loop blocks should be masked to fold tail loop.
1484 bool foldTailByMasking() const {
1485 // TODO: check if it is possible to check for None style independent of
1486 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1488 }
1489
1490 /// Return maximum safe number of elements to be processed per vector
1491 /// iteration, which do not prevent store-load forwarding and are safe with
1492 /// regard to the memory dependencies. Required for EVL-based VPlans to
1493 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1494 /// MaxSafeElements).
1495 /// TODO: need to consider adjusting cost model to use this value as a
1496 /// vectorization factor for EVL-based vectorization.
1497 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1498
1499 /// Returns true if the instructions in this block require predication
1500 /// for any reason, e.g. because tail folding now requires a predicate
1501 /// or because the block in the original loop was predicated.
1504 }
1505
1506 /// Returns true if VP intrinsics with explicit vector length support should
1507 /// be generated in the tail folded loop.
1508 bool foldTailWithEVL() const {
1510 }
1511
1512 /// Returns true if the Phi is part of an inloop reduction.
1513 bool isInLoopReduction(PHINode *Phi) const {
1514 return InLoopReductions.contains(Phi);
1515 }
1516
1517 /// Returns true if the predicated reduction select should be used to set the
1518 /// incoming value for the reduction phi.
1519 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1520 // Force to use predicated reduction select since the EVL of the
1521 // second-to-last iteration might not be VF*UF.
1522 if (foldTailWithEVL())
1523 return true;
1526 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1527 }
1528
1529 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1530 /// with factor VF. Return the cost of the instruction, including
1531 /// scalarization overhead if it's needed.
1533
1534 /// Estimate cost of a call instruction CI if it were vectorized with factor
1535 /// VF. Return the cost of the instruction, including scalarization overhead
1536 /// if it's needed.
1538
1539 /// Invalidates decisions already taken by the cost model.
1541 WideningDecisions.clear();
1542 CallWideningDecisions.clear();
1543 Uniforms.clear();
1544 Scalars.clear();
1545 }
1546
1547 /// Returns the expected execution cost. The unit of the cost does
1548 /// not matter because we use the 'cost' units to compare different
1549 /// vector widths. The cost that is returned is *not* normalized by
1550 /// the factor width.
1552
1553 bool hasPredStores() const { return NumPredStores > 0; }
1554
1555 /// Returns true if epilogue vectorization is considered profitable, and
1556 /// false otherwise.
1557 /// \p VF is the vectorization factor chosen for the original loop.
1558 /// \p Multiplier is an additional scaling factor applied to VF before
1559 /// comparing to EpilogueVectorizationMinVF.
1561 const unsigned IC) const;
1562
1563 /// Returns the execution time cost of an instruction for a given vector
1564 /// width. Vector width of one means scalar.
1566
1567 /// Return the cost of instructions in an inloop reduction pattern, if I is
1568 /// part of that pattern.
1569 std::optional<InstructionCost>
1572
1573 /// Returns true if \p Op should be considered invariant and if it is
1574 /// trivially hoistable.
1576
1577private:
1578 unsigned NumPredStores = 0;
1579
1580 /// \return An upper bound for the vectorization factors for both
1581 /// fixed and scalable vectorization, where the minimum-known number of
1582 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1583 /// disabled or unsupported, then the scalable part will be equal to
1584 /// ElementCount::getScalable(0).
1585 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1586 ElementCount UserVF,
1587 bool FoldTailByMasking);
1588
1589 /// \return the maximized element count based on the target's vector
1590 /// registers and the loop trip-count, but limited to a maximum safe VF.
1591 /// This is a helper function of computeFeasibleMaxVF.
1592 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1593 unsigned SmallestType,
1594 unsigned WidestType,
1595 ElementCount MaxSafeVF,
1596 bool FoldTailByMasking);
1597
1598 /// Checks if scalable vectorization is supported and enabled. Caches the
1599 /// result to avoid repeated debug dumps for repeated queries.
1600 bool isScalableVectorizationAllowed();
1601
1602 /// \return the maximum legal scalable VF, based on the safe max number
1603 /// of elements.
1604 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1605
1606 /// Calculate vectorization cost of memory instruction \p I.
1607 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1608
1609 /// The cost computation for scalarized memory instruction.
1610 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1611
1612 /// The cost computation for interleaving group of memory instructions.
1613 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1614
1615 /// The cost computation for Gather/Scatter instruction.
1616 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1617
1618 /// The cost computation for widening instruction \p I with consecutive
1619 /// memory access.
1620 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1621
1622 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1623 /// Load: scalar load + broadcast.
1624 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1625 /// element)
1626 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1627
1628 /// Estimate the overhead of scalarizing an instruction. This is a
1629 /// convenience wrapper for the type-based getScalarizationOverhead API.
1630 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1632
1633 /// Returns true if an artificially high cost for emulated masked memrefs
1634 /// should be used.
1635 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1636
1637 /// Map of scalar integer values to the smallest bitwidth they can be legally
1638 /// represented as. The vector equivalents of these values should be truncated
1639 /// to this type.
1641
1642 /// A type representing the costs for instructions if they were to be
1643 /// scalarized rather than vectorized. The entries are Instruction-Cost
1644 /// pairs.
1645 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1646
1647 /// A set containing all BasicBlocks that are known to be present after
1648 /// vectorization as a predicated block.
1650 PredicatedBBsAfterVectorization;
1651
1652 /// Records whether it is allowed to have the original scalar loop execute at
1653 /// least once. This may be needed as a fallback loop in case runtime
1654 /// aliasing/dependence checks fail, or to handle the tail/remainder
1655 /// iterations when the trip count is unknown or doesn't divide by the VF,
1656 /// or as a peel-loop to handle gaps in interleave-groups.
1657 /// Under optsize and when the trip count is very small we don't allow any
1658 /// iterations to execute in the scalar loop.
1659 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1660
1661 /// Control finally chosen tail folding style. The first element is used if
1662 /// the IV update may overflow, the second element - if it does not.
1663 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1664 ChosenTailFoldingStyle;
1665
1666 /// true if scalable vectorization is supported and enabled.
1667 std::optional<bool> IsScalableVectorizationAllowed;
1668
1669 /// Maximum safe number of elements to be processed per vector iteration,
1670 /// which do not prevent store-load forwarding and are safe with regard to the
1671 /// memory dependencies. Required for EVL-based vectorization, where this
1672 /// value is used as the upper bound of the safe AVL.
1673 std::optional<unsigned> MaxSafeElements;
1674
1675 /// A map holding scalar costs for different vectorization factors. The
1676 /// presence of a cost for an instruction in the mapping indicates that the
1677 /// instruction will be scalarized when vectorizing with the associated
1678 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1680
1681 /// Holds the instructions known to be uniform after vectorization.
1682 /// The data is collected per VF.
1684
1685 /// Holds the instructions known to be scalar after vectorization.
1686 /// The data is collected per VF.
1688
1689 /// Holds the instructions (address computations) that are forced to be
1690 /// scalarized.
1692
1693 /// PHINodes of the reductions that should be expanded in-loop.
1694 SmallPtrSet<PHINode *, 4> InLoopReductions;
1695
1696 /// A Map of inloop reduction operations and their immediate chain operand.
1697 /// FIXME: This can be removed once reductions can be costed correctly in
1698 /// VPlan. This was added to allow quick lookup of the inloop operations.
1699 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1700
1701 /// Returns the expected difference in cost from scalarizing the expression
1702 /// feeding a predicated instruction \p PredInst. The instructions to
1703 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1704 /// non-negative return value implies the expression will be scalarized.
1705 /// Currently, only single-use chains are considered for scalarization.
1706 InstructionCost computePredInstDiscount(Instruction *PredInst,
1707 ScalarCostsTy &ScalarCosts,
1708 ElementCount VF);
1709
1710 /// Collect the instructions that are uniform after vectorization. An
1711 /// instruction is uniform if we represent it with a single scalar value in
1712 /// the vectorized loop corresponding to each vector iteration. Examples of
1713 /// uniform instructions include pointer operands of consecutive or
1714 /// interleaved memory accesses. Note that although uniformity implies an
1715 /// instruction will be scalar, the reverse is not true. In general, a
1716 /// scalarized instruction will be represented by VF scalar values in the
1717 /// vectorized loop, each corresponding to an iteration of the original
1718 /// scalar loop.
1719 void collectLoopUniforms(ElementCount VF);
1720
1721 /// Collect the instructions that are scalar after vectorization. An
1722 /// instruction is scalar if it is known to be uniform or will be scalarized
1723 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1724 /// to the list if they are used by a load/store instruction that is marked as
1725 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1726 /// VF values in the vectorized loop, each corresponding to an iteration of
1727 /// the original scalar loop.
1728 void collectLoopScalars(ElementCount VF);
1729
1730 /// Keeps cost model vectorization decision and cost for instructions.
1731 /// Right now it is used for memory instructions only.
1733 std::pair<InstWidening, InstructionCost>>;
1734
1735 DecisionList WideningDecisions;
1736
1737 using CallDecisionList =
1738 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1739
1740 CallDecisionList CallWideningDecisions;
1741
1742 /// Returns true if \p V is expected to be vectorized and it needs to be
1743 /// extracted.
1744 bool needsExtract(Value *V, ElementCount VF) const {
1745 Instruction *I = dyn_cast<Instruction>(V);
1746 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1748 return false;
1749
1750 // Assume we can vectorize V (and hence we need extraction) if the
1751 // scalars are not computed yet. This can happen, because it is called
1752 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1753 // the scalars are collected. That should be a safe assumption in most
1754 // cases, because we check if the operands have vectorizable types
1755 // beforehand in LoopVectorizationLegality.
1756 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1757 };
1758
1759 /// Returns a range containing only operands needing to be extracted.
1760 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1761 ElementCount VF) const {
1763 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1764 }
1765
1766public:
1767 /// The loop that we evaluate.
1769
1770 /// Predicated scalar evolution analysis.
1772
1773 /// Loop Info analysis.
1775
1776 /// Vectorization legality.
1778
1779 /// Vector target information.
1781
1782 /// Target Library Info.
1784
1785 /// Demanded bits analysis.
1787
1788 /// Assumption cache.
1790
1791 /// Interface to emit optimization remarks.
1793
1795
1796 /// Loop Vectorize Hint.
1798
1799 /// The interleave access information contains groups of interleaved accesses
1800 /// with the same stride and close to each other.
1802
1803 /// Values to ignore in the cost model.
1805
1806 /// Values to ignore in the cost model when VF > 1.
1808
1809 /// All element types found in the loop.
1811};
1812} // end namespace llvm
1813
1814namespace {
1815/// Helper struct to manage generating runtime checks for vectorization.
1816///
1817/// The runtime checks are created up-front in temporary blocks to allow better
1818/// estimating the cost and un-linked from the existing IR. After deciding to
1819/// vectorize, the checks are moved back. If deciding not to vectorize, the
1820/// temporary blocks are completely removed.
1821class GeneratedRTChecks {
1822 /// Basic block which contains the generated SCEV checks, if any.
1823 BasicBlock *SCEVCheckBlock = nullptr;
1824
1825 /// The value representing the result of the generated SCEV checks. If it is
1826 /// nullptr, either no SCEV checks have been generated or they have been used.
1827 Value *SCEVCheckCond = nullptr;
1828
1829 /// Basic block which contains the generated memory runtime checks, if any.
1830 BasicBlock *MemCheckBlock = nullptr;
1831
1832 /// The value representing the result of the generated memory runtime checks.
1833 /// If it is nullptr, either no memory runtime checks have been generated or
1834 /// they have been used.
1835 Value *MemRuntimeCheckCond = nullptr;
1836
1837 DominatorTree *DT;
1838 LoopInfo *LI;
1840
1841 SCEVExpander SCEVExp;
1842 SCEVExpander MemCheckExp;
1843
1844 bool CostTooHigh = false;
1845 const bool AddBranchWeights;
1846
1847 Loop *OuterLoop = nullptr;
1848
1850
1851public:
1852 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1854 const DataLayout &DL, bool AddBranchWeights)
1855 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1856 MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1857 AddBranchWeights(AddBranchWeights), PSE(PSE) {}
1858
1859 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1860 /// accurately estimate the cost of the runtime checks. The blocks are
1861 /// un-linked from the IR and are added back during vector code generation. If
1862 /// there is no vector code generation, the check blocks are removed
1863 /// completely.
1864 void create(Loop *L, const LoopAccessInfo &LAI,
1865 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1866
1867 // Hard cutoff to limit compile-time increase in case a very large number of
1868 // runtime checks needs to be generated.
1869 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1870 // profile info.
1871 CostTooHigh =
1873 if (CostTooHigh)
1874 return;
1875
1876 BasicBlock *LoopHeader = L->getHeader();
1877 BasicBlock *Preheader = L->getLoopPreheader();
1878
1879 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1880 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1881 // may be used by SCEVExpander. The blocks will be un-linked from their
1882 // predecessors and removed from LI & DT at the end of the function.
1883 if (!UnionPred.isAlwaysTrue()) {
1884 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1885 nullptr, "vector.scevcheck");
1886
1887 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1888 &UnionPred, SCEVCheckBlock->getTerminator());
1889 }
1890
1891 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1892 if (RtPtrChecking.Need) {
1893 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1894 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1895 "vector.memcheck");
1896
1897 auto DiffChecks = RtPtrChecking.getDiffChecks();
1898 if (DiffChecks) {
1899 Value *RuntimeVF = nullptr;
1900 MemRuntimeCheckCond = addDiffRuntimeChecks(
1901 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1902 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1903 if (!RuntimeVF)
1904 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1905 return RuntimeVF;
1906 },
1907 IC);
1908 } else {
1909 MemRuntimeCheckCond = addRuntimeChecks(
1910 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1912 }
1913 assert(MemRuntimeCheckCond &&
1914 "no RT checks generated although RtPtrChecking "
1915 "claimed checks are required");
1916 }
1917
1918 if (!MemCheckBlock && !SCEVCheckBlock)
1919 return;
1920
1921 // Unhook the temporary block with the checks, update various places
1922 // accordingly.
1923 if (SCEVCheckBlock)
1924 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1925 if (MemCheckBlock)
1926 MemCheckBlock->replaceAllUsesWith(Preheader);
1927
1928 if (SCEVCheckBlock) {
1929 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1930 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1931 Preheader->getTerminator()->eraseFromParent();
1932 }
1933 if (MemCheckBlock) {
1934 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1935 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1936 Preheader->getTerminator()->eraseFromParent();
1937 }
1938
1939 DT->changeImmediateDominator(LoopHeader, Preheader);
1940 if (MemCheckBlock) {
1941 DT->eraseNode(MemCheckBlock);
1942 LI->removeBlock(MemCheckBlock);
1943 }
1944 if (SCEVCheckBlock) {
1945 DT->eraseNode(SCEVCheckBlock);
1946 LI->removeBlock(SCEVCheckBlock);
1947 }
1948
1949 // Outer loop is used as part of the later cost calculations.
1950 OuterLoop = L->getParentLoop();
1951 }
1952
1953 InstructionCost getCost() {
1954 if (SCEVCheckBlock || MemCheckBlock)
1955 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1956
1957 if (CostTooHigh) {
1959 Cost.setInvalid();
1960 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1961 return Cost;
1962 }
1963
1964 InstructionCost RTCheckCost = 0;
1965 if (SCEVCheckBlock)
1966 for (Instruction &I : *SCEVCheckBlock) {
1967 if (SCEVCheckBlock->getTerminator() == &I)
1968 continue;
1971 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1972 RTCheckCost += C;
1973 }
1974 if (MemCheckBlock) {
1975 InstructionCost MemCheckCost = 0;
1976 for (Instruction &I : *MemCheckBlock) {
1977 if (MemCheckBlock->getTerminator() == &I)
1978 continue;
1981 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1982 MemCheckCost += C;
1983 }
1984
1985 // If the runtime memory checks are being created inside an outer loop
1986 // we should find out if these checks are outer loop invariant. If so,
1987 // the checks will likely be hoisted out and so the effective cost will
1988 // reduce according to the outer loop trip count.
1989 if (OuterLoop) {
1990 ScalarEvolution *SE = MemCheckExp.getSE();
1991 // TODO: If profitable, we could refine this further by analysing every
1992 // individual memory check, since there could be a mixture of loop
1993 // variant and invariant checks that mean the final condition is
1994 // variant.
1995 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1996 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1997 // It seems reasonable to assume that we can reduce the effective
1998 // cost of the checks even when we know nothing about the trip
1999 // count. Assume that the outer loop executes at least twice.
2000 unsigned BestTripCount = 2;
2001
2002 // Get the best known TC estimate.
2003 if (auto EstimatedTC = getSmallBestKnownTC(
2004 PSE, OuterLoop, /* CanUseConstantMax = */ false))
2005 BestTripCount = *EstimatedTC;
2006
2007 BestTripCount = std::max(BestTripCount, 1U);
2008 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2009
2010 // Let's ensure the cost is always at least 1.
2011 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2013
2014 if (BestTripCount > 1)
2016 << "We expect runtime memory checks to be hoisted "
2017 << "out of the outer loop. Cost reduced from "
2018 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2019
2020 MemCheckCost = NewMemCheckCost;
2021 }
2022 }
2023
2024 RTCheckCost += MemCheckCost;
2025 }
2026
2027 if (SCEVCheckBlock || MemCheckBlock)
2028 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2029 << "\n");
2030
2031 return RTCheckCost;
2032 }
2033
2034 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2035 /// unused.
2036 ~GeneratedRTChecks() {
2037 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2038 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2039 if (!SCEVCheckCond)
2040 SCEVCleaner.markResultUsed();
2041
2042 if (!MemRuntimeCheckCond)
2043 MemCheckCleaner.markResultUsed();
2044
2045 if (MemRuntimeCheckCond) {
2046 auto &SE = *MemCheckExp.getSE();
2047 // Memory runtime check generation creates compares that use expanded
2048 // values. Remove them before running the SCEVExpanderCleaners.
2049 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2050 if (MemCheckExp.isInsertedInstruction(&I))
2051 continue;
2052 SE.forgetValue(&I);
2053 I.eraseFromParent();
2054 }
2055 }
2056 MemCheckCleaner.cleanup();
2057 SCEVCleaner.cleanup();
2058
2059 if (SCEVCheckCond)
2060 SCEVCheckBlock->eraseFromParent();
2061 if (MemRuntimeCheckCond)
2062 MemCheckBlock->eraseFromParent();
2063 }
2064
2065 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2066 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2067 /// depending on the generated condition.
2068 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2069 BasicBlock *LoopVectorPreHeader) {
2070 if (!SCEVCheckCond)
2071 return nullptr;
2072
2073 Value *Cond = SCEVCheckCond;
2074 // Mark the check as used, to prevent it from being removed during cleanup.
2075 SCEVCheckCond = nullptr;
2076 if (auto *C = dyn_cast<ConstantInt>(Cond))
2077 if (C->isZero())
2078 return nullptr;
2079
2080 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2081
2082 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2083 // Create new preheader for vector loop.
2084 if (OuterLoop)
2085 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2086
2087 SCEVCheckBlock->getTerminator()->eraseFromParent();
2088 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2090 SCEVCheckBlock);
2091
2092 DT->addNewBlock(SCEVCheckBlock, Pred);
2093 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2094
2095 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2096 if (AddBranchWeights)
2097 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2098 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2099 return SCEVCheckBlock;
2100 }
2101
2102 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2103 /// the branches to branch to the vector preheader or \p Bypass, depending on
2104 /// the generated condition.
2105 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2106 BasicBlock *LoopVectorPreHeader) {
2107 // Check if we generated code that checks in runtime if arrays overlap.
2108 if (!MemRuntimeCheckCond)
2109 return nullptr;
2110
2111 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2112 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2113 MemCheckBlock);
2114
2115 DT->addNewBlock(MemCheckBlock, Pred);
2116 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2117 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2118
2119 if (OuterLoop)
2120 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2121
2122 BranchInst &BI =
2123 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2124 if (AddBranchWeights) {
2125 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2126 }
2127 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2128 MemCheckBlock->getTerminator()->setDebugLoc(
2129 Pred->getTerminator()->getDebugLoc());
2130
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 MemRuntimeCheckCond = nullptr;
2133 return MemCheckBlock;
2134 }
2135};
2136} // namespace
2137
2139 return Style == TailFoldingStyle::Data ||
2140 Style == TailFoldingStyle::DataAndControlFlow ||
2141 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2142}
2143
2145 return Style == TailFoldingStyle::DataAndControlFlow ||
2146 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2147}
2148
2149// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2150// vectorization. The loop needs to be annotated with #pragma omp simd
2151// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2152// vector length information is not provided, vectorization is not considered
2153// explicit. Interleave hints are not allowed either. These limitations will be
2154// relaxed in the future.
2155// Please, note that we are currently forced to abuse the pragma 'clang
2156// vectorize' semantics. This pragma provides *auto-vectorization hints*
2157// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2158// provides *explicit vectorization hints* (LV can bypass legal checks and
2159// assume that vectorization is legal). However, both hints are implemented
2160// using the same metadata (llvm.loop.vectorize, processed by
2161// LoopVectorizeHints). This will be fixed in the future when the native IR
2162// representation for pragma 'omp simd' is introduced.
2163static bool isExplicitVecOuterLoop(Loop *OuterLp,
2165 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2166 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2167
2168 // Only outer loops with an explicit vectorization hint are supported.
2169 // Unannotated outer loops are ignored.
2171 return false;
2172
2173 Function *Fn = OuterLp->getHeader()->getParent();
2174 if (!Hints.allowVectorization(Fn, OuterLp,
2175 true /*VectorizeOnlyWhenForced*/)) {
2176 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2177 return false;
2178 }
2179
2180 if (Hints.getInterleave() > 1) {
2181 // TODO: Interleave support is future work.
2182 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2183 "outer loops.\n");
2184 Hints.emitRemarkWithHints();
2185 return false;
2186 }
2187
2188 return true;
2189}
2190
2194 // Collect inner loops and outer loops without irreducible control flow. For
2195 // now, only collect outer loops that have explicit vectorization hints. If we
2196 // are stress testing the VPlan H-CFG construction, we collect the outermost
2197 // loop of every loop nest.
2198 if (L.isInnermost() || VPlanBuildStressTest ||
2200 LoopBlocksRPO RPOT(&L);
2201 RPOT.perform(LI);
2202 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2203 V.push_back(&L);
2204 // TODO: Collect inner loops inside marked outer loops in case
2205 // vectorization fails for the outer loop. Do not invoke
2206 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2207 // already known to be reducible. We can use an inherited attribute for
2208 // that.
2209 return;
2210 }
2211 }
2212 for (Loop *InnerL : L)
2213 collectSupportedLoops(*InnerL, LI, ORE, V);
2214}
2215
2216//===----------------------------------------------------------------------===//
2217// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2218// LoopVectorizationCostModel and LoopVectorizationPlanner.
2219//===----------------------------------------------------------------------===//
2220
2221/// Compute the transformed value of Index at offset StartValue using step
2222/// StepValue.
2223/// For integer induction, returns StartValue + Index * StepValue.
2224/// For pointer induction, returns StartValue[Index * StepValue].
2225/// FIXME: The newly created binary instructions should contain nsw/nuw
2226/// flags, which can be found from the original scalar operations.
2227static Value *
2229 Value *Step,
2231 const BinaryOperator *InductionBinOp) {
2232 Type *StepTy = Step->getType();
2233 Value *CastedIndex = StepTy->isIntegerTy()
2234 ? B.CreateSExtOrTrunc(Index, StepTy)
2235 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2236 if (CastedIndex != Index) {
2237 CastedIndex->setName(CastedIndex->getName() + ".cast");
2238 Index = CastedIndex;
2239 }
2240
2241 // Note: the IR at this point is broken. We cannot use SE to create any new
2242 // SCEV and then expand it, hoping that SCEV's simplification will give us
2243 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2244 // lead to various SCEV crashes. So all we can do is to use the builder and
2245 // rely on InstCombine for future simplifications. Here we handle some
2246 // trivial cases only.
2247 auto CreateAdd = [&B](Value *X, Value *Y) {
2248 assert(X->getType() == Y->getType() && "Types don't match!");
2249 if (auto *CX = dyn_cast<ConstantInt>(X))
2250 if (CX->isZero())
2251 return Y;
2252 if (auto *CY = dyn_cast<ConstantInt>(Y))
2253 if (CY->isZero())
2254 return X;
2255 return B.CreateAdd(X, Y);
2256 };
2257
2258 // We allow X to be a vector type, in which case Y will potentially be
2259 // splatted into a vector with the same element count.
2260 auto CreateMul = [&B](Value *X, Value *Y) {
2261 assert(X->getType()->getScalarType() == Y->getType() &&
2262 "Types don't match!");
2263 if (auto *CX = dyn_cast<ConstantInt>(X))
2264 if (CX->isOne())
2265 return Y;
2266 if (auto *CY = dyn_cast<ConstantInt>(Y))
2267 if (CY->isOne())
2268 return X;
2269 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2270 if (XVTy && !isa<VectorType>(Y->getType()))
2271 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2272 return B.CreateMul(X, Y);
2273 };
2274
2275 switch (InductionKind) {
2277 assert(!isa<VectorType>(Index->getType()) &&
2278 "Vector indices not supported for integer inductions yet");
2279 assert(Index->getType() == StartValue->getType() &&
2280 "Index type does not match StartValue type");
2281 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2282 return B.CreateSub(StartValue, Index);
2283 auto *Offset = CreateMul(Index, Step);
2284 return CreateAdd(StartValue, Offset);
2285 }
2287 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2289 assert(!isa<VectorType>(Index->getType()) &&
2290 "Vector indices not supported for FP inductions yet");
2291 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2292 assert(InductionBinOp &&
2293 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2294 InductionBinOp->getOpcode() == Instruction::FSub) &&
2295 "Original bin op should be defined for FP induction");
2296
2297 Value *MulExp = B.CreateFMul(Step, Index);
2298 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2299 "induction");
2300 }
2302 return nullptr;
2303 }
2304 llvm_unreachable("invalid enum");
2305}
2306
2307std::optional<unsigned> getMaxVScale(const Function &F,
2308 const TargetTransformInfo &TTI) {
2309 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2310 return MaxVScale;
2311
2312 if (F.hasFnAttribute(Attribute::VScaleRange))
2313 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2314
2315 return std::nullopt;
2316}
2317
2318/// For the given VF and UF and maximum trip count computed for the loop, return
2319/// whether the induction variable might overflow in the vectorized loop. If not,
2320/// then we know a runtime overflow check always evaluates to false and can be
2321/// removed.
2324 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2325 // Always be conservative if we don't know the exact unroll factor.
2326 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2327
2328 Type *IdxTy = Cost->Legal->getWidestInductionType();
2329 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2330
2331 // We know the runtime overflow check is known false iff the (max) trip-count
2332 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2333 // the vector loop induction variable.
2334 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2335 uint64_t MaxVF = VF.getKnownMinValue();
2336 if (VF.isScalable()) {
2337 std::optional<unsigned> MaxVScale =
2338 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2339 if (!MaxVScale)
2340 return false;
2341 MaxVF *= *MaxVScale;
2342 }
2343
2344 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2345 }
2346
2347 return false;
2348}
2349
2350// Return whether we allow using masked interleave-groups (for dealing with
2351// strided loads/stores that reside in predicated blocks, or for dealing
2352// with gaps).
2354 // If an override option has been passed in for interleaved accesses, use it.
2355 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2357
2359}
2360
2362 VPReplicateRecipe *RepRecipe,
2363 const VPLane &Lane,
2364 VPTransformState &State) {
2365 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2366
2367 // Does this instruction return a value ?
2368 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2369
2370 Instruction *Cloned = Instr->clone();
2371 if (!IsVoidRetTy) {
2372 Cloned->setName(Instr->getName() + ".cloned");
2373#if !defined(NDEBUG)
2374 // Verify that VPlan type inference results agree with the type of the
2375 // generated values.
2376 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2377 "inferred type and type from generated instructions do not match");
2378#endif
2379 }
2380
2381 RepRecipe->setFlags(Cloned);
2382
2383 if (auto DL = Instr->getDebugLoc())
2384 State.setDebugLocFrom(DL);
2385
2386 // Replace the operands of the cloned instructions with their scalar
2387 // equivalents in the new loop.
2388 for (const auto &I : enumerate(RepRecipe->operands())) {
2389 auto InputLane = Lane;
2390 VPValue *Operand = I.value();
2392 InputLane = VPLane::getFirstLane();
2393 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2394 }
2395 State.addNewMetadata(Cloned, Instr);
2396
2397 // Place the cloned scalar in the new loop.
2398 State.Builder.Insert(Cloned);
2399
2400 State.set(RepRecipe, Cloned, Lane);
2401
2402 // If we just cloned a new assumption, add it to the assumption cache.
2403 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2405
2406 // End if-block.
2407 VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2408 bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2409 assert((Parent || all_of(RepRecipe->operands(),
2410 [](VPValue *Op) {
2411 return Op->isDefinedOutsideLoopRegions();
2412 })) &&
2413 "Expected a recipe is either within a region or all of its operands "
2414 "are defined outside the vectorized region.");
2415 if (IfPredicateInstr)
2416 PredicatedInstructions.push_back(Cloned);
2417}
2418
2419Value *
2421 if (VectorTripCount)
2422 return VectorTripCount;
2423
2424 Value *TC = getTripCount();
2425 IRBuilder<> Builder(InsertBlock->getTerminator());
2426
2427 Type *Ty = TC->getType();
2428 // This is where we can make the step a runtime constant.
2429 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2430
2431 // If the tail is to be folded by masking, round the number of iterations N
2432 // up to a multiple of Step instead of rounding down. This is done by first
2433 // adding Step-1 and then rounding down. Note that it's ok if this addition
2434 // overflows: the vector induction variable will eventually wrap to zero given
2435 // that it starts at zero and its Step is a power of two; the loop will then
2436 // exit, with the last early-exit vector comparison also producing all-true.
2437 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2438 // is accounted for in emitIterationCountCheck that adds an overflow check.
2439 if (Cost->foldTailByMasking()) {
2441 "VF*UF must be a power of 2 when folding tail by masking");
2442 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2443 "n.rnd.up");
2444 }
2445
2446 // Now we need to generate the expression for the part of the loop that the
2447 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2448 // iterations are not required for correctness, or N - Step, otherwise. Step
2449 // is equal to the vectorization factor (number of SIMD elements) times the
2450 // unroll factor (number of SIMD instructions).
2451 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2452
2453 // There are cases where we *must* run at least one iteration in the remainder
2454 // loop. See the cost model for when this can happen. If the step evenly
2455 // divides the trip count, we set the remainder to be equal to the step. If
2456 // the step does not evenly divide the trip count, no adjustment is necessary
2457 // since there will already be scalar iterations. Note that the minimum
2458 // iterations check ensures that N >= Step.
2459 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2460 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2461 R = Builder.CreateSelect(IsZero, Step, R);
2462 }
2463
2464 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2465
2466 return VectorTripCount;
2467}
2468
2469/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the
2470/// vector preheader and its predecessor, also connecting the new block to the
2471/// scalar preheader.
2472static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
2473 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2474 VPBlockBase *VectorPH = Plan.getVectorPreheader();
2475 VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
2476 if (PreVectorPH->getNumSuccessors() != 1) {
2477 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2478 assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2479 "Unexpected successor");
2480 VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB);
2481 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB);
2482 PreVectorPH = CheckVPIRBB;
2483 }
2484 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2485 PreVectorPH->swapSuccessors();
2486}
2487
2489 Value *Count = getTripCount();
2490 // Reuse existing vector loop preheader for TC checks.
2491 // Note that new preheader block is generated for vector loop.
2492 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2493 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2494
2495 // Generate code to check if the loop's trip count is less than VF * UF, or
2496 // equal to it in case a scalar epilogue is required; this implies that the
2497 // vector trip count is zero. This check also covers the case where adding one
2498 // to the backedge-taken count overflowed leading to an incorrect trip count
2499 // of zero. In this case we will also jump to the scalar loop.
2500 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2502
2503 // If tail is to be folded, vector loop takes care of all iterations.
2504 Type *CountTy = Count->getType();
2505 Value *CheckMinIters = Builder.getFalse();
2506 auto CreateStep = [&]() -> Value * {
2507 // Create step with max(MinProfitableTripCount, UF * VF).
2509 return createStepForVF(Builder, CountTy, VF, UF);
2510
2511 Value *MinProfTC =
2513 if (!VF.isScalable())
2514 return MinProfTC;
2516 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2517 };
2518
2519 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2520 if (Style == TailFoldingStyle::None) {
2521 Value *Step = CreateStep();
2522 ScalarEvolution &SE = *PSE.getSE();
2523 // TODO: Emit unconditional branch to vector preheader instead of
2524 // conditional branch with known condition.
2525 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2526 // Check if the trip count is < the step.
2527 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2528 // TODO: Ensure step is at most the trip count when determining max VF and
2529 // UF, w/o tail folding.
2530 CheckMinIters = Builder.getTrue();
2532 TripCountSCEV, SE.getSCEV(Step))) {
2533 // Generate the minimum iteration check only if we cannot prove the
2534 // check is known to be true, or known to be false.
2535 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2536 } // else step known to be < trip count, use CheckMinIters preset to false.
2537 } else if (VF.isScalable() &&
2540 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2541 // an overflow to zero when updating induction variables and so an
2542 // additional overflow check is required before entering the vector loop.
2543
2544 // Get the maximum unsigned value for the type.
2545 Value *MaxUIntTripCount =
2546 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2547 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2548
2549 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2550 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2551 }
2552
2553 // Create new preheader for vector loop.
2555 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2556 "vector.ph");
2557
2558 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2559 DT->getNode(Bypass)->getIDom()) &&
2560 "TC check is expected to dominate Bypass");
2561
2562 BranchInst &BI =
2563 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2565 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2566 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2567 LoopBypassBlocks.push_back(TCCheckBlock);
2568
2569 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2570 introduceCheckBlockInVPlan(Plan, TCCheckBlock);
2571}
2572
2574 BasicBlock *const SCEVCheckBlock =
2575 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2576 if (!SCEVCheckBlock)
2577 return nullptr;
2578
2579 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2581 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2582 "Cannot SCEV check stride or overflow when optimizing for size");
2583 assert(!LoopBypassBlocks.empty() &&
2584 "Should already be a bypass block due to iteration count check");
2585 LoopBypassBlocks.push_back(SCEVCheckBlock);
2586 AddedSafetyChecks = true;
2587
2588 introduceCheckBlockInVPlan(Plan, SCEVCheckBlock);
2589 return SCEVCheckBlock;
2590}
2591
2593 // VPlan-native path does not do any analysis for runtime checks currently.
2595 return nullptr;
2596
2597 BasicBlock *const MemCheckBlock =
2598 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2599
2600 // Check if we generated code that checks in runtime if arrays overlap. We put
2601 // the checks into a separate block to make the more common case of few
2602 // elements faster.
2603 if (!MemCheckBlock)
2604 return nullptr;
2605
2606 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2607 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2608 "Cannot emit memory checks when optimizing for size, unless forced "
2609 "to vectorize.");
2610 ORE->emit([&]() {
2611 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2614 << "Code-size may be reduced by not forcing "
2615 "vectorization, or by source-code modifications "
2616 "eliminating the need for runtime checks "
2617 "(e.g., adding 'restrict').";
2618 });
2619 }
2620
2621 LoopBypassBlocks.push_back(MemCheckBlock);
2622
2623 AddedSafetyChecks = true;
2624
2625 introduceCheckBlockInVPlan(Plan, MemCheckBlock);
2626 return MemCheckBlock;
2627}
2628
// Fragment of a routine that validates LoopVectorPreHeader and then issues two
// calls whose trailing arguments name new blocks "<Prefix>middle.block" and
// "<Prefix>scalar.ph".
// NOTE(review): the signature, the assignment of LoopVectorPreHeader, the
// opening of the second assert and the heads of both calls are missing from
// this listing (embedded lines 2629-2630, 2632, 2636-2637, 2639-2640).
// Presumably this is the skeleton-creation helper that splits the preheader
// into the middle block and the scalar preheader — confirm against the
// upstream file before relying on this description.
2631 assert(LoopVectorPreHeader && "Invalid loop structure");
2633 Cost->requiresScalarEpilogue(VF.isVector())) &&
2634 "loops not exiting via the latch without required epilogue?");
2635
2638 LI, nullptr, Twine(Prefix) + "middle.block");
2641 nullptr, Twine(Prefix) + "scalar.ph");
2642}
2643
// Creates the resume value that the scalar loop's induction phi (wrapped by
// \p InductionPhiRI) starts from.
//
// For the primary induction (Legal->getPrimaryInduction()) the end value is
// VectorTripCount itself. For any other induction it is computed with
// emitTransformedIndex from the start value, \p Step and the induction kind.
// A recipe named "bc.resume.val" taking the end value and the start value as
// plan live-ins is created through \p ScalarPHBuilder and added as the only
// operand of \p InductionPhiRI. When \p MainVectorTripCount is non-null, the
// corresponding end value for the additional bypass block is recorded in
// Induction2AdditionalBypassValue, keyed by the original phi.
//
// NOTE(review): embedded lines 2644, 2651, 2660 and 2682 are missing from this
// listing, so the first line of the signature, the definition of
// VectorTripCount, the declaration of the builder `B` and the first argument
// of createNaryOp are not visible here — confirm against the upstream file.
2645 VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step,
2646 ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder,
2647 Value *MainVectorTripCount) {
2648 // TODO: Move to LVP or general VPlan construction, once no IR values are
2649 // generated.
2650 auto *OrigPhi = cast<PHINode>(&InductionPhiRI->getInstruction());
2652 assert(VectorTripCount && "Expected valid arguments");
2653
2654 Instruction *OldInduction = Legal->getPrimaryInduction();
2655 // For the primary induction the end values are known.
2656 Value *EndValue = VectorTripCount;
2657 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2658 // Otherwise compute them accordingly.
2659 if (OrigPhi != OldInduction) {
2661
2662 // Fast-math-flags propagate from the original induction instruction.
2663 if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2664 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2665
2666 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2667 Step, II.getKind(), II.getInductionBinOp());
2668 EndValue->setName("ind.end");
2669
2670 // Compute the end value for the additional bypass (if applicable).
2671 if (MainVectorTripCount) {
// The bypass end value is emitted at the first insertion point of the
// additional bypass block rather than at B's previous position.
2672 B.SetInsertPoint(getAdditionalBypassBlock(),
2673 getAdditionalBypassBlock()->getFirstInsertionPt());
2674 EndValueFromAdditionalBypass =
2675 emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
2676 II.getKind(), II.getInductionBinOp());
2677 EndValueFromAdditionalBypass->setName("ind.end");
2678 }
2679 }
2680
2681 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
2683 {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
2684 OrigPhi->getDebugLoc(), "bc.resume.val");
2685 assert(InductionPhiRI->getNumOperands() == 0 &&
2686 "InductionPhiRI should not have any operands");
2687 InductionPhiRI->addOperand(ResumePhiRecipe);
2688
2689 if (EndValueFromAdditionalBypass) {
2690 // Store the bypass value here, as it needs to be added as operand to its
2691 // scalar preheader phi node after the epilogue skeleton has been created.
2692 // TODO: Directly add as extra operand to the VPResumePHI recipe.
2693 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2694 "entry for OrigPhi already exits");
2695 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2696 }
2697}
2698
2699/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2700/// expansion results.
///
/// A constant step or a SCEVUnknown step is returned directly as its
/// underlying IR value; any other step must already have an entry in
/// \p ExpandedSCEVs (asserted).
// NOTE(review): the first line of the signature (embedded line 2701) is
// missing from this listing; only the trailing parameter is visible.
2702 const SCEV2ValueTy &ExpandedSCEVs) {
2703 const SCEV *Step = ID.getStep();
2704 if (auto *C = dyn_cast<SCEVConstant>(Step))
2705 return C->getValue();
2706 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2707 return U->getValue();
2708 auto I = ExpandedSCEVs.find(Step);
2709 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2710 return I->second;
2711}
2712
2713/// Knowing that loop \p L executes a single vector iteration, add instructions
2714/// that will get simplified and thus should not have any cost to \p
2715/// InstsToIgnore.
///
/// The latch compare (if any) is always added. An induction update is added
/// only when all of its users are the induction phi itself or the latch
/// compare.
// NOTE(review): the first lines of the signature (embedded lines 2716-2717)
// are missing from this listing; `L` and `IL` are declared there. `IL` is
// iterated as key/value pairs whose key is a `const PHINode *`.
2718 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2719 auto *Cmp = L->getLatchCmpInst();
2720 if (Cmp)
2721 InstsToIgnore.insert(Cmp);
2722 for (const auto &KV : IL) {
2723 // Extract the key by hand so that it can be used in the lambda below. Note
2724 // that captured structured bindings are a C++20 extension.
2725 const PHINode *IV = KV.first;
2726
2727 // Get next iteration value of the induction variable.
2728 Instruction *IVInst =
2729 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2730 if (all_of(IVInst->users(),
2731 [&](const User *U) { return U == IV || U == Cmp; }))
2732 InstsToIgnore.insert(IVInst);
2733 }
2734}
2735
// Creates resume values in the plan's scalar preheader for the induction phis
// of the scalar loop header.
//
// The leading recipes of Plan.getScalarHeader() are walked until the first one
// that does not wrap a PHINode. Each phi that Legal reports as an induction
// variable (and that is in \p IVSubset, when a subset is given) gets a resume
// value via createInductionResumeVPValue. While walking, the function notes
// whether one of the handled inductions has the type of VectorTripCount,
// starts at constant zero and steps by constant one. If \p IVSubset is given
// and no such induction was seen, an extra recipe named
// "vec.epilog.resume.val" is created.
//
// NOTE(review): the first line of the signature (embedded line 2736) and the
// first two argument lines of the final createNaryOp call (2772-2773) are
// missing from this listing.
2737 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
2738 SmallPtrSetImpl<PHINode *> *IVSubset) {
2739 // We are going to resume the execution of the scalar loop.
2740 // Go over all of the induction variable PHIs of the scalar loop header and
2741 // fix their starting values, which depend on the counter of the last
2742 // iteration of the vectorized loop. If we come from one of the
2743 // LoopBypassBlocks then we need to start from the original start value.
2744 // Otherwise we provide the trip count from the main vector loop.
2745 VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader();
2746 VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin());
2747 bool HasCanonical = false;
2748 for (VPRecipeBase &R : *Plan.getScalarHeader()) {
2749 auto *PhiR = cast<VPIRInstruction>(&R);
2750 auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction());
// Stop at the first wrapped instruction that is not a phi.
2751 if (!Phi)
2752 break;
2753 if (!Legal->getInductionVars().contains(Phi) ||
2754 (IVSubset && !IVSubset->contains(Phi)))
2755 continue;
2756 const InductionDescriptor &II = Legal->getInductionVars().find(Phi)->second;
2757 createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs),
2758 LoopBypassBlocks, ScalarPHBuilder,
2759 MainVectorTripCount);
2760 auto *ConstStart = dyn_cast<ConstantInt>(II.getStartValue());
2761 auto *ConstStep = II.getConstIntStepValue();
2762 if (Phi->getType() == VectorTripCount->getType() && ConstStart &&
2763 ConstStart->isZero() && ConstStep && ConstStep->isOne())
2764 HasCanonical = true;
2765 }
2766
2767 if (!IVSubset || HasCanonical)
2768 return;
2769 // When vectorizing the epilogue, create a resume phi for the canonical IV if
2770 // no suitable resume phi was already created.
2771 ScalarPHBuilder.createNaryOp(
2774 Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))},
2775 {}, "vec.epilog.resume.val");
2776}
2777
// Builds the control-flow skeleton around the future vector loop (see the
// diagram below), emits the induction resume values via
// createInductionResumeVPValues(ExpandedSCEVs), and returns
// LoopVectorPreHeader.
//
// NOTE(review): the first line of the signature (embedded line 2778) and the
// statements following each of the first four step comments (2817, 2824, 2828,
// 2833) are missing from this listing; only their explanatory comments
// remain. Going by those comments, the missing statements create the empty
// skeleton and emit the iteration-count, SCEV and memory runtime checks, in
// that order — confirm against the upstream file.
2779 const SCEV2ValueTy &ExpandedSCEVs) {
2780 /*
2781 In this function we generate a new loop. The new loop will contain
2782 the vectorized instructions while the old loop will continue to run the
2783 scalar remainder.
2784
2785 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2786 / | preheader are expanded here. Eventually all required SCEV
2787 / | expansion should happen here.
2788 / v
2789 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2790 | / |
2791 | / v
2792 || [ ] <-- vector pre header.
2793 |/ |
2794 | v
2795 | [ ] \
2796 | [ ]_| <-- vector loop (created during VPlan execution).
2797 | |
2798 | v
2799 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2800 | | successors created during VPlan execution)
2801 \/ |
2802 /\ v
2803 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2804 | |
2805 (opt) v <-- edge from middle to exit iff epilogue is not required.
2806 | [ ] \
2807 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2808 | | wrapped in VPIRBasicBlock).
2809 \ |
2810 \ v
2811 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2812 ...
2813 */
2814
2815 // Create an empty vector loop, and prepare basic blocks for the runtime
2816 // checks.
2818
2819 // Now, compare the new count to zero. If it is zero skip the vector loop and
2820 // jump to the scalar loop. This check also covers the case where the
2821 // backedge-taken count is uint##_max: adding one to it will overflow leading
2822 // to an incorrect trip count of zero. In this (rare) case we will also jump
2823 // to the scalar loop.
2825
2826 // Generate the code to check any assumptions that we've made for SCEV
2827 // expressions.
2829
2830 // Generate the code that checks in runtime if arrays overlap. We put the
2831 // checks into a separate block to make the more common case of few elements
2832 // faster.
2834
2835 // Emit phis for the new starting index of the scalar loop.
2836 createInductionResumeVPValues(ExpandedSCEVs);
2837
2838 return LoopVectorPreHeader;
2839}
2840
2841// Fix up external users of the induction variable. At this point, we are
2842// in LCSSA form, with all external PHIs that use the IV having one input value,
2843// coming from the remainder loop. We need those PHIs to also have a correct
2844// value for the IV when arriving directly from the middle block.
//
// Users outside OrigLoop of the post-increment value receive EndValue. Users
// outside OrigLoop of the phi itself receive an "ind.escape" value computed in
// MiddleBlock as EndValue stepped back by one Step. Each collected value is
// added to its LCSSA phi as the incoming value for MiddleBlock, unless the phi
// already has an entry for that block.
//
// NOTE(review): the first line of the signature (embedded line 2845, which
// declares OrigPhi), the inner argument of the EndValue lookup (2858) and the
// definition of PostInc (2863) are missing from this listing.
2846 const InductionDescriptor &II,
2847 Value *VectorTripCount,
2848 BasicBlock *MiddleBlock,
2849 VPTransformState &State) {
2850 // There are two kinds of external IV usages - those that use the value
2851 // computed in the last iteration (the PHI) and those that use the penultimate
2852 // value (the value that feeds into the phi from the loop latch).
2853 // We allow both, but they, obviously, have different values.
2854
// Maps each external LCSSA phi to the value it must receive from MiddleBlock.
2855 DenseMap<Value *, Value *> MissingVals;
2856
2857 Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2859 ->getIncomingValueForBlock(MiddleBlock);
2860
2861 // An external user of the last iteration's value should see the value that
2862 // the remainder loop uses to initialize its own IV.
2864 for (User *U : PostInc->users()) {
2865 Instruction *UI = cast<Instruction>(U);
2866 if (!OrigLoop->contains(UI)) {
2867 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2868 MissingVals[UI] = EndValue;
2869 }
2870 }
2871
2872 // An external user of the penultimate value needs to see EndValue - Step.
2873 // The simplest way to get this is to recompute it from the constituent SCEVs,
2874 // that is Start + (Step * (CRD - 1)).
2875 for (User *U : OrigPhi->users()) {
2876 auto *UI = cast<Instruction>(U);
2877 if (!OrigLoop->contains(UI)) {
2878 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2879 IRBuilder<> B(MiddleBlock->getTerminator());
2880
2881 // Fast-math-flags propagate from the original induction instruction.
2882 if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2883 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2884
2885 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2886 assert(StepVPV && "step must have been expanded during VPlan execution");
2887 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2888 : State.get(StepVPV, VPLane(0));
// Undo one step: integer sub, pointer add of the negated step, or the
// inverse floating-point operation of the induction's binary op.
2889 Value *Escape = nullptr;
2890 if (EndValue->getType()->isIntegerTy())
2891 Escape = B.CreateSub(EndValue, Step);
2892 else if (EndValue->getType()->isPointerTy())
2893 Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2894 else {
2895 assert(EndValue->getType()->isFloatingPointTy() &&
2896 "Unexpected induction type");
2897 Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2898 Instruction::FAdd
2899 ? Instruction::FSub
2900 : Instruction::FAdd,
2901 EndValue, Step);
2902 }
2903 Escape->setName("ind.escape");
2904 MissingVals[UI] = Escape;
2905 }
2906 }
2907
2908 assert((MissingVals.empty() ||
2909 all_of(MissingVals,
2910 [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2911 return all_of(
2912 predecessors(cast<Instruction>(P.first)->getParent()),
2913 [MiddleBlock, this](BasicBlock *Pred) {
2914 return Pred == MiddleBlock ||
2915 Pred == OrigLoop->getLoopLatch();
2916 });
2917 })) &&
2918 "Expected escaping values from latch/middle.block only");
2919
2920 for (auto &I : MissingVals) {
2921 PHINode *PHI = cast<PHINode>(I.first);
2922 // One corner case we have to handle is two IVs "chasing" each-other,
2923 // that is %IV2 = phi [...], [ %IV1, %latch ]
2924 // In this case, if IV1 has an external use, we need to avoid adding both
2925 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2926 // don't already have an incoming value for the middle block.
2927 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2928 PHI->addIncoming(I.second, MiddleBlock);
2929 }
2930}
2931
2932namespace {
2933
// Key traits that let instructions be compared and hashed by content rather
// than by pointer identity: two keys are equal when isIdenticalTo() holds, and
// the hash combines the opcode with the operand values.
// NOTE(review): presumably used as the key-info of the map in cse() below; the
// map's declaration is not visible in this listing. The bodies of
// getEmptyKey() and getTombstoneKey() (embedded lines 2941 and 2945) are also
// missing.
2934struct CSEDenseMapInfo {
// Only insertelement, extractelement, shufflevector and getelementptr
// instructions are eligible.
2935 static bool canHandle(const Instruction *I) {
2936 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2937 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2938 }
2939
2940 static inline Instruction *getEmptyKey() {
2942 }
2943
2944 static inline Instruction *getTombstoneKey() {
2946 }
2947
2948 static unsigned getHashValue(const Instruction *I) {
2949 assert(canHandle(I) && "Unknown instruction!");
2950 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2951 I->value_op_end()));
2952 }
2953
2954 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
// The empty/tombstone sentinels are compared by pointer only; they must not
// be passed to isIdenticalTo().
2955 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2956 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2957 return LHS == RHS;
2958 return LHS->isIdenticalTo(RHS);
2959 }
2960};
2961
2962} // end anonymous namespace
2963
2964/// Perform CSE of induction variable instructions.
///
/// Walks \p BB once. Each instruction accepted by CSEDenseMapInfo::canHandle
/// is looked up in CSEMap: if an equivalent instruction was already seen, all
/// uses are redirected to it and the duplicate is erased; otherwise the
/// instruction is recorded as the representative.
// NOTE(review): the declaration of CSEMap (embedded line 2967) is missing from
// this listing.
2965static void cse(BasicBlock *BB) {
2966 // Perform simple cse.
// make_early_inc_range keeps the iteration valid while In is erased.
2968 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2969 if (!CSEDenseMapInfo::canHandle(&In))
2970 continue;
2971
2972 // Check if we can replace this instruction with any of the
2973 // visited instructions.
2974 if (Instruction *V = CSEMap.lookup(&In)) {
2975 In.replaceAllUsesWith(V);
2976 In.eraseFromParent();
2977 continue;
2978 }
2979
2980 CSEMap[&In] = &In;
2981 }
2982}
2983
// Returns the cost of the call \p CI at vectorization factor \p VF.
//
// For a non-scalar VF the cost is read from the pre-computed
// CallWideningDecisions entry for (CI, VF). For a scalar VF, the
// reduction-pattern cost is returned when getReductionPatternCost yields one;
// otherwise the result is ScalarCallCost, or the minimum of ScalarCallCost and
// getVectorIntrinsicCost(CI, VF) inside a conditional whose condition is not
// visible here.
//
// NOTE(review): embedded lines 2984-2985 (start of the signature), 2992, 2994,
// 2998, 3003 and 3006 are missing from this listing. That hides the
// definitions of CostKind and Tys, the initializer of ScalarCallCost and the
// condition that selects the intrinsic path — confirm against the upstream
// file.
2986 ElementCount VF) const {
2987 // We only need to calculate a cost if the VF is scalar; for actual vectors
2988 // we should already have a pre-calculated cost at each VF.
2989 if (!VF.isScalar())
2990 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2991
2993 Type *RetTy = CI->getType();
2995 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2996 return *RedCost;
2997
2999 for (auto &ArgOp : CI->args())
3000 Tys.push_back(ArgOp->getType());
3001
3002 InstructionCost ScalarCallCost =
3004
3005 // If this is an intrinsic we may have a lower cost for it.
3007 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3008 return std::min(ScalarCallCost, IntrinsicCost);
3009 }
3010 return ScalarCallCost;
3011}
3012
// Returns the type to use for \p Elt at vectorization factor \p VF: \p Elt
// itself when VF is scalar or when \p Elt is neither an integer/pointer type
// nor a floating-point type, otherwise the vector type VectorType::get(Elt,
// VF).
// NOTE(review): the signature line (embedded line 3013) is missing from this
// listing.
3014 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3015 return Elt;
3016 return VectorType::get(Elt, VF);
3017}
3018
// Returns the TTI cost of the intrinsic call \p CI at vectorization factor
// \p VF. The return type and each parameter type of FTy are mapped through
// maybeVectorizeType, the call's fast-math flags are forwarded when CI is an
// FPMathOperator, and the result of TTI.getIntrinsicInstrCost on the assembled
// IntrinsicCostAttributes is returned.
//
// NOTE(review): embedded lines 3019-3020 (start of the signature), 3022, 3029,
// 3030 and 3039 are missing from this listing. That hides the definitions of
// ID, Arguments and FTy and the final argument of getIntrinsicInstrCost —
// confirm against the upstream file.
3021 ElementCount VF) const {
3023 assert(ID && "Expected intrinsic call!");
3024 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
3025 FastMathFlags FMF;
3026 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3027 FMF = FPMO->getFastMathFlags();
3028
3031 SmallVector<Type *> ParamTys;
3032 std::transform(FTy->param_begin(), FTy->param_end(),
3033 std::back_inserter(ParamTys),
3034 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
3035
3036 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3037 dyn_cast<IntrinsicInst>(CI));
3038 return TTI.getIntrinsicInstrCost(CostAttrs,
3040}
3041
// Post-processing of the generated vector loop. Visible steps, in order:
//  - fix up widened non-induction phis (fixNonInductionPHIs);
//  - with an uncountable early exit and an enclosing outer loop, add the IR
//    blocks between the vector region and the middle block to the outer loop;
//  - walk the phis of the original loop's exit blocks (loop body missing);
//  - unless a scalar epilogue is required, fix external users of the
//    induction variables (fixupIVUsers);
//  - sink scalar operands of predicated instructions (sinkScalarOperands);
//  - run cse() on the vector loop header block;
//  - update profile weights (call head missing).
//
// NOTE(review): embedded lines 3042 (signature), 3044, 3048-3049, 3074, 3091,
// 3094 and 3117 are missing from this listing. That hides, among others, the
// body of the exit-phi loop, the trailing arguments of fixupIVUsers, the loop
// that defines PI and the head of the final call — confirm against the
// upstream file.
3043 // Fix widened non-induction PHIs by setting up the PHI operands.
3045 fixNonInductionPHIs(State);
3046
3047 // Forget the original basic block.
3050
3051 // When dealing with uncountable early exits we create middle.split blocks
3052 // between the vector loop region and the exit block. These blocks need
3053 // adding to any outer loop.
3054 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3055 Loop *OuterLoop = OrigLoop->getParentLoop();
3056 if (Legal->hasUncountableEarlyExit() && OuterLoop) {
3057 VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock();
3058 VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor();
// Walk single predecessors back from the middle block until the vector
// region is reached.
3059 while (PredVPBB && PredVPBB != VectorRegion) {
3060 BasicBlock *MiddleSplitBB =
3061 State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)];
3062 OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI);
3063 PredVPBB = PredVPBB->getSinglePredecessor();
3064 }
3065 }
3066
3067 // After vectorization, the exit blocks of the original loop will have
3068 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3069 // looked through single-entry phis.
3070 SmallVector<BasicBlock *> ExitBlocks;
3071 OrigLoop->getExitBlocks(ExitBlocks);
3072 for (BasicBlock *Exit : ExitBlocks)
3073 for (PHINode &PN : Exit->phis())
3075
3076 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3077 // No edge from the middle block to the unique exit block has been inserted
3078 // and there is nothing to fix from vector loop; phis should have incoming
3079 // from scalar loop only.
3080 } else {
3081 // TODO: Check in VPlan to see if IV users need fixing instead of checking
3082 // the cost model.
3083
3084 // If we inserted an edge from the middle block to the unique exit block,
3085 // update uses outside the loop (phis) to account for the newly inserted
3086 // edge.
3087
3088 // Fix-up external users of the induction variables.
3089 for (const auto &Entry : Legal->getInductionVars())
3090 fixupIVUsers(Entry.first, Entry.second,
3092 }
3093
3095 sinkScalarOperands(&*PI);
3096
3097 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3098 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3099
3100 // Remove redundant induction instructions.
3101 cse(HeaderBB);
3102
3103 // Set/update profile weights for the vector and remainder loops as original
3104 // loop iterations are now distributed among them. Note that original loop
3105 // becomes the scalar remainder loop after vectorization.
3106 //
3107 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3108 // end up getting slightly roughened result but that should be OK since
3109 // profile is not inherently precise anyway. Note also possible bypass of
3110 // vector code caused by legality checks is ignored, assigning all the weight
3111 // to the vector loop, optimistically.
3112 //
3113 // For scalable vectorization we can't know at compile time how many
3114 // iterations of the loop are handled in one vector iteration, so instead
3115 // assume a pessimistic vscale of '1'.
3116 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3118 VF.getKnownMinValue() * UF);
3119}
3120
// Sinks the operands of the predicated instruction PredInst into PredInst's
// own block whenever that is legal, and repeats for the operands of each
// instruction that was sunk.
//
// An operand is a candidate only if it is an instruction inside the vector
// loop, is not a phi, has no side effects and does not read memory. It is
// moved when every one of its uses occurs in the predicated block (phi uses
// count as occurring in the corresponding incoming block). Candidates that
// cannot be moved yet are retried on the next pass; the process stops after a
// pass that moves nothing.
//
// NOTE(review): the signature line (embedded line 3121) is missing from this
// listing; PredInst is declared there.
3122 // The basic block and loop containing the predicated instruction.
3123 auto *PredBB = PredInst->getParent();
3124 auto *VectorLoop = LI->getLoopFor(PredBB);
3125
3126 // Initialize a worklist with the operands of the predicated instruction.
3127 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3128
3129 // Holds instructions that we need to analyze again. An instruction may be
3130 // reanalyzed if we don't yet know if we can sink it or not.
3131 SmallVector<Instruction *, 8> InstsToReanalyze;
3132
3133 // Returns true if a given use occurs in the predicated block. Phi nodes use
3134 // their operands in their corresponding predecessor blocks.
3135 auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3136 auto *I = cast<Instruction>(U.getUser());
3137 BasicBlock *BB = I->getParent();
3138 if (auto *Phi = dyn_cast<PHINode>(I))
3139 BB = Phi->getIncomingBlock(
3140 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3141 return BB == PredBB;
3142 };
3143
3144 // Iteratively sink the scalarized operands of the predicated instruction
3145 // into the block we created for it. When an instruction is sunk, its
3146 // operands are then added to the worklist. The algorithm ends after one pass
3147 // through the worklist doesn't sink a single instruction.
3148 bool Changed;
3149 do {
3150 // Add the instructions that need to be reanalyzed to the worklist, and
3151 // reset the changed indicator.
3152 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3153 InstsToReanalyze.clear();
3154 Changed = false;
3155
3156 while (!Worklist.empty()) {
3157 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3158
3159 // We can't sink an instruction if it is a phi node, is not in the loop,
3160 // may have side effects or may read from memory.
3161 // TODO: Could do more granular checking to allow sinking
3162 // a load past non-store instructions.
3163 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3164 I->mayHaveSideEffects() || I->mayReadFromMemory())
3165 continue;
3166
3167 // If the instruction is already in PredBB, check if we can sink its
3168 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3169 // sinking the scalar instruction I, hence it appears in PredBB; but it
3170 // may have failed to sink I's operands (recursively), which we try
3171 // (again) here.
3172 if (I->getParent() == PredBB) {
3173 Worklist.insert(I->op_begin(), I->op_end());
3174 continue;
3175 }
3176
3177 // It's legal to sink the instruction if all its uses occur in the
3178 // predicated block. Otherwise, there's nothing to do yet, and we may
3179 // need to reanalyze the instruction.
3180 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3181 InstsToReanalyze.push_back(I);
3182 continue;
3183 }
3184
3185 // Move the instruction to the beginning of the predicated block, and add
3186 // its operands to the worklist.
3187 I->moveBefore(&*PredBB->getFirstInsertionPt());
3188 Worklist.insert(I->op_begin(), I->op_end());
3189
3190 // The sinking may have enabled other instructions to be sunk, so we will
3191 // need to iterate.
3192 Changed = true;
3193 }
3194 } while (Changed);
3195}
3196
// Completes the IR phis generated for VPWidenPHIRecipe recipes: for every such
// recipe reachable from the plan entry, each (incoming value, incoming block)
// pair of the recipe is added to the generated PHINode, translating the value
// through State.get and the block through State.CFG.VPBB2IRBB.
// NOTE(review): the signature line (embedded line 3197) is missing from this
// listing; State is declared there.
3198 auto Iter = vp_depth_first_deep(Plan.getEntry());
3199 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3200 for (VPRecipeBase &P : VPBB->phis()) {
3201 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3202 if (!VPPhi)
3203 continue;
3204 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3205 // Make sure the builder has a valid insert point.
3206 Builder.SetInsertPoint(NewPhi);
3207 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3208 VPValue *Inc = VPPhi->getIncomingValue(Idx);
// Note: this VPBB is the incoming block and shadows the outer loop
// variable of the same name.
3209 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3210 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3211 }
3212 }
3213 }
3214}
3215
// Computes the set of instructions that remain scalar after vectorizing with
// factor \p VF and stores it in Scalars[VF].
//
// For scalable VFs the result is simply Uniforms[VF]. Otherwise a worklist is
// seeded with Uniforms[VF], with loop-varying GEPs whose every use is a scalar
// use by a load/store, and with ForcedScalars[VF]. It is then grown through
// operand-0 GEPs of already-scalar instructions, and finally through induction
// variables (and their updates) whose in-loop users are all scalar.
//
// NOTE(review): embedded lines 3231, 3235 and 3362 are missing from this
// listing. That hides the declarations of Worklist and ScalarPtrs and the
// induction-kind constant compared inside IsDirectLoadStoreFromPtrIndvar.
3216void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3217 // We should not collect Scalars more than once per VF. Right now, this
3218 // function is called from collectUniformsAndScalars(), which already does
3219 // this check. Collecting Scalars for VF=1 does not make any sense.
3220 assert(VF.isVector() && !Scalars.contains(VF) &&
3221 "This function should not be visited twice for the same VF");
3222
3223 // This avoids any chances of creating a REPLICATE recipe during planning
3224 // since that would result in generation of scalarized code during execution,
3225 // which is not supported for scalable vectors.
3226 if (VF.isScalable()) {
3227 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3228 return;
3229 }
3230
3232
3233 // These sets are used to seed the analysis with pointers used by memory
3234 // accesses that will remain scalar.
3236 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3237 auto *Latch = TheLoop->getLoopLatch();
3238
3239 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3240 // The pointer operands of loads and stores will be scalar as long as the
3241 // memory access is not a gather or scatter operation. The value operand of a
3242 // store will remain scalar if the store is scalarized.
3243 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3244 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3245 assert(WideningDecision != CM_Unknown &&
3246 "Widening decision should be ready at this moment");
3247 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3248 if (Ptr == Store->getValueOperand())
3249 return WideningDecision == CM_Scalarize;
3250 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3251 "Ptr is neither a value or pointer operand");
3252 return WideningDecision != CM_GatherScatter;
3253 };
3254
3255 // A helper that returns true if the given value is a getelementptr
3256 // instruction contained in the loop.
3257 auto IsLoopVaryingGEP = [&](Value *V) {
3258 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3259 };
3260
3261 // A helper that evaluates a memory access's use of a pointer. If the use will
3262 // be a scalar use and the pointer is only used by memory accesses, we place
3263 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3264 // PossibleNonScalarPtrs.
3265 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3266 // We only care about bitcast and getelementptr instructions contained in
3267 // the loop.
3268 if (!IsLoopVaryingGEP(Ptr))
3269 return;
3270
3271 // If the pointer has already been identified as scalar (e.g., if it was
3272 // also identified as uniform), there's nothing to do.
3273 auto *I = cast<Instruction>(Ptr);
3274 if (Worklist.count(I))
3275 return;
3276
3277 // If the use of the pointer will be a scalar use, and all users of the
3278 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3279 // place the pointer in PossibleNonScalarPtrs.
3280 if (IsScalarUse(MemAccess, Ptr) &&
3281 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3282 ScalarPtrs.insert(I);
3283 else
3284 PossibleNonScalarPtrs.insert(I);
3285 };
3286
3287 // We seed the scalars analysis with three classes of instructions: (1)
3288 // instructions marked uniform-after-vectorization and (2) bitcast,
3289 // getelementptr and (pointer) phi instructions used by memory accesses
3290 // requiring a scalar use.
3291 //
3292 // (1) Add to the worklist all instructions that have been identified as
3293 // uniform-after-vectorization.
3294 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3295
3296 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3297 // memory accesses requiring a scalar use. The pointer operands of loads and
3298 // stores will be scalar unless the operation is a gather or scatter.
3299 // The value operand of a store will remain scalar if the store is scalarized.
3300 for (auto *BB : TheLoop->blocks())
3301 for (auto &I : *BB) {
3302 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3303 EvaluatePtrUse(Load, Load->getPointerOperand());
3304 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3305 EvaluatePtrUse(Store, Store->getPointerOperand());
3306 EvaluatePtrUse(Store, Store->getValueOperand());
3307 }
3308 }
// A pointer counts as scalar only if no use placed it in
// PossibleNonScalarPtrs.
3309 for (auto *I : ScalarPtrs)
3310 if (!PossibleNonScalarPtrs.count(I)) {
3311 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3312 Worklist.insert(I);
3313 }
3314
3315 // Insert the forced scalars.
3316 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3317 // induction variable when the PHI user is scalarized.
3318 auto ForcedScalar = ForcedScalars.find(VF);
3319 if (ForcedScalar != ForcedScalars.end())
3320 for (auto *I : ForcedScalar->second) {
3321 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3322 Worklist.insert(I);
3323 }
3324
3325 // Expand the worklist by looking through any bitcasts and getelementptr
3326 // instructions we've already identified as scalar. This is similar to the
3327 // expansion step in collectLoopUniforms(); however, here we're only
3328 // expanding to include additional bitcasts and getelementptr instructions.
// Index-based iteration: the worklist grows while it is being traversed.
3329 unsigned Idx = 0;
3330 while (Idx != Worklist.size()) {
3331 Instruction *Dst = Worklist[Idx++];
3332 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3333 continue;
3334 auto *Src = cast<Instruction>(Dst->getOperand(0));
3335 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3336 auto *J = cast<Instruction>(U);
3337 return !TheLoop->contains(J) || Worklist.count(J) ||
3338 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3339 IsScalarUse(J, Src));
3340 })) {
3341 Worklist.insert(Src);
3342 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3343 }
3344 }
3345
3346 // An induction variable will remain scalar if all users of the induction
3347 // variable and induction variable update remain scalar.
3348 for (const auto &Induction : Legal->getInductionVars()) {
3349 auto *Ind = Induction.first;
3350 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3351
3352 // If tail-folding is applied, the primary induction variable will be used
3353 // to feed a vector compare.
3354 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3355 continue;
3356
3357 // Returns true if \p Indvar is a pointer induction that is used directly by
3358 // load/store instruction \p I.
3359 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3360 Instruction *I) {
3361 return Induction.second.getKind() ==
3363 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3364 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3365 };
3366
3367 // Determine if all users of the induction variable are scalar after
3368 // vectorization.
3369 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3370 auto *I = cast<Instruction>(U);
3371 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3372 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3373 });
3374 if (!ScalarInd)
3375 continue;
3376
3377 // If the induction variable update is a fixed-order recurrence, neither the
3378 // induction variable nor its update should be marked scalar after
3379 // vectorization.
3380 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3381 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3382 continue;
3383
3384 // Determine if all users of the induction variable update instruction are
3385 // scalar after vectorization.
3386 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3387 auto *I = cast<Instruction>(U);
3388 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3389 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3390 });
3391 if (!ScalarIndUpdate)
3392 continue;
3393
3394 // The induction variable and its update instruction will remain scalar.
3395 Worklist.insert(Ind);
3396 Worklist.insert(IndUpdate);
3397 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3398 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3399 << "\n");
3400 }
3401
3402 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3403}
3404
// Returns true if \p I is a predicated instruction (per isPredicatedInst) that
// has no non-scalar lowering at \p VF and therefore must be scalarized with
// predication.
//  - Calls: true for scalar VF, otherwise true iff the recorded call widening
//    decision is CM_Scalarize.
//  - Loads/stores: true iff neither a legal masked load/store nor a legal
//    masked gather/scatter is available.
//  - UDiv/SDiv/SRem/URem: decided by isDivRemScalarWithPredication from the
//    two costs returned by getDivRemSpeculationCost.
//  - Any other opcode: true.
//
// NOTE(review): the first line of the signature (embedded line 3405) and the
// definition of Ptr in the load/store case (3422) are missing from this
// listing.
3406 Instruction *I, ElementCount VF) const {
3407 if (!isPredicatedInst(I))
3408 return false;
3409
3410 // Do we have a non-scalar lowering for this predicated
3411 // instruction? No - it is scalar with predication.
3412 switch(I->getOpcode()) {
3413 default:
3414 return true;
3415 case Instruction::Call:
3416 if (VF.isScalar())
3417 return true;
3418 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3419 .Kind == CM_Scalarize;
3420 case Instruction::Load:
3421 case Instruction::Store: {
3423 auto *Ty = getLoadStoreType(I);
// The masked load/store query uses the scalar type Ty; the gather/scatter
// query uses VTy, which is the vector type when VF is a vector.
3424 Type *VTy = Ty;
3425 if (VF.isVector())
3426 VTy = VectorType::get(Ty, VF);
3427 const Align Alignment = getLoadStoreAlignment(I);
3428 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3429 TTI.isLegalMaskedGather(VTy, Alignment))
3430 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3431 TTI.isLegalMaskedScatter(VTy, Alignment));
3432 }
3433 case Instruction::UDiv:
3434 case Instruction::SDiv:
3435 case Instruction::SRem:
3436 case Instruction::URem: {
3437 // We have the option to use the safe-divisor idiom to avoid predication.
3438 // The cost based decision here will always select safe-divisor for
3439 // scalable vectors as scalarization isn't legal.
3440 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3441 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3442 }
3443 }
3444}
3445
3446// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
//
// Decides whether instruction I must execute under a mask. Returns false early
// when I's block needs no predication for any reason, when I is a
// load/store/call for which Legal reports no mask requirement, or when I is a
// branch, switch, phi or alloca. Returns true when I's block was conditional
// in the original loop (Legal->blockNeedsPredication). Otherwise only the
// tail-folding mask is involved and the answer depends on the opcode (see the
// switch below).
//
// NOTE(review): embedded lines 3447 (signature), 3452 (one disjunct of the
// early-exit condition), 3469, 3474, 3479 and 3485 are missing from this
// listing. That hides the statement heads of the default and Call cases and
// the return expressions of the Load case and the start of the Store case —
// confirm against the upstream file.
3448 // If predication is not needed, avoid it.
3449 // TODO: We can use the loop-preheader as context point here and get
3450 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3451 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3453 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3454 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3455 return false;
3456
3457 // If the instruction was executed conditionally in the original scalar loop,
3458 // predication is needed with a mask whose lanes are all possibly inactive.
3459 if (Legal->blockNeedsPredication(I->getParent()))
3460 return true;
3461
3462 // All that remain are instructions with side-effects originally executed in
3463 // the loop unconditionally, but now execute under a tail-fold mask (only)
3464 // having at least one active lane (the first). If the side-effects of the
3465 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3466 // - it will cause the same side-effects as when masked.
3467 switch(I->getOpcode()) {
3468 default:
3470 "instruction should have been considered by earlier checks");
3471 case Instruction::Call:
3472 // Side-effects of a Call are assumed to be non-invariant, needing a
3473 // (fold-tail) mask.
3475 "should have returned earlier for calls not needing a mask");
3476 return true;
3477 case Instruction::Load:
3478 // If the address is loop invariant no predication is needed.
3480 case Instruction::Store: {
3481 // For stores, we need to prove both speculation safety (which follows from
3482 // the same argument as loads), but also must prove the value being stored
3483 // is correct. The easiest form of the latter is to require that all values
3484 // stored are the same.
3486 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3487 }
3488 case Instruction::UDiv:
3489 case Instruction::SDiv:
3490 case Instruction::SRem:
3491 case Instruction::URem:
3492 // If the divisor is loop-invariant no predication is needed.
3493 return !TheLoop->isLoopInvariant(I->getOperand(1));
3494 }
3495}
3496
// Computes two alternative costs for a UDiv/SDiv/SRem/URem instruction I at
// vectorization factor VF and returns them as
// {ScalarizationCost, SafeDivisorCost}:
//  - ScalarizationCost: per-lane PHI + scalar arithmetic + scalarization
//    overhead, divided by getReciprocalPredBlockProb(). It is left invalid
//    for scalable VFs.
//  - SafeDivisorCost: a vector select plus the vector arithmetic op.
//
// NOTE(review): the line carrying the function name and first parameter
// (3498) is missing from this listing, as are 3504, 3506, 3545 and 3553.
// CostKind is used below but its definition is not visible here. Presumably
// this is the helper called elsewhere as getDivRemSpeculationCost(I, VF) --
// confirm against the complete source.
3497std::pair<InstructionCost, InstructionCost>
3499 ElementCount VF) const {
3500 assert(I->getOpcode() == Instruction::UDiv ||
3501 I->getOpcode() == Instruction::SDiv ||
3502 I->getOpcode() == Instruction::SRem ||
3503 I->getOpcode() == Instruction::URem);
3505
3507
3508 // Scalarization isn't legal for scalable vector types
3509 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3510 if (!VF.isScalable()) {
3511 // Get the scalarization cost and scale this amount by the probability of
3512 // executing the predicated block. If the instruction is not predicated,
3513 // we fall through to the next case.
3514 ScalarizationCost = 0;
3515
3516 // These instructions have a non-void type, so account for the phi nodes
3517 // that we will create. This cost is likely to be zero. The phi node
3518 // cost, if any, should be scaled by the block probability because it
3519 // models a copy at the end of each predicated block.
3520 ScalarizationCost += VF.getKnownMinValue() *
3521 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3522
3523 // The cost of the non-predicated instruction.
3524 ScalarizationCost += VF.getKnownMinValue() *
3525 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3526
3527 // The cost of insertelement and extractelement instructions needed for
3528 // scalarization.
3529 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3530
3531 // Scale the cost by the probability of executing the predicated blocks.
3532 // This assumes the predicated block for each vector lane is equally
3533 // likely.
3534 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3535 }
3536 InstructionCost SafeDivisorCost = 0;
3537
3538 auto *VecTy = toVectorTy(I->getType(), VF);
3539
3540 // The cost of the select guard to ensure all lanes are well defined
3541 // after we speculate above any internal control flow.
3542 SafeDivisorCost +=
3543 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3544 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3546
3547 // Certain instructions can be cheaper to vectorize if they have a constant
3548 // second vector operand. One example of this are shifts on x86.
3549 Value *Op2 = I->getOperand(1);
3550 auto Op2Info = TTI.getOperandInfo(Op2);
3551 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3552 Legal->isInvariant(Op2))
3554
3555 SmallVector<const Value *, 4> Operands(I->operand_values());
3556 SafeDivisorCost += TTI.getArithmeticInstrCost(
3557 I->getOpcode(), VecTy, CostKind,
3558 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3559 Op2Info, Operands, I);
3560 return {ScalarizationCost, SafeDivisorCost};
3561}
3562
3564 Instruction *I, ElementCount VF) const {
  // Decides whether the interleave group containing memory instruction I can
  // be widened at VF. From the visible code it returns false when:
  //  - the scalar type is irregular (needs padding),
  //  - VF is scalable and the interleave factor is not a power of 2,
  //  - group members disagree on being non-integral pointers, or are
  //    non-integral pointers in different address spaces,
  //  - masking is required and the group is reversed.
  // It returns true when no masking is required, and otherwise defers to the
  // target's masked load/store legality.
  //
  // NOTE(review): the first line of the signature (3563, with the return type
  // and function name) is missing from this listing, as are 3566, 3611, 3614
  // and 3625, so the conditions shown for the three "RequiresMasking" flags
  // are incomplete here.
3565 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3567 "Decision should not be set yet.");
3568 auto *Group = getInterleavedAccessGroup(I);
3569 assert(Group && "Must have a group.");
3570 unsigned InterleaveFactor = Group->getFactor();
3571
3572 // If the instruction's allocated size doesn't equal its type size, it
3573 // requires padding and will be scalarized.
3574 auto &DL = I->getDataLayout();
3575 auto *ScalarTy = getLoadStoreType(I);
3576 if (hasIrregularType(ScalarTy, DL))
3577 return false;
3578
3579 // For scalable vectors, the only interleave factor currently supported
3580 // must be power of 2 since we require the (de)interleave2 intrinsics
3581 // instead of shufflevectors.
3582 if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
3583 return false;
3584
3585 // If the group involves a non-integral pointer, we may not be able to
3586 // losslessly cast all values to a common type.
3587 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3588 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3589 Instruction *Member = Group->getMember(Idx);
  // A null member is skipped rather than treated as an error.
3590 if (!Member)
3591 continue;
3592 auto *MemberTy = getLoadStoreType(Member);
3593 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3594 // Don't coerce non-integral pointers to integers or vice versa.
3595 if (MemberNI != ScalarNI)
3596 // TODO: Consider adding special nullptr value case here
3597 return false;
3598 if (MemberNI && ScalarNI &&
3599 ScalarTy->getPointerAddressSpace() !=
3600 MemberTy->getPointerAddressSpace())
3601 return false;
3602 }
3603
3604 // Check if masking is required.
3605 // A Group may need masking for one of two reasons: it resides in a block that
3606 // needs predication, or it was decided to use masking to deal with gaps
3607 // (either a gap at the end of a load-access that may result in a speculative
3608 // load, or any gaps in a store-access).
3609 bool PredicatedAccessRequiresMasking =
3610 blockNeedsPredicationForAnyReason(I->getParent()) &&
3612 bool LoadAccessWithGapsRequiresEpilogMasking =
3613 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3615 bool StoreAccessWithGapsRequiresMasking =
3616 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3617 if (!PredicatedAccessRequiresMasking &&
3618 !LoadAccessWithGapsRequiresEpilogMasking &&
3619 !StoreAccessWithGapsRequiresMasking)
3620 return true;
3621
3622 // If masked interleaving is required, we expect that the user/target had
3623 // enabled it, because otherwise it either wouldn't have been created or
3624 // it should have been invalidated by the CostModel.
3626 "Masked interleave-groups for predicated accesses are not enabled.");
3627
3628 if (Group->isReverse())
3629 return false;
3630
3631 auto *Ty = getLoadStoreType(I);
3632 const Align Alignment = getLoadStoreAlignment(I);
3633 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3634 : TTI.isLegalMaskedStore(Ty, Alignment);
3635}
3636
3638 Instruction *I, ElementCount VF) {
  // Decides whether load/store I can be widened at VF. Returns true only when
  // all three visible checks pass: the pointer is consecutive according to
  // Legal, I is not scalar-with-predication at VF, and the scalar type is not
  // irregular.
  //
  // NOTE(review): the first line of the signature (3637) and line 3642 are
  // missing from this listing; Ptr is used below but its definition is not
  // visible here (presumably I's pointer operand -- confirm).
3639 // Get and ensure we have a valid memory instruction.
3640 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3641
3643 auto *ScalarTy = getLoadStoreType(I);
3644
3645 // In order to be widened, the pointer should be consecutive, first of all.
3646 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3647 return false;
3648
3649 // If the instruction is a store located in a predicated block, it will be
3650 // scalarized.
3651 if (isScalarWithPredication(I, VF))
3652 return false;
3653
3654 // If the instruction's allocated size doesn't equal its type size, it
3655 // requires padding and will be scalarized.
3656 auto &DL = I->getDataLayout();
3657 if (hasIrregularType(ScalarTy, DL))
3658 return false;
3659
3660 return true;
3661}
3662
// Collects the in-loop instructions that are treated as uniform (only lane 0
// demanded) for vectorization factor VF and stores them in Uniforms[VF].
//
// The visible steps are:
//  1. seed a worklist with single-use compare conditions of exiting blocks,
//     selected intrinsics, extractvalue instructions and uniform memory ops;
//  2. add pointer operands whose users are all vectorized memory accesses
//     inside the loop;
//  3. grow the worklist with operands whose users are all already uniform;
//  4. handle induction variables and their updates separately, because the
//     phi cycle can never satisfy step 3.
// Anything outside the loop, or requiring predication, is never added.
//
// NOTE(review): lines 3714, 3717, 3786, 3804 and 3844 are missing from this
// listing (the declaration of Exiting, a guard on the exiting-block loop, one
// intrinsic case, the definition of Ptr and the recurrence-phi check).
3663void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3664 // We should not collect Uniforms more than once per VF. Right now,
3665 // this function is called from collectUniformsAndScalars(), which
3666 // already does this check. Collecting Uniforms for VF=1 does not make any
3667 // sense.
3668
3669 assert(VF.isVector() && !Uniforms.contains(VF) &&
3670 "This function should not be visited twice for the same VF");
3671
3672 // Visit the list of Uniforms. If we find no uniform value, we won't
3673 // analyze again. Uniforms.count(VF) will return 1.
3674 Uniforms[VF].clear();
3675
3676 // Now we know that the loop is vectorizable!
3677 // Collect instructions inside the loop that will remain uniform after
3678 // vectorization.
3679
3680 // Global values, params and instructions outside of current loop are out of
3681 // scope.
3682 auto IsOutOfScope = [&](Value *V) -> bool {
3683 Instruction *I = dyn_cast<Instruction>(V);
3684 return (!I || !TheLoop->contains(I));
3685 };
3686
3687 // Worklist containing uniform instructions demanding lane 0.
3688 SetVector<Instruction *> Worklist;
3689
3690 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3691 // that require predication must not be considered uniform after
3692 // vectorization, because that would create an erroneous replicating region
3693 // where only a single instance out of VF should be formed.
3694 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3695 if (IsOutOfScope(I)) {
3696 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3697 << *I << "\n");
3698 return;
3699 }
3700 if (isPredicatedInst(I)) {
3701 LLVM_DEBUG(
3702 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3703 << "\n");
3704 return;
3705 }
3706 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3707 Worklist.insert(I);
3708 };
3709
3710 // Start with the conditional branches exiting the loop. If the branch
3711 // condition is an instruction contained in the loop that is only used by the
3712 // branch, it is uniform. Note conditions from uncountable early exits are not
3713 // uniform.
3715 TheLoop->getExitingBlocks(Exiting);
3716 for (BasicBlock *E : Exiting) {
3718 continue;
3719 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3720 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3721 AddToWorklistIfAllowed(Cmp);
3722 }
3723
3724 auto PrevVF = VF.divideCoefficientBy(2);
3725 // Return true if all lanes perform the same memory operation, and we can
3726 // thus choose to execute only one.
3727 auto IsUniformMemOpUse = [&](Instruction *I) {
3728 // If the value was already known to not be uniform for the previous
3729 // (smaller VF), it cannot be uniform for the larger VF.
3730 if (PrevVF.isVector()) {
3731 auto Iter = Uniforms.find(PrevVF);
3732 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3733 return false;
3734 }
3735 if (!Legal->isUniformMemOp(*I, VF))
3736 return false;
3737 if (isa<LoadInst>(I))
3738 // Loading the same address always produces the same result - at least
3739 // assuming aliasing and ordering which have already been checked.
3740 return true;
3741 // Storing the same value on every iteration.
3742 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3743 };
3744
  // Returns true if I is a uniform memory op, or its widening decision is one
  // of CM_Widen, CM_Widen_Reverse or CM_Interleave.
3745 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3746 InstWidening WideningDecision = getWideningDecision(I, VF);
3747 assert(WideningDecision != CM_Unknown &&
3748 "Widening decision should be ready at this moment");
3749
3750 if (IsUniformMemOpUse(I))
3751 return true;
3752
3753 return (WideningDecision == CM_Widen ||
3754 WideningDecision == CM_Widen_Reverse ||
3755 WideningDecision == CM_Interleave);
3756 };
3757
3758 // Returns true if Ptr is the pointer operand of a memory access instruction
3759 // I, I is known to not require scalarization, and the pointer is not also
3760 // stored.
3761 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3762 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3763 return false;
3764 return getLoadStorePointerOperand(I) == Ptr &&
3765 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3766 };
3767
3768 // Holds a list of values which are known to have at least one uniform use.
3769 // Note that there may be other uses which aren't uniform. A "uniform use"
3770 // here is something which only demands lane 0 of the unrolled iterations;
3771 // it does not imply that all lanes produce the same value (i.e. this is not
3772 // the usual meaning of uniform)
3773 SetVector<Value *> HasUniformUse;
3774
3775 // Scan the loop for instructions which are either a) known to have only
3776 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3777 for (auto *BB : TheLoop->blocks())
3778 for (auto &I : *BB) {
3779 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3780 switch (II->getIntrinsicID()) {
3781 case Intrinsic::sideeffect:
3782 case Intrinsic::experimental_noalias_scope_decl:
3783 case Intrinsic::assume:
3784 case Intrinsic::lifetime_start:
3785 case Intrinsic::lifetime_end:
3787 AddToWorklistIfAllowed(&I);
3788 break;
3789 default:
3790 break;
3791 }
3792 }
3793
3794 // ExtractValue instructions must be uniform, because the operands are
3795 // known to be loop-invariant.
3796 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3797 assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3798 "Expected aggregate value to be loop invariant");
3799 AddToWorklistIfAllowed(EVI);
3800 continue;
3801 }
3802
3803 // If there's no pointer operand, there's nothing to do.
3805 if (!Ptr)
3806 continue;
3807
3808 if (IsUniformMemOpUse(&I))
3809 AddToWorklistIfAllowed(&I);
3810
3811 if (IsVectorizedMemAccessUse(&I, Ptr))
3812 HasUniformUse.insert(Ptr);
3813 }
3814
3815 // Add to the worklist any operands which have *only* uniform (i.e. lane 0
3816 // demanding) users. Since loops are assumed to be in LCSSA form, this
3817 // disallows uses outside the loop as well.
3818 for (auto *V : HasUniformUse) {
3819 if (IsOutOfScope(V))
3820 continue;
3821 auto *I = cast<Instruction>(V);
3822 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3823 auto *UI = cast<Instruction>(U);
3824 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3825 });
3826 if (UsersAreMemAccesses)
3827 AddToWorklistIfAllowed(I);
3828 }
3829
3830 // Expand Worklist in topological order: whenever a new instruction
3831 // is added, its users should be already inside Worklist. It ensures
3832 // a uniform instruction will only be used by uniform instructions.
  // Index-based iteration: Worklist can grow while it is being walked.
3833 unsigned Idx = 0;
3834 while (Idx != Worklist.size()) {
3835 Instruction *I = Worklist[Idx++];
3836
3837 for (auto *OV : I->operand_values()) {
3838 // isOutOfScope operands cannot be uniform instructions.
3839 if (IsOutOfScope(OV))
3840 continue;
3841 // First order recurrence Phi's should typically be considered
3842 // non-uniform.
3843 auto *OP = dyn_cast<PHINode>(OV);
3845 continue;
3846 // If all the users of the operand are uniform, then add the
3847 // operand into the uniform worklist.
3848 auto *OI = cast<Instruction>(OV);
3849 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3850 auto *J = cast<Instruction>(U);
3851 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3852 }))
3853 AddToWorklistIfAllowed(OI);
3854 }
3855 }
3856
3857 // For an instruction to be added into Worklist above, all its users inside
3858 // the loop should also be in Worklist. However, this condition cannot be
3859 // true for phi nodes that form a cyclic dependence. We must process phi
3860 // nodes separately. An induction variable will remain uniform if all users
3861 // of the induction variable and induction variable update remain uniform.
3862 // The code below handles both pointer and non-pointer induction variables.
3863 BasicBlock *Latch = TheLoop->getLoopLatch();
3864 for (const auto &Induction : Legal->getInductionVars()) {
3865 auto *Ind = Induction.first;
3866 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3867
3868 // Determine if all users of the induction variable are uniform after
3869 // vectorization.
3870 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3871 auto *I = cast<Instruction>(U);
3872 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3873 IsVectorizedMemAccessUse(I, Ind);
3874 });
3875 if (!UniformInd)
3876 continue;
3877
3878 // Determine if all users of the induction variable update instruction are
3879 // uniform after vectorization.
3880 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3881 auto *I = cast<Instruction>(U);
3882 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3883 IsVectorizedMemAccessUse(I, IndUpdate);
3884 });
3885 if (!UniformIndUpdate)
3886 continue;
3887
3888 // The induction variable and its update instruction will remain uniform.
3889 AddToWorklistIfAllowed(Ind);
3890 AddToWorklistIfAllowed(IndUpdate);
3891 }
3892
3893 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3894}
3895
// NOTE(review): the declaration of this function (3896) and the condition
// guarding the first report (3899) are missing from this listing.
//
// From the visible body: reports a vectorization failure and returns true
// when a runtime pointer check, a runtime SCEV predicate check, or a symbolic
// stride check would be needed; returns false when none of them applies.
3897 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3898
3900 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3901 "runtime pointer checks needed. Enable vectorization of this "
3902 "loop with '#pragma clang loop vectorize(enable)' when "
3903 "compiling with -Os/-Oz",
3904 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3905 return true;
3906 }
3907
  // A predicate that is not always true means SCEV assumptions would have to
  // be verified at run time.
3908 if (!PSE.getPredicate().isAlwaysTrue()) {
3909 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3910 "runtime SCEV checks needed. Enable vectorization of this "
3911 "loop with '#pragma clang loop vectorize(enable)' when "
3912 "compiling with -Os/-Oz",
3913 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3914 return true;
3915 }
3916
3917 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3918 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3919 reportVectorizationFailure("Runtime stride check for small trip count",
3920 "runtime stride == 1 checks needed. Enable vectorization of "
3921 "this loop without such check by compiling with -Os/-Oz",
3922 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3923 return true;
3924 }
3925
3926 return false;
3927}
3928
// Returns whether scalable vectorization may be used for this loop. The
// answer is cached in IsScalableVectorizationAllowed: a previously computed
// value is returned immediately, and the cache is pre-set to false so every
// early `return false` below leaves the negative answer recorded.
//
// NOTE(review): the conditions on lines 3934, 3937, 3967 and 3975, and the
// call opening on 3956, are missing from this listing; only the reported
// messages show what each check is about.
3929bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3930 if (IsScalableVectorizationAllowed)
3931 return *IsScalableVectorizationAllowed;
3932
3933 IsScalableVectorizationAllowed = false;
3935 return false;
3936
3938 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3939 "ScalableVectorizationDisabled", ORE, TheLoop);
3940 return false;
3941 }
3942
3943 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3944
  // Largest representable scalable element count, used as the VF to test
  // reductions against below.
3945 auto MaxScalableVF = ElementCount::getScalable(
3946 std::numeric_limits<ElementCount::ScalarTy>::max());
3947
3948 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3949 // FIXME: While for scalable vectors this is currently sufficient, this should
3950 // be replaced by a more detailed mechanism that filters out specific VFs,
3951 // instead of invalidating vectorization for a whole set of VFs based on the
3952 // MaxVF.
3953
3954 // Disable scalable vectorization if the loop contains unsupported reductions.
3955 if (!canVectorizeReductions(MaxScalableVF)) {
3957 "Scalable vectorization not supported for the reduction "
3958 "operations found in this loop.",
3959 "ScalableVFUnfeasible", ORE, TheLoop);
3960 return false;
3961 }
3962
3963 // Disable scalable vectorization if the loop contains any instructions
3964 // with element types not supported for scalable vectors.
3965 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3966 return !Ty->isVoidTy() &&
3968 })) {
3969 reportVectorizationInfo("Scalable vectorization is not supported "
3970 "for all element types found in this loop.",
3971 "ScalableVFUnfeasible", ORE, TheLoop);
3972 return false;
3973 }
3974
3976 reportVectorizationInfo("The target does not provide maximum vscale value "
3977 "for safe distance analysis.",
3978 "ScalableVFUnfeasible", ORE, TheLoop);
3979 return false;
3980 }
3981
3982 IsScalableVectorizationAllowed = true;
3983 return true;
3984}
3985
3987LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  // Returns the largest scalable VF that is legal for MaxSafeElements, or
  // ElementCount::getScalable(0) when scalable vectorization is not allowed.
  //
  // NOTE(review): the return-type line (3986), the condition on 3993 and the
  // call opening on 4001 are missing from this listing.
3988 if (!isScalableVectorizationAllowed())
3989 return ElementCount::getScalable(0);
3990
3991 auto MaxScalableVF = ElementCount::getScalable(
3992 std::numeric_limits<ElementCount::ScalarTy>::max());
3994 return MaxScalableVF;
3995
3996 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
  // NOTE(review): MaxVScale is dereferenced below without a visible check.
  // Presumably the missing condition on 3993 together with the checks in
  // isScalableVectorizationAllowed() guarantees it holds a value -- confirm.
3997 // Limit MaxScalableVF by the maximum safe dependence distance.
3998 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3999
4000 if (!MaxScalableVF)
4002 "Max legal vector width too small, scalable vectorization "
4003 "unfeasible.",
4004 "ScalableVFUnfeasible", ORE, TheLoop);
4005
4006 return MaxScalableVF;
4007}
4008
// Computes the maximum feasible fixed and scalable VFs for the loop.
//
// MaxTripCount and FoldTailByMasking are forwarded to
// getMaximizedVFForTarget. UserVF, when non-zero, is honoured if it does not
// exceed the maximum safe VF of its kind; an unsafe fixed UserVF is clamped
// to the maximum safe fixed VF, and an unsafe scalable UserVF is ignored with
// a remark, after which the target-driven computation takes over.
//
// NOTE(review): lines 4011, 4020, 4024, 4056, 4066, 4072, 4084, 4097 and 4098
// are missing from this listing. That includes the initialiser of
// MaxSafeElements, the `if` whose `else` is visible at 4079, and the
// declaration of Result.
4009FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4010 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4012 unsigned SmallestType, WidestType;
4013 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4014
4015 // Get the maximum safe dependence distance in bits computed by LAA.
4016 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4017 // the memory access that is most restrictive (involved in the smallest
4018 // dependence distance).
4019 unsigned MaxSafeElements =
4021
4022 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4023 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4025 this->MaxSafeElements = MaxSafeElements;
4026
4027 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4028 << ".\n");
4029 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4030 << ".\n");
4031
4032 // First analyze the UserVF, fall back if the UserVF should be ignored.
4033 if (UserVF) {
4034 auto MaxSafeUserVF =
4035 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4036
4037 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4038 // If `VF=vscale x N` is safe, then so is `VF=N`
4039 if (UserVF.isScalable())
4040 return FixedScalableVFPair(
4041 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4042
4043 return UserVF;
4044 }
4045
4046 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4047
4048 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4049 // is better to ignore the hint and let the compiler choose a suitable VF.
4050 if (!UserVF.isScalable()) {
4051 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4052 << " is unsafe, clamping to max safe VF="
4053 << MaxSafeFixedVF << ".\n");
4054 ORE->emit([&]() {
4055 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4057 TheLoop->getHeader())
4058 << "User-specified vectorization factor "
4059 << ore::NV("UserVectorizationFactor", UserVF)
4060 << " is unsafe, clamping to maximum safe vectorization factor "
4061 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4062 });
4063 return MaxSafeFixedVF;
4064 }
4065
4067 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4068 << " is ignored because scalable vectors are not "
4069 "available.\n");
4070 ORE->emit([&]() {
4071 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4073 TheLoop->getHeader())
4074 << "User-specified vectorization factor "
4075 << ore::NV("UserVectorizationFactor", UserVF)
4076 << " is ignored because the target does not support scalable "
4077 "vectors. The compiler will pick a more suitable value.";
4078 });
4079 } else {
4080 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4081 << " is unsafe. Ignoring scalable UserVF.\n");
4082 ORE->emit([&]() {
4083 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4085 TheLoop->getHeader())
4086 << "User-specified vectorization factor "
4087 << ore::NV("UserVectorizationFactor", UserVF)
4088 << " is unsafe. Ignoring the hint to let the compiler pick a "
4089 "more suitable value.";
4090 });
4091 }
4092 }
4093
4094 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4095 << " / " << WidestType << " bits.\n");
4096
  // Fixed VF: taken from the target-maximized value whenever it is non-zero.
4099 if (auto MaxVF =
4100 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4101 MaxSafeFixedVF, FoldTailByMasking))
4102 Result.FixedVF = MaxVF;
4103
  // Scalable VF: only recorded when the maximized value is itself scalable.
4104 if (auto MaxVF =
4105 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4106 MaxSafeScalableVF, FoldTailByMasking))
4107 if (MaxVF.isScalable()) {
4108 Result.ScalableVF = MaxVF;
4109 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4110 << "\n");
4111 }
4112
4113 return Result;
4114}
4115
// NOTE(review): the declaration of this function (4116-4118) is missing from
// this listing, together with every `case` label of the switch on
// ScalarEpilogueStatus and most of the early-return statements. UserVF and
// UserIC are used below but not declared in the visible text (presumably
// parameters -- confirm against the complete source).
//
// From the visible body: the function looks up the constant and maximum trip
// counts, rejects single-iteration loops, and then either computes the
// feasible maximum VFs without tail folding or tries to fold the tail by
// masking. On the tail-folding path it returns MaxFactors early when the trip
// count is provably a multiple of the largest VF (times UserIC), and
// otherwise sets the tail-folding style and returns MaxFactors if folding
// applies. The remaining code handles the cases where folding is not
// possible.
4119 // TODO: It may be useful to do since it's still likely to be dynamically
4120 // uniform if the target can skip.
4122 "Not inserting runtime ptr check for divergent target",
4123 "runtime pointer checks needed. Not enabled for divergent target",
4124 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4126 }
4127
4128 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4129 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4130 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4131 if (TC != MaxTC)
4132 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4133 if (TC == 1) {
4134 reportVectorizationFailure("Single iteration (non) loop",
4135 "loop trip count is one, irrelevant for vectorization",
4136 "SingleIterationLoop", ORE, TheLoop);
4138 }
4139
4140 switch (ScalarEpilogueStatus) {
4142 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4144 [[fallthrough]];
4146 LLVM_DEBUG(
4147 dbgs() << "LV: vector predicate hint/switch found.\n"
4148 << "LV: Not allowing scalar epilogue, creating predicated "
4149 << "vector loop.\n");
4150 break;
4152 // fallthrough as a special case of OptForSize
4154 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4155 LLVM_DEBUG(
4156 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4157 else
4158 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4159 << "count.\n");
4160
4161 // Bail if runtime checks are required, which are not good when optimising
4162 // for size.
4165
4166 break;
4167 }
4168
4169 // The only loops we can vectorize without a scalar epilogue, are loops with
4170 // a bottom-test and a single exiting block. We'd have to handle the fact
4171 // that not every instruction executes on the last iteration. This will
4172 // require a lane mask which varies through the vector loop body. (TODO)
4174 // If there was a tail-folding hint/switch, but we can't fold the tail by
4175 // masking, fallback to a vectorization with a scalar epilogue.
4176 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4177 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4178 "scalar epilogue instead.\n");
4179 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4180 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4181 }
4183 }
4184
4185 // Now try the tail folding
4186
4187 // Invalidate interleave groups that require an epilogue if we can't mask
4188 // the interleave-group.
4190 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4191 "No decisions should have been taken at this point");
4192 // Note: There is no need to invalidate any cost modeling decisions here, as
4193 // none were taken so far.
4195 }
4196
4197 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4198
4199 // Avoid tail folding if the trip count is known to be a multiple of any VF
4200 // we choose.
4201 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4202 MaxFactors.FixedVF.getFixedValue();
4203 if (MaxFactors.ScalableVF) {
4204 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4205 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4206 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4207 *MaxPowerOf2RuntimeVF,
4208 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4209 } else
4210 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4211 }
4212
4213 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4214 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4215 "MaxFixedVF must be a power of 2");
  // A zero UserIC leaves the VF unscaled.
4216 unsigned MaxVFtimesIC =
4217 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4218 ScalarEvolution *SE = PSE.getSE();
4219 // Currently only loops with countable exits are vectorized, but calling
4220 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4221 // uncountable exits whilst also ensuring the symbolic maximum and known
4222 // back-edge taken count remain identical for loops with countable exits.
4223 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4224 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4225 "Invalid loop count");
  // ExitCount is the backedge-taken count plus one; Rem is its remainder
  // modulo MaxVFtimesIC after applying loop guards.
4226 const SCEV *ExitCount = SE->getAddExpr(
4227 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4228 const SCEV *Rem = SE->getURemExpr(
4229 SE->applyLoopGuards(ExitCount, TheLoop),
4230 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4231 if (Rem->isZero()) {
4232 // Accept MaxFixedVF if we do not have a tail.
4233 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4234 return MaxFactors;
4235 }
4236 }
4237
4238 // If we don't know the precise trip count, or if the trip count that we
4239 // found modulo the vectorization factor is not zero, try to fold the tail
4240 // by masking.
4241 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4242 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4243 if (foldTailByMasking()) {
4245 LLVM_DEBUG(
4246 dbgs()
4247 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4248 "try to generate VP Intrinsics with scalable vector "
4249 "factors only.\n");
4250 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4251 // for now.
4252 // TODO: extend it for fixed vectors, if required.
4253 assert(MaxFactors.ScalableVF.isScalable() &&
4254 "Expected scalable vector factor.");
4255
4256 MaxFactors.FixedVF = ElementCount::getFixed(1);
4257 }
4258 return MaxFactors;
4259 }
4260
4261 // If there was a tail-folding hint/switch, but we can't fold the tail by
4262 // masking, fallback to a vectorization with a scalar epilogue.
4263 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4264 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4265 "scalar epilogue instead.\n");
4266 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4267 return MaxFactors;
4268 }
4269
4270 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4271 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4273 }
4274
4275 if (TC == 0) {
4277 "unable to calculate the loop count due to complex control flow",
4278 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4280 }
4281
4283 "Cannot optimize for size and vectorize at the same time.",
4284 "cannot optimize for size and vectorize at the same time. "
4285 "Enable vectorization of this loop with '#pragma clang loop "
4286 "vectorize(enable)' when compiling with -Os/-Oz",
4287 "NoTailLoopWithOptForSize", ORE, TheLoop);
4289}
4290
// Computes the largest vectorization factor (as an ElementCount) to consider,
// starting from the widest register reported by TTI divided by WidestType and
// bounded by MaxSafeVF. The result is scalable iff MaxSafeVF is scalable,
// except for the early returns noted below.
//
// Returns:
//  - a fixed VF of 1 when the computed element count is zero;
//  - a power-of-two VF not exceeding MaxTripCount when the (adjusted) trip
//    count is known and no larger than the minimum lane count;
//  - otherwise MaxVF, possibly enlarged by the bandwidth-maximizing path.
//
// NOTE(review): this listing is missing original lines 4297, 4352, 4354,
// 4358-4359, 4367 and 4398, so some statements below are truncated.
4291ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4292 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4293 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4294 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4295 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4296 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
// NOTE(review): listing line 4297 (remainder of the call above) is absent.
4298
4299 // Convenience function to return the minimum of two ElementCounts.
4300 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4301 assert((LHS.isScalable() == RHS.isScalable()) &&
4302 "Scalable flags must match");
4303 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4304 };
4305
4306 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4307 // Note that both WidestRegister and WidestType may not be powers of 2.
4308 auto MaxVectorElementCount = ElementCount::get(
4309 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4310 ComputeScalableMaxVF);
4311 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4312 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4313 << (MaxVectorElementCount * WidestType) << " bits.\n");
4314
4315 if (!MaxVectorElementCount) {
4316 LLVM_DEBUG(dbgs() << "LV: The target has no "
4317 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4318 << " vector registers.\n");
4319 return ElementCount::getFixed(1);
4320 }
4321
// For a scalable count, scale the known-minimum lane count by the lower bound
// of the function's vscale_range attribute, when that attribute is present.
4322 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4323 if (MaxVectorElementCount.isScalable() &&
4324 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4325 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4326 auto Min = Attr.getVScaleRangeMin();
4327 WidestRegisterMinEC *= Min;
4328 }
4329
4330 // When a scalar epilogue is required, at least one iteration of the scalar
4331 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4332 // max VF that results in a dead vector loop.
4333 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4334 MaxTripCount -= 1;
4335
4336 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4337 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4338 // If upper bound loop trip count (TC) is known at compile time there is no
4339 // point in choosing VF greater than TC (as done in the loop below). Select
4340 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4341 // scalable, we only fall back on a fixed VF when the TC is less than or
4342 // equal to the known number of lanes.
4343 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4344 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4345 "exceeding the constant trip count: "
4346 << ClampedUpperTripCount << "\n");
4347 return ElementCount::get(
4348 ClampedUpperTripCount,
4349 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4350 }
4351
// NOTE(review): listing lines 4352 and 4354 are absent; the fragment below is
// the middle of a statement whose start and end are not visible here.
4353 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4355 ElementCount MaxVF = MaxVectorElementCount;
4356 if (MaximizeBandwidth ||
4357 (MaximizeBandwidth.getNumOccurrences() == 0 &&
// NOTE(review): listing lines 4358-4359 (rest of the condition) are absent.
4360 auto MaxVectorElementCountMaxBW = ElementCount::get(
4361 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4362 ComputeScalableMaxVF);
4363 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4364
4365 // Collect all viable vectorization factors larger than the default MaxVF
4366 // (i.e. MaxVectorElementCount).
// NOTE(review): listing line 4367 (presumably the declaration of VFs) is
// absent.
4368 for (ElementCount VS = MaxVectorElementCount * 2;
4369 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4370 VFs.push_back(VS);
4371
4372 // For each VF calculate its register usage.
4373 auto RUs = calculateRegisterUsage(VFs);
4374
4375 // Select the largest VF which doesn't require more registers than existing
4376 // ones.
4377 for (int I = RUs.size() - 1; I >= 0; --I) {
4378 const auto &MLU = RUs[I].MaxLocalUsers;
4379 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4380 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4381 })) {
4382 MaxVF = VFs[I];
4383 break;
4384 }
4385 }
// Note: this local MinVF (an ElementCount) shadows the MinVF lambda above for
// the duration of the if statement.
4386 if (ElementCount MinVF =
4387 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4388 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4389 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4390 << ") with target's minimum: " << MinVF << '\n');
4391 MaxVF = MinVF;
4392 }
4393 }
4394
4395 // Invalidate any widening decisions we might have made, in case the loop
4396 // requires prediction (decided later), but we have already made some
4397 // load/store widening decisions.
// NOTE(review): listing line 4398 (the statement this comment describes) is
// absent.
4399 }
4400 return MaxVF;
4401}
4402
4403/// Convenience function that returns the value of vscale_range iff
4404/// vscale_range.min == vscale_range.max or otherwise returns the value
4405/// returned by the corresponding TTI method.
///
/// The function containing the header of \p L is inspected for the
/// VScaleRange attribute. A maximum of zero is treated the same as a
/// min/max mismatch, i.e. the TTI value is returned.
///
/// NOTE(review): listing line 4407 (the declarator naming this function and
/// its parameters) is absent; the body reads the names `L` and `TTI`.
4406static std::optional<unsigned>
4408 const Function *Fn = L->getHeader()->getParent();
4409 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4410 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4411 auto Min = Attr.getVScaleRangeMin();
4412 auto Max = Attr.getVScaleRangeMax();
4413 if (Max && Min == Max)
4414 return Max;
4415 }
4416
4417 return TTI.getVScaleForTuning();
4418}
4419
4420/// This function attempts to return a value that represents the vectorization
4421/// factor at runtime. For fixed-width VFs we know this precisely at compile
4422/// time, but for scalable VFs we calculate it based on an estimate of the
4423/// vscale value.
4424static unsigned getEstimatedRuntimeVF(const Loop *L,
4425 const TargetTransformInfo &TTI,
4426 ElementCount VF) {
4427 unsigned EstimatedVF = VF.getKnownMinValue();
4428 if (VF.isScalable())
4429 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4430 EstimatedVF *= *VScale;
4431 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4432 return EstimatedVF;
4433}
4434
// Returns true if vectorization factor A is considered more profitable than
// B. Costs are compared per estimated lane; when MaxTripCount is non-zero the
// comparison instead uses the total cost of executing MaxTripCount iterations
// (see GetCostForTC below). A scalable A competing against a fixed-width B
// wins ties unless the target prefers fixed over scalable at equal cost.
//
// NOTE(review): listing line 4436 (the leading parameters, which the body
// refers to as `A` and `B`) is absent.
4435bool LoopVectorizationPlanner::isMoreProfitable(
4437 const unsigned MaxTripCount) const {
4438 InstructionCost CostA = A.Cost;
4439 InstructionCost CostB = B.Cost;
4440
4441 // Improve estimate for the vector width if it is scalable.
4442 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4443 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4444 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4445 if (A.Width.isScalable())
4446 EstimatedWidthA *= *VScale;
4447 if (B.Width.isScalable())
4448 EstimatedWidthB *= *VScale;
4449 }
4450
4451 // Assume vscale may be larger than 1 (or the value being tuned for),
4452 // so that scalable vectorization is slightly favorable over fixed-width
4453 // vectorization.
4454 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4455 A.Width.isScalable() && !B.Width.isScalable();
4456
// Ties count as "more profitable" only when PreferScalable is set.
4457 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4458 const InstructionCost &RHS) {
4459 return PreferScalable ? LHS <= RHS : LHS < RHS;
4460 };
4461
4462 // To avoid the need for FP division:
4463 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4464 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4465 if (!MaxTripCount)
4466 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4467
4468 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4469 InstructionCost VectorCost,
4470 InstructionCost ScalarCost) {
4471 // If the trip count is a known (possibly small) constant, the trip count
4472 // will be rounded up to an integer number of iterations under
4473 // FoldTailByMasking. The total cost in that case will be
4474 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4475 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4476 // some extra overheads, but for the purpose of comparing the costs of
4477 // different VFs we can use this to compare the total loop-body cost
4478 // expected after vectorization.
4479 if (CM.foldTailByMasking())
4480 return VectorCost * divideCeil(MaxTripCount, VF);
4481 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4482 };
4483
4484 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4485 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4486 return CmpFn(RTCostA, RTCostB);
4487}
4488
4489bool LoopVectorizationPlanner::isMoreProfitable(
4490 const VectorizationFactor &A, const VectorizationFactor &B) const {
4491 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4492 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4493}
4494
// Collects every (recipe, VF) pair whose cost is invalid across all VPlans
// and, if there are any, emits one "InvalidCost" remark per recipe listing
// the VFs at which its cost was invalid.
//
// NOTE(review): the function header (listing lines 4495-4496) is absent from
// this listing, as are lines 4519, 4549-4550 and 4560-4561. The body uses
// `ORE`, presumably a parameter - confirm against the full source.
4497 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4498 SmallVector<RecipeVFPair> InvalidCosts;
4499 for (const auto &Plan : VPlans) {
4500 for (ElementCount VF : Plan->vectorFactors()) {
4501 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4502 CM);
4503 precomputeCosts(*Plan, VF, CostCtx);
4504 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4505 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4506 for (auto &R : *VPBB) {
4507 if (!R.cost(VF, CostCtx).isValid())
4508 InvalidCosts.emplace_back(&R, VF);
4509 }
4510 }
4511 }
4512 }
4513 if (InvalidCosts.empty())
4514 return;
4515
4516 // Emit a report of VFs with invalid costs in the loop.
4517
4518 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
// NOTE(review): listing line 4519 (the declaration of Numbering) is absent.
4520 unsigned I = 0;
4521 for (auto &Pair : InvalidCosts)
4522 if (!Numbering.count(Pair.first))
4523 Numbering[Pair.first] = I++;
4524
4525 // Sort the list, first on recipe(number) then on VF.
4526 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4527 if (Numbering[A.first] != Numbering[B.first])
4528 return Numbering[A.first] < Numbering[B.first];
4529 const auto &LHS = A.second;
4530 const auto &RHS = B.second;
// Fixed-width VFs order before scalable ones; ties break on the known
// minimum element count.
4531 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4532 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4533 });
4534
4535 // For a list of ordered recipe-VF pairs:
4536 // [(load, VF1), (load, VF2), (store, VF1)]
4537 // group the recipes together to emit separate remarks for:
4538 // load (VF1, VF2)
4539 // store (VF1)
4540 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4541 auto Subset = ArrayRef<RecipeVFPair>();
4542 do {
4543 if (Subset.empty())
4544 Subset = Tail.take_front(1);
4545
4546 VPRecipeBase *R = Subset.front().first;
4547
// Map the recipe kind to the IR opcode named in the remark.
// NOTE(review): listing lines 4549-4550 and 4560-4561 (start of this
// expression and the head of one Case) are absent.
4548 unsigned Opcode =
4551 [](const auto *R) { return Instruction::PHI; })
4552 .Case<VPWidenSelectRecipe>(
4553 [](const auto *R) { return Instruction::Select; })
4554 .Case<VPWidenStoreRecipe>(
4555 [](const auto *R) { return Instruction::Store; })
4556 .Case<VPWidenLoadRecipe>(
4557 [](const auto *R) { return Instruction::Load; })
4558 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4559 [](const auto *R) { return Instruction::Call; })
4562 [](const auto *R) { return R->getOpcode(); })
4563 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4564 return R->getStoredValues().empty() ? Instruction::Load
4565 : Instruction::Store;
4566 });
4567
4568 // If the next recipe is different, or if there are no other pairs,
4569 // emit a remark for the collated subset. e.g.
4570 // [(load, VF1), (load, VF2))]
4571 // to emit:
4572 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4573 if (Subset == Tail || Tail[Subset.size()].first != R) {
4574 std::string OutString;
4575 raw_string_ostream OS(OutString);
4576 assert(!Subset.empty() && "Unexpected empty range");
4577 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4578 for (const auto &Pair : Subset)
4579 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4580 OS << "):";
4581 if (Opcode == Instruction::Call) {
4582 StringRef Name = "";
4583 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4584 Name = Int->getIntrinsicName();
4585 } else {
// When R is not a VPWidenCallRecipe, the callee is read from the recipe's
// last operand.
4586 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4587 Function *CalledFn =
4588 WidenCall ? WidenCall->getCalledScalarFunction()
4589 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4590 ->getLiveInIRValue());
4591 Name = CalledFn->getName();
4592 }
4593 OS << " call to " << Name;
4594 } else
4595 OS << " " << Instruction::getOpcodeName(Opcode);
4596 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4597 R->getDebugLoc());
4598 Tail = Tail.drop_front(Subset.size());
4599 Subset = {};
4600 } else
4601 // Grow the subset by one element
4602 Subset = Tail.take_front(Subset.size() + 1);
4603 } while (!Tail.empty());
4604}
4605
4606/// Check if any recipe of \p Plan will generate a vector value, which will be
4607/// assigned a vector register.
///
/// Ephemeral recipes are skipped, and each scalar type is checked at most
/// once. Returns true as soon as one type is found that widens at the given
/// VF according to TTI.getNumberOfParts, and false otherwise.
///
/// NOTE(review): listing lines 4608 (start of the declarator; the body reads
/// `Plan` and `VF`) and 4617 (the range iterated by the outer loop) are
/// absent from this listing.
4609 const TargetTransformInfo &TTI) {
4610 assert(VF.isVector() && "Checking a scalar VF?");
4611 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4612 DenseSet<VPRecipeBase *> EphemeralRecipes;
4613 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4614 // Set of already visited types.
4615 DenseSet<Type *> Visited;
4616 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4618 for (VPRecipeBase &R : *VPBB) {
4619 if (EphemeralRecipes.contains(&R))
4620 continue;
4621 // Continue early if the recipe is considered to not produce a vector
4622 // result. Note that this includes VPInstruction where some opcodes may
4623 // produce a vector, to preserve existing behavior as VPInstructions model
4624 // aspects not directly mapped to existing IR instructions.
4625 switch (R.getVPDefID()) {
4626 case VPDef::VPDerivedIVSC:
4627 case VPDef::VPScalarIVStepsSC:
4628 case VPDef::VPScalarCastSC:
4629 case VPDef::VPReplicateSC:
4630 case VPDef::VPInstructionSC:
4631 case VPDef::VPCanonicalIVPHISC:
4632 case VPDef::VPVectorPointerSC:
4633 case VPDef::VPReverseVectorPointerSC:
4634 case VPDef::VPExpandSCEVSC:
4635 case VPDef::VPEVLBasedIVPHISC:
4636 case VPDef::VPPredInstPHISC:
4637 case VPDef::VPBranchOnMaskSC:
4638 continue;
4639 case VPDef::VPReductionSC:
4640 case VPDef::VPActiveLaneMaskPHISC:
4641 case VPDef::VPWidenCallSC:
4642 case VPDef::VPWidenCanonicalIVSC:
4643 case VPDef::VPWidenCastSC:
4644 case VPDef::VPWidenGEPSC:
4645 case VPDef::VPWidenIntrinsicSC:
4646 case VPDef::VPWidenSC:
4647 case VPDef::VPWidenSelectSC:
4648 case VPDef::VPBlendSC:
4649 case VPDef::VPFirstOrderRecurrencePHISC:
4650 case VPDef::VPWidenPHISC:
4651 case VPDef::VPWidenIntOrFpInductionSC:
4652 case VPDef::VPWidenPointerInductionSC:
4653 case VPDef::VPReductionPHISC:
4654 case VPDef::VPInterleaveSC:
4655 case VPDef::VPWidenLoadEVLSC:
4656 case VPDef::VPWidenLoadSC:
4657 case VPDef::VPWidenStoreEVLSC:
4658 case VPDef::VPWidenStoreSC:
4659 break;
4660 default:
4661 llvm_unreachable("unhandled recipe");
4662 }
4663
// Decides whether ScalarTy widened to VF counts as a real vector: zero legal
// parts never does; scalable VFs accept up to VF parts, fixed VFs need
// strictly fewer parts than lanes.
4664 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4665 Type *VectorTy = toVectorTy(ScalarTy, VF);
4666 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4667 if (!NumLegalParts)
4668 return false;
4669 if (VF.isScalable()) {
4670 // <vscale x 1 x iN> is assumed to be profitable over iN because
4671 // scalable registers are a distinct register class from scalar
4672 // ones. If we ever find a target which wants to lower scalable
4673 // vectors back to scalars, we'll need to update this code to
4674 // explicitly ask TTI about the register class uses for each part.
4675 return NumLegalParts <= VF.getKnownMinValue();
4676 }
4677 // Two or more parts that share a register - are vectorized.
4678 return NumLegalParts < VF.getKnownMinValue();
4679 };
4680
4681 // If no def nor is a store, e.g., branches, continue - no value to check.
4682 if (R.getNumDefinedValues() == 0 &&
4683 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4684 &R))
4685 continue;
4686 // For multi-def recipes, currently only interleaved loads, suffice to
4687 // check first def only.
4688 // For stores check their stored value; for interleaved stores suffice
4689 // the check first stored value only. In all cases this is the second
4690 // operand.
4691 VPValue *ToCheck =
4692 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4693 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4694 if (!Visited.insert({ScalarTy}).second)
4695 continue;
4696 if (WillWiden(ScalarTy))
4697 return true;
4698 }
4699 }
4700
4701 return false;
4702}
4703
// Selects the vectorization factor among all VFs of all VPlans, starting from
// the scalar factor (VF=1) and replacing it whenever a candidate is more
// profitable according to isMoreProfitable. The definition is only compiled
// when NDEBUG is not defined.
//
// NOTE(review): listing lines 4706 (definition of ExpectedCost), 4734
// (definition of C) and 4759-4760 (the condition guarding the
// "ConditionalStore" report) are absent from this listing.
4704#ifndef NDEBUG
4705VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4707 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4708 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4709 assert(any_of(VPlans,
4710 [](std::unique_ptr<VPlan> &P) {
4711 return P->hasVF(ElementCount::getFixed(1));
4712 }) &&
4713 "Expected Scalar VF to be a candidate");
4714
4715 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4716 ExpectedCost);
4717 VectorizationFactor ChosenFactor = ScalarCost;
4718
4719 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4720 if (ForceVectorization &&
4721 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4722 // Ignore scalar width, because the user explicitly wants vectorization.
4723 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4724 // evaluation.
4725 ChosenFactor.Cost = InstructionCost::getMax();
4726 }
4727
4728 for (auto &P : VPlans) {
4729 for (ElementCount VF : P->vectorFactors()) {
4730 // The cost for scalar VF=1 is already calculated, so ignore it.
4731 if (VF.isScalar())
4732 continue;
4733
4735 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4736
4737 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4738 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4739 << " costs: " << (Candidate.Cost / Width));
4740 if (VF.isScalable())
4741 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4742 << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4743 << ")");
4744 LLVM_DEBUG(dbgs() << ".\n");
4745
// Unless vectorization is forced, skip VFs that produce no vector values.
4746 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4747 LLVM_DEBUG(
4748 dbgs()
4749 << "LV: Not considering vector loop of width " << VF
4750 << " because it will not generate any vector instructions.\n");
4751 continue;
4752 }
4753
4754 if (isMoreProfitable(Candidate, ChosenFactor))
4755 ChosenFactor = Candidate;
4756 }
4757 }
4758
// The lines below are the tail of a guarded block that reports conditional
// stores and falls back to the scalar factor; its opening is not visible.
4761 "There are conditional stores.",
4762 "store that is conditionally executed prevents vectorization",
4763 "ConditionalStore", ORE, OrigLoop);
4764 ChosenFactor = ScalarCost;
4765 }
4766
4767 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4768 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4769 << "LV: Vectorization seems to be not beneficial, "
4770 << "but was forced by a user.\n");
4771 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4772 return ChosenFactor;
4773}
4774#endif
4775
4776bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4777 ElementCount VF) const {
4778 // Cross iteration phis such as reductions need special handling and are
4779 // currently unsupported.
4780 if (any_of(OrigLoop->getHeader()->phis(),
4781 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4782 return false;
4783
4784 // Phis with uses outside of the loop require special handling and are
4785 // currently unsupported.
4786 for (const auto &Entry : Legal->getInductionVars()) {
4787 // Look for uses of the value of the induction at the last iteration.
4788 Value *PostInc =
4789 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4790 for (User *U : PostInc->users())
4791 if (!OrigLoop->contains(cast<Instruction>(U)))
4792 return false;
4793 // Look for uses of penultimate value of the induction.
4794 for (User *U : Entry.first->users())
4795 if (!OrigLoop->contains(cast<Instruction>(U)))
4796 return false;
4797 }
4798
4799 // Epilogue vectorization code has not been auditted to ensure it handles
4800 // non-latch exits properly. It may be fine, but it needs auditted and
4801 // tested.
4802 // TODO: Add support for loops with an early exit.
4803 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4804 return false;
4805
4806 return true;
4807}
4808
// Heuristic deciding whether vectorizing the epilogue is worthwhile for a
// main loop with factor VF and interleave count IC. Returns false when the
// target's maximum interleave factor for VF is at most 1; otherwise compares
// the estimated runtime VF (multiplied by IC for fixed-width VFs only)
// against MinVFThreshold.
//
// NOTE(review): listing lines 4809 (start of the declarator), 4817 (the
// condition for the first early return) and 4831-4832 (the remainder of the
// MinVFThreshold initializer) are absent from this listing.
4810 const ElementCount VF, const unsigned IC) const {
4811 // FIXME: We need a much better cost-model to take different parameters such
4812 // as register pressure, code size increase and cost of extra branches into
4813 // account. For now we apply a very crude heuristic and only consider loops
4814 // with vectorization factors larger than a certain value.
4815
4816 // Allow the target to opt out entirely.
4818 return false;
4819
4820 // We also consider epilogue vectorization unprofitable for targets that don't
4821 // consider interleaving beneficial (eg. MVE).
4822 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4823 return false;
4824
4825 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4826 // VFs when deciding profitability.
4827 // See related "TODO: extend to support scalable VFs." in
4828 // selectEpilogueVectorizationFactor.
4829 unsigned Multiplier = VF.isFixed() ? IC : 1;
4830 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4833 return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4834}
4835
// Chooses a vectorization factor for the epilogue loop given the main loop's
// factor MainLoopVF and interleave count IC. Returns Result unchanged from
// its initial value whenever epilogue vectorization is disabled, not
// allowed, unsupported, skipped for size or judged unprofitable; otherwise
// returns the most profitable entry of ProfitableVFs that passes the
// filters below.
//
// NOTE(review): listing lines 4836 (start of the declarator), 4838-4839
// (presumably the declaration of Result and the "disabled" condition), 4858
// and 4860 (forced-factor condition and ForcedEC), 4912 (start of the TC
// definition) and 4928 (the predicate passed to isKnownPredicate) are absent
// from this listing.
4837 const ElementCount MainLoopVF, unsigned IC) {
4840 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4841 return Result;
4842 }
4843
4844 if (!CM.isScalarEpilogueAllowed()) {
4845 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4846 "epilogue is allowed.\n");
4847 return Result;
4848 }
4849
4850 // Not really a cost consideration, but check for unsupported cases here to
4851 // simplify the logic.
4852 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4853 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4854 "is not a supported candidate.\n");
4855 return Result;
4856 }
4857
4859 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4861 if (hasPlanWithVF(ForcedEC))
4862 return {ForcedEC, 0, 0};
4863
4864 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4865 "viable.\n");
4866 return Result;
4867 }
4868
4869 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4870 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4871 LLVM_DEBUG(
4872 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4873 return Result;
4874 }
4875
4876 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4877 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4878 "this loop\n");
4879 return Result;
4880 }
4881
4882 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4883 // the main loop handles 8 lanes per iteration. We could still benefit from
4884 // vectorizing the epilogue loop with VF=4.
4885 ElementCount EstimatedRuntimeVF =
4886 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4887
4888 ScalarEvolution &SE = *PSE.getSE();
4889 Type *TCType = Legal->getWidestInductionType();
// RemainingIterations and MaxTripCount are computed lazily, on the first
// fixed-width candidate that reaches the check below.
4890 const SCEV *RemainingIterations = nullptr;
4891 unsigned MaxTripCount = 0;
4892 for (auto &NextVF : ProfitableVFs) {
4893 // Skip candidate VFs without a corresponding VPlan.
4894 if (!hasPlanWithVF(NextVF.Width))
4895 continue;
4896
4897 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4898 // vectors) or > the VF of the main loop (fixed vectors).
4899 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4900 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4901 (NextVF.Width.isScalable() &&
4902 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4903 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4904 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4905 continue;
4906
4907 // If NextVF is greater than the number of remaining iterations, the
4908 // epilogue loop would be dead. Skip such factors.
4909 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4910 // TODO: extend to support scalable VFs.
4911 if (!RemainingIterations) {
4913 getPlanFor(NextVF.Width).getTripCount(), SE);
4914 assert(!isa<SCEVCouldNotCompute>(TC) &&
4915 "Trip count SCEV must be computable");
// Remaining iterations = TC urem (MainLoopVF * IC); MaxTripCount starts at
// MainLoopVF * IC - 1 and is tightened when SCEV can prove a smaller bound.
4916 RemainingIterations = SE.getURemExpr(
4917 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4918 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4919 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4920 SE.getConstant(TCType, MaxTripCount))) {
4921 MaxTripCount =
4922 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4923 }
4924 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4925 << MaxTripCount << "\n");
4926 }
4927 if (SE.isKnownPredicate(
4929 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4930 RemainingIterations))
4931 continue;
4932 }
4933
4934 if (Result.Width.isScalar() ||
4935 isMoreProfitable(NextVF, Result, MaxTripCount))
4936 Result = NextVF;
4937 }
4938
4939 if (Result != VectorizationFactor::Disabled())
4940 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4941 << Result.Width << "\n");
4942 return Result;
4943}
4944
// Returns the pair {MinWidth, MaxWidth}: the smallest and widest scalar type
// sizes, in bits, among the types in ElementTypesInLoop. MinWidth starts at
// -1U and MaxWidth at 8. When ElementTypesInLoop is empty but the loop has
// reductions, only MaxWidth is updated (from the reduction descriptors) and
// MinWidth keeps its initial -1U.
//
// NOTE(review): listing lines 4946 (the declarator), 4949 (presumably the
// definition of DL) and 4963-4964 (the inner operands of the std::min call)
// are absent from this listing.
4945std::pair<unsigned, unsigned>
4947 unsigned MinWidth = -1U;
4948 unsigned MaxWidth = 8;
4950 // For in-loop reductions, no element types are added to ElementTypesInLoop
4951 // if there are no loads/stores in the loop. In this case, check through the
4952 // reduction variables to determine the maximum width.
4953 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4954 // Reset MaxWidth so that we can find the smallest type used by recurrences
4955 // in the loop.
4956 MaxWidth = -1U;
4957 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4958 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4959 // When finding the min width used by the recurrence we need to account
4960 // for casts on the input operands of the recurrence.
4961 MaxWidth = std::min<unsigned>(
4962 MaxWidth, std::min<unsigned>(
4965 }
4966 } else {
4967 for (Type *T : ElementTypesInLoop) {
4968 MinWidth = std::min<unsigned>(
4969 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4970 MaxWidth = std::max<unsigned>(
4971 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4972 }
4973 }
4974 return {MinWidth, MaxWidth};
4975}
4976
// Rebuilds ElementTypesInLoop from scratch: for every non-debug instruction
// in TheLoop that is not in ValuesToIgnore, records the type of each load,
// the stored value type of each store, and the recurrence type of each
// reduction phi. Phis that are not reduction variables are skipped.
//
// NOTE(review): listing lines 4977 (the declarator), 5000-5001 and 5003
// (most of the condition that skips some reductions) are absent from this
// listing.
4978 ElementTypesInLoop.clear();
4979 // For each block.
4980 for (BasicBlock *BB : TheLoop->blocks()) {
4981 // For each instruction in the loop.
4982 for (Instruction &I : BB->instructionsWithoutDebug()) {
4983 Type *T = I.getType();
4984
4985 // Skip ignored values.
4986 if (ValuesToIgnore.count(&I))
4987 continue;
4988
4989 // Only examine Loads, Stores and PHINodes.
4990 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4991 continue;
4992
4993 // Examine PHI nodes that are reduction variables. Update the type to
4994 // account for the recurrence type.
4995 if (auto *PN = dyn_cast<PHINode>(&I)) {
4996 if (!Legal->isReductionVariable(PN))
4997 continue;
4998 const RecurrenceDescriptor &RdxDesc =
4999 Legal->getReductionVars().find(PN)->second;
5002 RdxDesc.getRecurrenceType(),
5004 continue;
5005 T = RdxDesc.getRecurrenceType();
5006 }
5007
5008 // Examine the stored values.
5009 if (auto *ST = dyn_cast<StoreInst>(&I))
5010 T = ST->getValueOperand()->getType();
5011
5012 assert(T->isSized() &&
5013 "Expected the load/store/recurrence type to be sized");
5014
5015 ElementTypesInLoop.insert(T);
5016 }
5017 }
5018}
5019
5020unsigned
5022 InstructionCost LoopCost) {
5023 // -- The interleave heuristics --
5024 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5025 // There are many micro-architectural considerations that we can't predict
5026 // at this level. For example, frontend pressure (on decode or fetch) due to
5027 // code size, or the number and capabilities of the execution ports.
5028 //
5029 // We use the following heuristics to select the interleave count:
5030 // 1. If the code has reductions, then we interleave to break the cross
5031 // iteration dependency.
5032 // 2. If the loop is really small, then we interleave to reduce the loop
5033 // overhead.
5034 // 3. We don't interleave if we think that we will spill registers to memory
5035 // due to the increased register pressure.
5036
5038 return 1;
5039
5040 // Do not interleave if EVL is preferred and no User IC is specified.
5041 if (foldTailWithEVL()) {
5042 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5043 "Unroll factor forced to be 1.\n");
5044 return 1;
5045 }
5046
5047 // We used the distance for the interleave count.
5049 return 1;
5050
5051 // We don't attempt to perform interleaving for loops with uncountable early
5052 // exits because the VPInstruction::AnyOf code cannot currently handle
5053 // multiple parts.
5055 return 1;
5056
5057 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
5058 const bool HasReductions = !Legal->getReductionVars().empty();
5059
5060 // If we did not calculate the cost for VF (because the user selected the VF)
5061 // then we calculate the cost of VF here.
5062 if (LoopCost == 0) {
5063 LoopCost = expectedCost(VF);
5064 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5065
5066 // Loop body is free and there is no need for interleaving.
5067 if (LoopCost == 0)
5068 return 1;
5069 }
5070
5072 // We divide by these constants so assume that we have at least one
5073 // instruction that uses at least one register.
5074 for (auto &Pair : R.MaxLocalUsers) {
5075 Pair.second = std::max(Pair.second, 1U);
5076 }
5077
5078 // We calculate the interleave count using the following formula.
5079 // Subtract the number of loop invariants from the number of available
5080 // registers. These registers are used by all of the interleaved instances.
5081 // Next, divide the remaining registers by the number of registers that is
5082 // required by the loop, in order to estimate how many parallel instances
5083 // fit without causing spills. All of this is rounded down if necessary to be
5084 // a power of two. We want power of two interleave count to simplify any
5085 // addressing operations or alignment considerations.
5086 // We also want power of two interleave counts to ensure that the induction
5087 // variable of the vector loop wraps to zero, when tail is folded by masking;
5088 // this currently happens when OptForSize, in which case IC is set to 1 above.
5089 unsigned IC = UINT_MAX;
5090
5091 for (const auto &Pair : R.MaxLocalUsers) {
5092 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5093 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5094 << " registers of "
5095 << TTI.getRegisterClassName(Pair.first)
5096 << " register class\n");
5097 if (VF.isScalar()) {
5098 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5099 TargetNumRegisters = ForceTargetNumScalarRegs;
5100 } else {
5101 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5102 TargetNumRegisters = ForceTargetNumVectorRegs;
5103 }
5104 unsigned MaxLocalUsers = Pair.second;
5105 unsigned LoopInvariantRegs = 0;
5106 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5107 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5108
5109 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5110 MaxLocalUsers);
5111 // Don't count the induction variable as interleaved.
5113 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5114 std::max(1U, (MaxLocalUsers - 1)));
5115 }
5116
5117 IC = std::min(IC, TmpIC);
5118 }
5119
5120 // Clamp the interleave ranges to reasonable counts.
5121 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5122
5123 // Check if the user has overridden the max.
5124 if (VF.isScalar()) {
5125 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5126 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5127 } else {
5128 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5129 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5130 }
5131
5132 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5133 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5134 if (KnownTC > 0) {
5135 // At least one iteration must be scalar when this constraint holds. So the
5136 // maximum available iterations for interleaving is one less.
5137 unsigned AvailableTC =
5138 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5139
5140 // If trip count is known we select between two prospective ICs, where
5141 // 1) the aggressive IC is capped by the trip count divided by VF
5142 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5143 // The final IC is selected in a way that the epilogue loop trip count is
5144 // minimized while maximizing the IC itself, so that we either run the
5145 // vector loop at least once if it generates a small epilogue loop, or else
5146 // we run the vector loop at least twice.
5147
5148 unsigned InterleaveCountUB = bit_floor(
5149 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5150 unsigned InterleaveCountLB = bit_floor(std::max(
5151 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5152 MaxInterleaveCount = InterleaveCountLB;
5153
5154 if (InterleaveCountUB != InterleaveCountLB) {
5155 unsigned TailTripCountUB =
5156 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5157 unsigned TailTripCountLB =
5158 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5159 // If both produce same scalar tail, maximize the IC to do the same work
5160 // in fewer vector loop iterations
5161 if (TailTripCountUB == TailTripCountLB)
5162 MaxInterleaveCount = InterleaveCountUB;
5163 }
5164 } else if (BestKnownTC && *BestKnownTC > 0) {
5165 // At least one iteration must be scalar when this constraint holds. So the
5166 // maximum available iterations for interleaving is one less.
5167 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5168 ? (*BestKnownTC) - 1
5169 : *BestKnownTC;
5170
5171 // If trip count is an estimated compile time constant, limit the
5172 // IC to be capped by the trip count divided by VF * 2, such that the vector
5173 // loop runs at least twice to make interleaving seem profitable when there
5174 // is an epilogue loop present. Since exact Trip count is not known we
5175 // choose to be conservative in our IC estimate.
5176 MaxInterleaveCount = bit_floor(std::max(
5177 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5178 }
5179
5180 assert(MaxInterleaveCount > 0 &&
5181 "Maximum interleave count must be greater than 0");
5182
5183 // Clamp the calculated IC to be between 1 and the max interleave count
5184 // that the target and trip count allow.
5185 if (IC > MaxInterleaveCount)
5186 IC = MaxInterleaveCount;
5187 else
5188 // Make sure IC is greater than 0.
5189 IC = std::max(1u, IC);
5190
5191 assert(IC > 0 && "Interleave count must be greater than 0.");
5192
5193 // Interleave if we vectorized this loop and there is a reduction that could
5194 // benefit from interleaving.
5195 if (VF.isVector() && HasReductions) {
5196 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5197 return IC;
5198 }
5199
5200 // For any scalar loop that either requires runtime checks or predication we
5201 // are better off leaving this to the unroller. Note that if we've already
5202 // vectorized the loop we will have done the runtime check and so interleaving
5203 // won't require further checks.
5204 bool ScalarInterleavingRequiresPredication =
5205 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5206 return Legal->blockNeedsPredication(BB);
5207 }));
5208 bool ScalarInterleavingRequiresRuntimePointerCheck =
5210
5211 // We want to interleave small loops in order to reduce the loop overhead and
5212 // potentially expose ILP opportunities.
5213 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5214 << "LV: IC is " << IC << '\n'
5215 << "LV: VF is " << VF << '\n');
5216 const bool AggressivelyInterleaveReductions =
5217 TTI.enableAggressiveInterleaving(HasReductions);
5218 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5219 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5220 // We assume that the cost overhead is 1 and we use the cost model
5221 // to estimate the cost of the loop and interleave until the cost of the
5222 // loop overhead is about 5% of the cost of the loop.
5223 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5224 SmallLoopCost / *LoopCost.getValue()));
5225
5226 // Interleave until store/load ports (estimated by max interleave count) are
5227 // saturated.
5228 unsigned NumStores = Legal->getNumStores();
5229 unsigned NumLoads = Legal->getNumLoads();
5230 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5231 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5232
5233 // There is little point in interleaving for reductions containing selects
5234 // and compares when VF=1 since it may just create more overhead than it's
5235 // worth for loops with small trip counts. This is because we still have to
5236 // do the final reduction after the loop.
5237 bool HasSelectCmpReductions =
5238 HasReductions &&
5239 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5240 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5241 RecurKind RK = RdxDesc.getRecurrenceKind();
5242 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5243 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5244 });
5245 if (HasSelectCmpReductions) {
5246 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5247 return 1;
5248 }
5249
5250 // If we have a scalar reduction (vector reductions are already dealt with
5251 // by this point), we can increase the critical path length if the loop
5252 // we're interleaving is inside another loop. For tree-wise reductions
5253 // set the limit to 2, and for ordered reductions it's best to disable
5254 // interleaving entirely.
5255 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5256 bool HasOrderedReductions =
5257 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5258 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5259 return RdxDesc.isOrdered();
5260 });
5261 if (HasOrderedReductions) {
5262 LLVM_DEBUG(
5263 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5264 return 1;
5265 }
5266
5267 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5268 SmallIC = std::min(SmallIC, F);
5269 StoresIC = std::min(StoresIC, F);
5270 LoadsIC = std::min(LoadsIC, F);
5271 }
5272
5274 std::max(StoresIC, LoadsIC) > SmallIC) {
5275 LLVM_DEBUG(
5276 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5277 return std::max(StoresIC, LoadsIC);
5278 }
5279
5280 // If there are scalar reductions and TTI has enabled aggressive
5281 // interleaving for reductions, we will interleave to expose ILP.
5282 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5283 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5284 // Interleave no less than SmallIC but not as aggressive as the normal IC
5285 // to satisfy the rare situation when resources are too limited.
5286 return std::max(IC / 2, SmallIC);
5287 }
5288
5289 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5290 return SmallIC;
5291 }
5292
5293 // Interleave if this is a large loop (small loops are already dealt with by
5294 // this point) that could benefit from interleaving.
5295 if (AggressivelyInterleaveReductions) {
5296 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5297 return IC;
5298 }
5299
5300 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5301 return 1;
5302}
5303
// Estimates the register pressure of the loop. For every element of VFs it
// records, per register class, the maximum number of registers needed by
// simultaneously live in-loop values (MaxLocalUsers) and the registers needed
// by values defined outside the loop but used inside it (LoopInvariantRegs),
// and returns the collected results in RUs.
// NOTE(review): the signature line and the declarations of DFS, IntervalMap,
// IdxToInstr, Ends, TransposeEnds, RUs, MaxUsages, RegUsage and Invariant are
// not visible in this listing (the embedded numbering skips those lines);
// presumably this is LoopVectorizationCostModel::calculateRegisterUsage --
// confirm against the complete file.
5306 // This function calculates the register usage by measuring the highest number
5307 // of values that are alive at a single location. Obviously, this is a very
5308 // rough estimation. We scan the loop in topological order and
5309 // assign a number to each instruction. We use RPO to ensure that defs are
5310 // met before their users. We assume that each instruction that has in-loop
5311 // users starts an interval. We record every time that an in-loop value is
5312 // used, so we have a list of the first and last occurrences of each
5313 // instruction. Next, we transpose this data structure into a multi map that
5314 // holds the list of intervals that *end* at a specific location. This multi
5315 // map allows us to perform a linear search. We scan the instructions linearly
5316 // and record each time that a new interval starts, by placing it in a set.
5317 // If we find this value in the multi-map then we remove it from the set.
5318 // The max register usage is the maximum size of the set.
5319 // We also search for instructions that are defined outside the loop, but are
5320 // used inside the loop. We need this number separately from the max-interval
5321 // usage number because when we unroll, loop-invariant values do not take
5322 // more registers.
5324 DFS.perform(LI);
5325
5326 RegisterUsage RU;
5327
5328 // Each 'key' in the map opens a new interval. The values
5329 // of the map are the index of the 'last seen' usage of the
5330 // instruction that is the key.
5332
5333 // Maps instruction to its index.
5335 // Marks the end of each interval.
5336 IntervalMap EndPoint;
5337 // Saves the list of instruction indices that are used in the loop.
5339 // Saves the list of values that are used in the loop but are defined outside
5340 // the loop (not including non-instruction values such as arguments and
5341 // constants).
5342 SmallSetVector<Instruction *, 8> LoopInvariants;
5343
5344 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5345 for (Instruction &I : BB->instructionsWithoutDebug()) {
5346 IdxToInstr.push_back(&I);
5347
5348 // Save the end location of each USE.
5349 for (Value *U : I.operands()) {
5350 auto *Instr = dyn_cast<Instruction>(U);
5351
5352 // Ignore non-instruction values such as arguments, constants, etc.
5353 // FIXME: Might need some motivation why these values are ignored. If
5354 // for example an argument is used inside the loop it will increase the
5355 // register pressure (so shouldn't we add it to LoopInvariants).
5356 if (!Instr)
5357 continue;
5358
5359 // If this instruction is outside the loop then record it and continue.
5360 if (!TheLoop->contains(Instr)) {
5361 LoopInvariants.insert(Instr);
5362 continue;
5363 }
5364
5365 // Overwrite previous end points.
// I has already been appended to IdxToInstr, so the stored end point is the
// index of this user plus one. The scan below therefore still counts the
// value as live while its last user is processed and closes the interval at
// the following index.
5366 EndPoint[Instr] = IdxToInstr.size();
5367 Ends.insert(Instr);
5368 }
5369 }
5370 }
5371
5372 // Saves the list of intervals that end with the index in 'key'.
5373 using InstrList = SmallVector<Instruction *, 2>;
5375
5376 // Transpose the EndPoints to a list of values that end at each index.
5377 for (auto &Interval : EndPoint)
5378 TransposeEnds[Interval.second].push_back(Interval.first);
5379
5380 SmallPtrSet<Instruction *, 8> OpenIntervals;
5383
5384 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5385
// GetRegUsage yields 0 for token types, for types that are not valid vector
// element types and, when VF is scalable, for element types the target
// reports as not legal for scalable vectors. Otherwise it asks TTI for the
// register usage of the vector type built from Ty and VF.
5386 const auto &TTICapture = TTI;
5387 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5388 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5389 (VF.isScalable() &&
5390 !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5391 return 0;
5392 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5393 };
5394
5395 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5396 Instruction *I = IdxToInstr[Idx];
5397
5398 // Remove all of the instructions that end at this location.
5399 InstrList &List = TransposeEnds[Idx];
5400 for (Instruction *ToRemove : List)
5401 OpenIntervals.erase(ToRemove);
5402
5403 // Ignore instructions that are never used within the loop.
5404 if (!Ends.count(I))
5405 continue;
5406
5407 // Skip ignored values.
5408 if (ValuesToIgnore.count(I))
5409 continue;
5410
5412
5413 // For each VF find the maximum usage of registers.
// Usage is measured before I itself is added to OpenIntervals (the insert
// happens at the end of this loop body), so I is not counted at its own index.
5414 for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5415 // Count the number of registers used, per register class, given all open
5416 // intervals.
5417 // Note that elements in this SmallMapVector will be default constructed
5418 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5419 // there is no previous entry for ClassID.
5421
5422 if (VFs[J].isScalar()) {
5423 for (auto *Inst : OpenIntervals) {
5424 unsigned ClassID =
5425 TTI.getRegisterClassForType(false, Inst->getType());
5426 // FIXME: The target might use more than one register for the type
5427 // even in the scalar case.
5428 RegUsage[ClassID] += 1;
5429 }
5430 } else {
5432 for (auto *Inst : OpenIntervals) {
5433 // Skip ignored values for VF > 1.
5434 if (VecValuesToIgnore.count(Inst))
5435 continue;
5436 if (isScalarAfterVectorization(Inst, VFs[J])) {
5437 unsigned ClassID =
5438 TTI.getRegisterClassForType(false, Inst->getType());
5439 // FIXME: The target might use more than one register for the type
5440 // even in the scalar case.
5441 RegUsage[ClassID] += 1;
5442 } else {
5443 unsigned ClassID =
5444 TTI.getRegisterClassForType(true, Inst->getType());
5445 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5446 }
5447 }
5448 }
5449
// Keep the per-class maximum seen so far for this VF.
5450 for (const auto &Pair : RegUsage) {
5451 auto &Entry = MaxUsages[J][Pair.first];
5452 Entry = std::max(Entry, Pair.second);
5453 }
5454 }
5455
5456 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5457 << OpenIntervals.size() << '\n');
5458
5459 // Add the current instruction to the list of open intervals.
5460 OpenIntervals.insert(I);
5461 }
5462
// Second pass: account for the loop-invariant values once per VF and package
// the results.
5463 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5464 // Note that elements in this SmallMapVector will be default constructed
5465 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5466 // there is no previous entry for ClassID.
5468
5469 for (auto *Inst : LoopInvariants) {
5470 // FIXME: The target might use more than one register for the type
5471 // even in the scalar case.
// The invariant is costed as a scalar when every user either lives in a block
// whose LI->getLoopFor() result is not TheLoop or is scalar after
// vectorization for this VF; otherwise it is costed at the full VF.
5472 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5473 auto *I = cast<Instruction>(U);
5474 return TheLoop != LI->getLoopFor(I->getParent()) ||
5475 isScalarAfterVectorization(I, VFs[Idx]);
5476 });
5477
5478 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5479 unsigned ClassID =
5480 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5481 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5482 }
5483
5484 LLVM_DEBUG({
5485 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5486 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5487 << " item\n";
5488 for (const auto &pair : MaxUsages[Idx]) {
5489 dbgs() << "LV(REG): RegisterClass: "
5490 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5491 << " registers\n";
5492 }
5493 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5494 << " item\n";
5495 for (const auto &pair : Invariant) {
5496 dbgs() << "LV(REG): RegisterClass: "
5497 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5498 << " registers\n";
5499 }
5500 });
5501
5502 RU.LoopInvariantRegs = Invariant;
5503 RU.MaxLocalUsers = MaxUsages[Idx];
5504 RUs[Idx] = RU;
5505 }
5506
5507 return RUs;
5508}
5509
/// Returns true when the cost of the emulated masked memory access \p I should
/// be replaced by an artificially high value: always for loads, and for stores
/// once NumPredStores exceeds NumberOfStoresToPredicate.
/// NOTE(review): the first line of the assertion (embedded line 5520) is not
/// visible in this listing; only its message string is.
5510bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5511 ElementCount VF) {
5512 // TODO: Cost model for emulated masked load/store is completely
5513 // broken. This hack guides the cost model to use an artificially
5514 // high enough value to practically disable vectorization with such
5515 // operations, except where previously deployed legality hack allowed
5516 // using very low cost values. This is to avoid regressions coming simply
5517 // from moving "masked load/store" check from legality to cost model.
5518 // Masked Load/Gather emulation was previously never allowed.
5519 // Limited number of Masked Store/Scatter emulation was allowed.
5521 "Expecting a scalar emulated instruction");
// Loads always take the hacked cost; stores only after the predicated-store
// budget (NumberOfStoresToPredicate) has been exceeded.
5522 return isa<LoadInst>(I) ||
5523 (isa<StoreInst>(I) &&
5524 NumPredStores > NumberOfStoresToPredicate);
5525}
5526
// For vector factor VF, finds the instructions that are scalar with
// predication, asks computePredInstDiscount whether scalarizing them (and the
// chains feeding them) is no worse than vectorizing, records the resulting
// scalar costs in InstsToScalarize[VF], and records in
// PredicatedBBsAfterVectorization[VF] the blocks that remain after
// vectorization.
// NOTE(review): the signature line (embedded 5527) and the condition guarding
// the first `continue` of the block loop (embedded 5546) are not visible in
// this listing; presumably this is
// LoopVectorizationCostModel::collectInstsToScalarize -- confirm against the
// complete file.
5528 // If we aren't vectorizing the loop, or if we've already collected the
5529 // instructions to scalarize, there's nothing to do. Collection may already
5530 // have occurred if we have a user-selected VF and are now computing the
5531 // expected cost for interleaving.
5532 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5533 return;
5534
5535 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5536 // not profitable to scalarize any instructions, the presence of VF in the
5537 // map will indicate that we've analyzed it already.
5538 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5539
5540 PredicatedBBsAfterVectorization[VF].clear();
5541
5542 // Find all the instructions that are scalar with predication in the loop and
5543 // determine if it would be better to not if-convert the blocks they are in.
5544 // If so, we also record the instructions to scalarize.
5545 for (BasicBlock *BB : TheLoop->blocks()) {
5547 continue;
5548 for (Instruction &I : *BB)
5549 if (isScalarWithPredication(&I, VF)) {
5550 ScalarCostsTy ScalarCosts;
5551 // Do not apply discount logic for:
5552 // 1. Scalars after vectorization, as there will only be a single copy
5553 // of the instruction.
5554 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5555 // 3. Emulated masked memrefs, if a hacked cost is needed.
5556 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5557 !useEmulatedMaskMemRefHack(&I, VF) &&
5558 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5559 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5560 // Check if we decided to scalarize a call. If so, update the widening
5561 // decision of the call to CM_Scalarize with the computed scalar cost.
5562 for (const auto &[I, _] : ScalarCosts) {
5563 auto *CI = dyn_cast<CallInst>(I);
5564 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5565 continue;
5566 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5567 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5568 }
5569 }
5570 // Remember that BB will remain after vectorization.
// This happens for every predicated-scalar instruction, whether or not the
// discount check above succeeded. Predecessors whose only successor is BB are
// recorded as well.
5571 PredicatedBBsAfterVectorization[VF].insert(BB);
5572 for (auto *Pred : predecessors(BB)) {
5573 if (Pred->getSingleSuccessor() == BB)
5574 PredicatedBBsAfterVectorization[VF].insert(Pred);
5575 }
5576 }
5577 }
5578}
5579
/// Computes the cost discount obtained by scalarizing \p PredInst together
/// with the single-use instruction chain feeding it, for vector factor \p VF.
///
/// Every instruction visited is entered into \p ScalarCosts with its
/// (probability-scaled) scalar cost. The returned discount is the sum over the
/// visited instructions of vector cost minus scalar cost; a non-negative value
/// means scalarizing is at least as cheap as vectorizing.
///
/// NOTE(review): several lines are not visible in this listing: the
/// declaration of Worklist (embedded 5592), the last operand of the first
/// condition in CanBeScalarized (5602), the initializer of ScalarCost (5649)
/// and the line at 5653 (presumably where CostKind is introduced). Confirm
/// them against the complete file.
5580InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5581 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5582 assert(!isUniformAfterVectorization(PredInst, VF) &&
5583 "Instruction marked uniform-after-vectorization will be predicated");
5584
5585 // Initialize the discount to zero, meaning that the scalar version and the
5586 // vector version cost the same.
5587 InstructionCost Discount = 0;
5588
5589 // Holds instructions to analyze. The instructions we visit are mapped in
5590 // ScalarCosts. Those instructions are the ones that would be scalarized if
5591 // we find that the scalar version costs less.
5593
5594 // Returns true if the given instruction can be scalarized.
5595 auto CanBeScalarized = [&](Instruction *I) -> bool {
5596 // We only attempt to scalarize instructions forming a single-use chain
5597 // from the original predicated block that would otherwise be vectorized.
5598 // Although not strictly necessary, we give up on instructions we know will
5599 // already be scalar to avoid traversing chains that are unlikely to be
5600 // beneficial.
5601 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5603 return false;
5604
5605 // If the instruction is scalar with predication, it will be analyzed
5606 // separately. We ignore it within the context of PredInst.
5607 if (isScalarWithPredication(I, VF))
5608 return false;
5609
5610 // If any of the instruction's operands are uniform after vectorization,
5611 // the instruction cannot be scalarized. This prevents, for example, a
5612 // masked load from being scalarized.
5613 //
5614 // We assume we will only emit a value for lane zero of an instruction
5615 // marked uniform after vectorization, rather than VF identical values.
5616 // Thus, if we scalarize an instruction that uses a uniform, we would
5617 // create uses of values corresponding to the lanes we aren't emitting code
5618 // for. This behavior can be changed by allowing getScalarValue to clone
5619 // the lane zero values for uniforms rather than asserting.
5620 for (Use &U : I->operands())
5621 if (auto *J = dyn_cast<Instruction>(U.get()))
5622 if (isUniformAfterVectorization(J, VF))
5623 return false;
5624
5625 // Otherwise, we can scalarize the instruction.
5626 return true;
5627 };
5628
5629 // Compute the expected cost discount from scalarizing the entire expression
5630 // feeding the predicated instruction. We currently only consider expressions
5631 // that are single-use instruction chains.
5632 Worklist.push_back(PredInst);
5633 while (!Worklist.empty()) {
5634 Instruction *I = Worklist.pop_back_val();
5635
5636 // If we've already analyzed the instruction, there's nothing to do.
5637 if (ScalarCosts.contains(I))
5638 continue;
5639
5640 // Compute the cost of the vector instruction. Note that this cost already
5641 // includes the scalarization overhead of the predicated instruction.
5642 InstructionCost VectorCost = getInstructionCost(I, VF);
5643
5644 // Compute the cost of the scalarized instruction. This cost is the cost of
5645 // the instruction as if it wasn't if-converted and instead remained in the
5646 // predicated block. We will scale this cost by block probability after
5647 // computing the scalarization overhead.
5648 InstructionCost ScalarCost =
5650
5651 // Compute the scalarization overhead of needed insertelement instructions
5652 // and phi nodes.
// Only a predicated-scalar instruction that produces a value needs the
// inserts, plus one PHI per lane (VF.getFixedValue() of them).
5654 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5655 ScalarCost += TTI.getScalarizationOverhead(
5656 cast<VectorType>(toVectorTy(I->getType(), VF)),
5657 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5658 /*Extract*/ false, CostKind);
5659 ScalarCost +=
5660 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5661 }
5662
5663 // Compute the scalarization overhead of needed extractelement
5664 // instructions. For each of the instruction's operands, if the operand can
5665 // be scalarized, add it to the worklist; otherwise, account for the
5666 // overhead.
5667 for (Use &U : I->operands())
5668 if (auto *J = dyn_cast<Instruction>(U.get())) {
5669 assert(VectorType::isValidElementType(J->getType()) &&
5670 "Instruction has non-scalar type");
5671 if (CanBeScalarized(J))
5672 Worklist.push_back(J);
5673 else if (needsExtract(J, VF)) {
5674 ScalarCost += TTI.getScalarizationOverhead(
5675 cast<VectorType>(toVectorTy(J->getType(), VF)),
5676 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5677 /*Extract*/ true, CostKind);
5678 }
5679 }
5680
5681 // Scale the total scalar cost by block probability.
5682 ScalarCost /= getReciprocalPredBlockProb();
5683
5684 // Compute the discount. A non-negative discount means the vector version
5685 // of the instruction costs more, and scalarizing would be beneficial.
5686 Discount += VectorCost - ScalarCost;
5687 ScalarCosts[I] = ScalarCost;
5688 }
5689
5690 return Discount;
5691}
5692
5695
// Sums the cost of every non-ignored instruction of TheLoop for the given VF,
// block by block, and returns the total in Cost. For a scalar VF the cost of
// blocks that need predication is divided by getReciprocalPredBlockProb().
// NOTE(review): the signature and the declaration of Cost (embedded lines
// 5693-5694), the definition of TC (5700), the start of the call that fills
// ValuesToIgnoreForVF (5702), the definition of C (5716) and the statement
// guarded by the cost-override check (5720) are not visible in this listing;
// presumably this is LoopVectorizationCostModel::expectedCost -- confirm
// against the complete file.
5696 // If the vector loop gets executed exactly once with the given VF, ignore the
5697 // costs of comparison and induction instructions, as they'll get simplified
5698 // away.
5699 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5701 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5703 ValuesToIgnoreForVF);
5704
5705 // For each block.
5706 for (BasicBlock *BB : TheLoop->blocks()) {
5707 InstructionCost BlockCost;
5708
5709 // For each instruction in the old loop.
5710 for (Instruction &I : BB->instructionsWithoutDebug()) {
5711 // Skip ignored values.
// VecValuesToIgnore is consulted only when VF is a vector factor.
5712 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5713 (VF.isVector() && VecValuesToIgnore.count(&I)))
5714 continue;
5715
5717
5718 // Check if we should override the cost.
5719 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5721
5722 BlockCost += C;
5723 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5724 << VF << " For instruction: " << I << '\n');
5725 }
5726
5727 // If we are vectorizing a predicated block, it will have been
5728 // if-converted. This means that the block's instructions (aside from
5729 // stores and instructions that may divide by zero) will now be
5730 // unconditionally executed. For the scalar case, we may not always execute
5731 // the predicated block, if it is an if-else block. Thus, scale the block's
5732 // cost by the probability of executing it. blockNeedsPredication from
5733 // Legal is used so as to not include all blocks in tail folded loops.
5734 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5735 BlockCost /= getReciprocalPredBlockProb();
5736
5737 Cost += BlockCost;
5738 }
5739
5740 return Cost;
5741}
5742
5743/// Gets Address Access SCEV after verifying that the access pattern
5744/// is loop invariant except the induction variable dependence.
5745///
5746/// This SCEV can be sent to the Target in order to estimate the address
5747/// calculation cost.
///
/// Returns nullptr when \p Ptr is not a GetElementPtrInst, or when any of its
/// operands from index 1 onwards is neither loop invariant in \p TheLoop nor
/// an induction variable according to Legal; otherwise returns
/// PSE.getSCEV(Ptr).
///
/// NOTE(review): the first line of the signature (embedded 5748) and the
/// parameter lines at 5750-5751 are not visible in this listing; the body
/// uses Legal and PSE, which are presumably those parameters.
5749 Value *Ptr,
5752 const Loop *TheLoop) {
5753
5754 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5755 if (!Gep)
5756 return nullptr;
5757
5758 // We are looking for a gep with all loop invariant indices except for one
5759 // which should be an induction variable.
// The scan starts at operand 1, so operand 0 of the GEP is not checked.
5760 auto *SE = PSE.getSE();
5761 unsigned NumOperands = Gep->getNumOperands();
5762 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5763 Value *Opd = Gep->getOperand(Idx);
5764 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5765 !Legal->isInductionVariable(Opd))
5766 return nullptr;
5767 }
5768
5769 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5770 return PSE.getSCEV(Ptr);
5771}
5772
/// Estimates the cost of scalarizing the load/store \p I for vector factor
/// \p VF. The visible terms are: the address computation and the scalar
/// memory operation, each multiplied by VF.getKnownMinValue(), plus the
/// insert/extract overhead from getScalarizationOverhead. For predicated
/// instructions a branch cost is added and, when useEmulatedMaskMemRefHack
/// returns true, the result is overridden with the constant 3000000.
///
/// NOTE(review): several lines are not visible in this listing: the return
/// type (embedded 5773), the statement executed for scalable VFs (5779), the
/// definition of Ptr (5785), the declaration of Cost (5795), the definition
/// of CostKind (5800), the statement at 5814 and the start of the i1-extract
/// cost computation (5818-5819). Confirm them against the complete file.
5774LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5775 ElementCount VF) {
5776 assert(VF.isVector() &&
5777 "Scalarization cost of instruction implies vectorization.");
5778 if (VF.isScalable())
5780
5781 Type *ValTy = getLoadStoreType(I);
5782 auto *SE = PSE.getSE();
5783
5784 unsigned AS = getLoadStoreAddressSpace(I);
5786 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5787 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5788 // that it is being called from this specific place.
5789
5790 // Figure out whether the access is strided and get the stride value
5791 // if it's known at compile time.
5792 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5793
5794 // Get the cost of the scalar memory instruction and address computation.
5796 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5797
5798 // Don't pass *I here, since it is scalar but will actually be part of a
5799 // vectorized loop where the user of it is a vectorized instruction.
5801 const Align Alignment = getLoadStoreAlignment(I);
5802 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5803 ValTy->getScalarType(),
5804 Alignment, AS, CostKind);
5805
5806 // Get the overhead of the extractelement and insertelement instructions
5807 // we might create due to scalarization.
5808 Cost += getScalarizationOverhead(I, VF, CostKind);
5809
5810 // If we have a predicated load/store, it will need extra i1 extracts and
5811 // conditional branches, but may not be executed for each vector lane. Scale
5812 // the cost by the probability of executing the predicated block.
5813 if (isPredicatedInst(I)) {
5815
5816 // Add the cost of an i1 extract and a branch
5817 auto *VecI1Ty =
5820 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5821 /*Insert=*/false, /*Extract=*/true, CostKind);
5822 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5823
5824 if (useEmulatedMaskMemRefHack(I, VF))
5825 // Artificially setting to a high enough value to practically disable
5826 // vectorization with such operations.
// This assignment replaces everything accumulated in Cost so far.
5827 Cost = 3000000;
5828 }
5829
5830 return Cost;
5831}
5832
/// Estimates the cost of widening the consecutive load/store \p I (stride 1
/// or -1, as asserted) into a vector memory operation of type
/// <VF x ValTy>. The masked memory-op cost is used when Legal reports that a
/// mask is required, otherwise the plain memory-op cost. An additional cost
/// is added when the stride is negative.
///
/// NOTE(review): the return type (embedded 5833), the definition of Ptr
/// (5838), the line at 5841, the declaration of Cost (5846) and the start of
/// the reverse-access cost call (5858) are not visible in this listing;
/// presumably that last call models a reverse shuffle -- confirm against the
/// complete file.
5834LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5835 ElementCount VF) {
5836 Type *ValTy = getLoadStoreType(I);
5837 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5839 unsigned AS = getLoadStoreAddressSpace(I);
5840 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5842
5843 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5844 "Stride should be 1 or -1 for consecutive memory access");
5845 const Align Alignment = getLoadStoreAlignment(I);
5847 if (Legal->isMaskRequired(I)) {
5848 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5849 CostKind);
5850 } else {
// Operand 0 of I is described to TTI through OpInfo for the unmasked case.
5851 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5852 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5853 CostKind, OpInfo, I);
5854 }
5855
// A stride of -1 is treated as a reversed access.
5856 bool Reverse = ConsecutiveStride < 0;
5857 if (Reverse)
5859 CostKind, 0);
5860 return Cost;
5861}
5862
/// Estimates the cost of the uniform memory operation \p I (asserted via
/// Legal->isUniformMemOp) for vector factor \p VF.
///
/// For a load the result is the scalar address computation plus a scalar load
/// plus one further term. For a store it is the scalar address computation
/// plus a scalar store plus, unless the stored value is loop invariant, the
/// cost of extracting the element at index VF.getKnownMinValue() - 1.
///
/// NOTE(review): the return type (embedded 5863), the definition of CostKind
/// (5872) and the final term of the load cost (5877) are not visible in this
/// listing; presumably the latter is a broadcast shuffle cost -- confirm
/// against the complete file.
5864LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5865 ElementCount VF) {
5866 assert(Legal->isUniformMemOp(*I, VF));
5867
5868 Type *ValTy = getLoadStoreType(I);
5869 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5870 const Align Alignment = getLoadStoreAlignment(I);
5871 unsigned AS = getLoadStoreAddressSpace(I);
5873 if (isa<LoadInst>(I)) {
5874 return TTI.getAddressComputationCost(ValTy) +
5875 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5876 CostKind) +
5878 }
5879 StoreInst *SI = cast<StoreInst>(I);
5880
// A loop-invariant stored value needs no extract from a vector.
5881 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5882 return TTI.getAddressComputationCost(ValTy) +
5883 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5884 CostKind) +
5885 (IsLoopInvariantStoreValue
5886 ? 0
5887 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5888 CostKind, VF.getKnownMinValue() - 1));
5889}
5890
/// Estimates the cost of implementing the load/store \p I for vector factor
/// \p VF as the sum of an address computation on the vector type
/// <VF x ValTy> and a second TTI query that receives the opcode, the vector
/// type, the pointer, whether Legal requires a mask, and the alignment.
///
/// NOTE(review): the return type (embedded 5891), the definition of Ptr
/// (5897), the name of the second TTI call (5900) and its trailing arguments
/// (5901-5902) are not visible in this listing; presumably the call is the
/// target's gather/scatter cost hook -- confirm against the complete file.
5892LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5893 ElementCount VF) {
5894 Type *ValTy = getLoadStoreType(I);
5895 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5896 const Align Alignment = getLoadStoreAlignment(I);
5898
5899 return TTI.getAddressComputationCost(VectorTy) +
5901 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5903}
5904
/// Estimates the cost of the whole interleaved access group that \p I belongs
/// to, for vector factor \p VF. Type and address space are taken from the
/// group's insert position, and the wide vector type has
/// VF * Group->getFactor() elements. For reversed groups an extra per-member
/// cost is added.
///
/// NOTE(review): the return type (embedded 5905), the line at 5915
/// (presumably the definition of CostKind), the declaration of Indices (5921),
/// the start of the group cost call (5930), the start of the assertion (5937)
/// and the per-member reverse cost call (5940) are not visible in this
/// listing. Confirm them against the complete file.
5906LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5907 ElementCount VF) {
5908 const auto *Group = getInterleavedAccessGroup(I);
5909 assert(Group && "Fail to get an interleaved access group.");
5910
5911 Instruction *InsertPos = Group->getInsertPos();
5912 Type *ValTy = getLoadStoreType(InsertPos);
5913 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5914 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5916
5917 unsigned InterleaveFactor = Group->getFactor();
5918 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5919
5920 // Holds the indices of existing members in the interleaved group.
5922 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5923 if (Group->getMember(IF))
5924 Indices.push_back(IF);
5925
5926 // Calculate the cost of the whole interleaved group.
// Gaps are masked when the group requires a scalar epilogue that is not
// allowed, or when I is a store and the group has fewer members than its
// factor.
5927 bool UseMaskForGaps =
5928 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5929 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5931 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5932 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5933 UseMaskForGaps);
5934
5935 if (Group->isReverse()) {
5936 // TODO: Add support for reversed masked interleaved access.
5938 "Reverse masked interleaved access not supported.");
5939 Cost += Group->getNumMembers() *
5941 CostKind, 0);
5942 }
5943 return Cost;
5944}
5945
5946std::optional<InstructionCost>
5948 Instruction *I, ElementCount VF, Type *Ty,
5950 using namespace llvm::PatternMatch;
5951 // Early exit for no inloop reductions
5952 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5953 return std::nullopt;
5954 auto *VectorTy = cast<VectorType>(Ty);
5955
5956 // We are looking for a pattern of, and finding the minimal acceptable cost:
5957 // reduce(mul(ext(A), ext(B))) or
5958 // reduce(mul(A, B)) or
5959 // reduce(ext(A)) or
5960 // reduce(A).
5961 // The basic idea is that we walk down the tree to do that, finding the root
5962 // reduction instruction in InLoopReductionImmediateChains. From there we find
5963 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5964 // of the components. If the reduction cost is lower then we return it for the
5965 // reduction instruction and 0 for the other instructions in the pattern. If
5966 // it is not we return an invalid cost specifying the original cost method
5967 // should be used.
5968 Instruction *RetI = I;
5969 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5970 if (!RetI->hasOneUser())
5971 return std::nullopt;
5972 RetI = RetI->user_back();
5973 }
5974
5975 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5976 RetI->user_back()->getOpcode() == Instruction::Add) {
5977 RetI = RetI->user_back();
5978 }
5979
5980 // Test if the found instruction is a reduction, and if not return an invalid
5981 // cost specifying the parent to use the original cost modelling.
5982 if (!InLoopReductionImmediateChains.count(RetI))
5983 return std::nullopt;
5984
5985 // Find the reduction this chain is a part of and calculate the basic cost of
5986 // the reduction on its own.
5987 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5988 Instruction *ReductionPhi = LastChain;
5989 while (!isa<PHINode>(ReductionPhi))
5990 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5991
5992 const RecurrenceDescriptor &RdxDesc =
5993 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5994
5995 InstructionCost BaseCost;
5996 RecurKind RK = RdxDesc.getRecurrenceKind();
5999 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
6000 RdxDesc.getFastMathFlags(), CostKind);
6001 } else {
6003 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6004 }
6005
6006 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6007 // normal fmul instruction to the cost of the fadd reduction.
6008 if (RK == RecurKind::FMulAdd)
6009 BaseCost +=
6010 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6011
6012 // If we're using ordered reductions then we can just return the base cost
6013 // here, since getArithmeticReductionCost calculates the full ordered
6014 // reduction cost when FP reassociation is not allowed.
6015 if (useOrderedReductions(RdxDesc))
6016 return BaseCost;
6017
6018 // Get the operand that was not the reduction chain and match it to one of the
6019 // patterns, returning the better cost if it is found.
6020 Instruction *RedOp = RetI->getOperand(1) == LastChain
6021 ? dyn_cast<Instruction>(RetI->getOperand(0))
6022 : dyn_cast<Instruction>(RetI->getOperand(1));
6023
6024 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6025
6026 Instruction *Op0, *Op1;
6027 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6028 match(RedOp,
6030 match(Op0, m_ZExtOrSExt(m_Value())) &&
6031 Op0->getOpcode() == Op1->getOpcode() &&
6032 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6034 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6035
6036 // Matched reduce.add(ext(mul(ext(A), ext(B))))
6037 // Note that the extend opcodes need to all match, or if A==B they will have
6038 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6039 // which is equally fine.
6040 bool IsUnsigned = isa<ZExtInst>(Op0);
6041 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6042 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6043
6044 InstructionCost ExtCost =
6045 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6047 InstructionCost MulCost =
6048 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6049 InstructionCost Ext2Cost =
6050 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6052
6054 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6055
6056 if (RedCost.isValid() &&
6057 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6058 return I == RetI ? RedCost : 0;
6059 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6060 !TheLoop->isLoopInvariant(RedOp)) {
6061 // Matched reduce(ext(A))
6062 bool IsUnsigned = isa<ZExtInst>(RedOp);
6063 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6065 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6066 RdxDesc.getFastMathFlags(), CostKind);
6067
6068 InstructionCost ExtCost =
6069 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6071 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6072 return I == RetI ? RedCost : 0;
6073 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6074 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6075 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6076 Op0->getOpcode() == Op1->getOpcode() &&
6078 bool IsUnsigned = isa<ZExtInst>(Op0);
6079 Type *Op0Ty = Op0->getOperand(0)->getType();
6080 Type *Op1Ty = Op1->getOperand(0)->getType();
6081 Type *LargestOpTy =
6082 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6083 : Op0Ty;
6084 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6085
6086 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6087 // different sizes. We take the largest type as the ext to reduce, and add
6088 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6090 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6093 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6095 InstructionCost MulCost =
6096 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6097
6099 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6100 InstructionCost ExtraExtCost = 0;
6101 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6102 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6103 ExtraExtCost = TTI.getCastInstrCost(
6104 ExtraExtOp->getOpcode(), ExtType,
6105 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6107 }
6108
6109 if (RedCost.isValid() &&
6110 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6111 return I == RetI ? RedCost : 0;
6112 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6113 // Matched reduce.add(mul())
6114 InstructionCost MulCost =
6115 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6116
6118 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6119
6120 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6121 return I == RetI ? RedCost : 0;
6122 }
6123 }
6124
6125 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6126}
6127
// Returns the cost of the load/store instruction I for vectorization factor VF.
// For a scalar VF the cost is computed directly from TTI as the address
// computation cost plus the memory operation cost of the scalar access. For
// any other VF the value of getWideningCost(I, VF) is returned.
6129LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6130 ElementCount VF) {
6131 // Calculate scalar cost only. Vectorization cost should be ready at this
6132 // moment.
6133 if (VF.isScalar()) {
6134 Type *ValTy = getLoadStoreType(I);
6135 const Align Alignment = getLoadStoreAlignment(I);
6136 unsigned AS = getLoadStoreAddressSpace(I);
6137
// Operand info of operand 0 is forwarded to getMemoryOpCost together with
// the instruction itself; the cost kind is fixed to reciprocal throughput.
6138 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6139 return TTI.getAddressComputationCost(ValTy) +
6140 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6141 TTI::TCK_RecipThroughput, OpInfo, I);
6142 }
// Vector VF: use the cost returned by getWideningCost for (I, VF).
6143 return getWideningCost(I, VF);
6144}
6145
// Estimates the packing/unpacking overhead of scalarizing instruction I at
// vectorization factor VF: inserting the scalar results into a vector and
// handling the operands selected by filterExtractingOperands. A scalar VF has
// no overhead (0 is returned).
// NOTE(review): several statements of this function (including the parameter
// list and the final return) are not visible here — confirm how the operand
// cost is combined with Cost.
6146InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6148
6149 // There is no mechanism yet to create a scalable scalarization loop,
6150 // so this is currently Invalid.
6151 if (VF.isScalable())
6153
// Nothing to pack or unpack when the VF is scalar.
6154 if (VF.isScalar())
6155 return 0;
6156
// Result side: all VF lanes are inserted (Insert = true, Extract = false).
// This is skipped when the result type is void, or when I is a load and the
// target reports efficient vector element load/store support.
6158 Type *RetTy = toVectorTy(I->getType(), VF);
6159 if (!RetTy->isVoidTy() &&
6160 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6162 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6163 /*Insert*/ true,
6164 /*Extract*/ false, CostKind);
6165
6166 // Some targets keep addresses scalar.
6167 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6168 return Cost;
6169
6170 // Some targets support efficient element stores.
6171 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6172 return Cost;
6173
6174 // Collect operands to consider.
// For a call only the call arguments are considered; for any other
// instruction all operands are.
6175 CallInst *CI = dyn_cast<CallInst>(I);
6176 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6177
6178 // Skip operands that do not require extraction/scalarization and do not incur
6179 // any overhead.
6181 for (auto *V : filterExtractingOperands(Ops, VF))
6182 Tys.push_back(maybeVectorizeType(V->getType(), VF));
6184 filterExtractingOperands(Ops, VF), Tys, CostKind);
6185}
6186
6188 if (VF.isScalar())
6189 return;
6190 NumPredStores = 0;
6191 for (BasicBlock *BB : TheLoop->blocks()) {
6192 // For each instruction in the old loop.
6193 for (Instruction &I : *BB) {
6195 if (!Ptr)
6196 continue;
6197
6198 // TODO: We should generate better code and update the cost model for
6199 // predicated uniform stores. Today they are treated as any other
6200 // predicated store (see added test cases in
6201 // invariant-store-vectorization.ll).
6202 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6203 NumPredStores++;
6204
6205 if (Legal->isUniformMemOp(I, VF)) {
6206 auto IsLegalToScalarize = [&]() {
6207 if (!VF.isScalable())
6208 // Scalarization of fixed length vectors "just works".
6209 return true;
6210
6211 // We have dedicated lowering for unpredicated uniform loads and
6212 // stores. Note that even with tail folding we know that at least
6213 // one lane is active (i.e. generalized predication is not possible
6214 // here), and the logic below depends on this fact.
6215 if (!foldTailByMasking())
6216 return true;
6217
6218 // For scalable vectors, a uniform memop load is always
6219 // uniform-by-parts and we know how to scalarize that.
6220 if (isa<LoadInst>(I))
6221 return true;
6222
6223 // A uniform store isn't necessarily uniform-by-parts
6224 // and we can't assume scalarization.
6225 auto &SI = cast<StoreInst>(I);
6226 return TheLoop->isLoopInvariant(SI.getValueOperand());
6227 };
6228
6229 const InstructionCost GatherScatterCost =
6231 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6232
6233 // Load: Scalar load + broadcast
6234 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6235 // FIXME: This cost is a significant under-estimate for tail folded
6236 // memory ops.
6237 const InstructionCost ScalarizationCost =
6238 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6240
6241 // Choose the better solution for the current VF. Note that Invalid
6242 // costs compare as maximally large. If both are invalid, we get
6243 // scalable invalid which signals a failure and a vectorization abort.
6244 if (GatherScatterCost < ScalarizationCost)
6245 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6246 else
6247 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6248 continue;
6249 }
6250
6251 // We assume that widening is the best solution when possible.
6252 if (memoryInstructionCanBeWidened(&I, VF)) {
6253 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6254 int ConsecutiveStride = Legal->isConsecutivePtr(
6256 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6257 "Expected consecutive stride.");
6258 InstWidening Decision =
6259 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6260 setWideningDecision(&I, VF, Decision, Cost);
6261 continue;
6262 }
6263
6264 // Choose between Interleaving, Gather/Scatter or Scalarization.
6266 unsigned NumAccesses = 1;
6267 if (isAccessInterleaved(&I)) {
6268 const auto *Group = getInterleavedAccessGroup(&I);
6269 assert(Group && "Fail to get an interleaved access group.");
6270
6271 // Make one decision for the whole group.
6272 if (getWideningDecision(&I, VF) != CM_Unknown)
6273 continue;
6274
6275 NumAccesses = Group->getNumMembers();
6277 InterleaveCost = getInterleaveGroupCost(&I, VF);
6278 }
6279
6280 InstructionCost GatherScatterCost =
6282 ? getGatherScatterCost(&I, VF) * NumAccesses
6284
6285 InstructionCost ScalarizationCost =
6286 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6287
6288 // Choose better solution for the current VF,
6289 // write down this decision and use it during vectorization.
6291 InstWidening Decision;
6292 if (InterleaveCost <= GatherScatterCost &&
6293 InterleaveCost < ScalarizationCost) {
6294 Decision = CM_Interleave;
6295 Cost = InterleaveCost;
6296 } else if (GatherScatterCost < ScalarizationCost) {
6297 Decision = CM_GatherScatter;
6298 Cost = GatherScatterCost;
6299 } else {
6300 Decision = CM_Scalarize;
6301 Cost = ScalarizationCost;
6302 }
6303 // If the instruction belongs to an interleave group, the whole group
6304 // receives the same decision. The whole group receives the cost, but
6305 // the cost will actually be assigned to one instruction.
6306 if (const auto *Group = getInterleavedAccessGroup(&I))
6307 setWideningDecision(Group, VF, Decision, Cost);
6308 else
6309 setWideningDecision(&I, VF, Decision, Cost);
6310 }
6311 }
6312
6313 // Make sure that any load of address and any other address computation
6314 // remains scalar unless there is gather/scatter support. This avoids
6315 // inevitable extracts into address registers, and also has the benefit of
6316 // activating LSR more, since that pass can't optimize vectorized
6317 // addresses.
6319 return;
6320
6321 // Start with all scalar pointer uses.
6323 for (BasicBlock *BB : TheLoop->blocks())
6324 for (Instruction &I : *BB) {
6325 Instruction *PtrDef =
6326 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6327 if (PtrDef && TheLoop->contains(PtrDef) &&
6329 AddrDefs.insert(PtrDef);
6330 }
6331
6332 // Add all instructions used to generate the addresses.
6334 append_range(Worklist, AddrDefs);
6335 while (!Worklist.empty()) {
6336 Instruction *I = Worklist.pop_back_val();
6337 for (auto &Op : I->operands())
6338 if (auto *InstOp = dyn_cast<Instruction>(Op))
6339 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6340 AddrDefs.insert(InstOp).second)
6341 Worklist.push_back(InstOp);
6342 }
6343
6344 for (auto *I : AddrDefs) {
6345 if (isa<LoadInst>(I)) {
6346 // Setting the desired widening decision should ideally be handled
6347 // by cost functions, but since this involves the task of finding out
6348 // if the loaded register is involved in an address computation, it is
6349 // instead changed here when we know this is the case.
6350 InstWidening Decision = getWideningDecision(I, VF);
6351 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6352 // Scalarize a widened load of address.
6354 I, VF, CM_Scalarize,
6355 (VF.getKnownMinValue() *
6356 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6357 else if (const auto *Group = getInterleavedAccessGroup(I)) {
6358 // Scalarize an interleave group of address loads.
6359 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6360 if (Instruction *Member = Group->getMember(I))
6362 Member, VF, CM_Scalarize,
6363 (VF.getKnownMinValue() *
6364 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6365 }
6366 }
6367 } else
6368 // Make sure I gets scalarized and a cost estimate without
6369 // scalarization overhead.
6370 ForcedScalars[VF].insert(I);
6371 }
6372}
6373
6375 assert(!VF.isScalar() &&
6376 "Trying to set a vectorization decision for a scalar VF");
6377
6378 auto ForcedScalar = ForcedScalars.find(VF);
6379 for (BasicBlock *BB : TheLoop->blocks()) {
6380 // For each instruction in the old loop.
6381 for (Instruction &I : *BB) {
6382 CallInst *CI = dyn_cast<CallInst>(&I);
6383
6384 if (!CI)
6385 continue;
6386
6391 Function *ScalarFunc = CI->getCalledFunction();
6392 Type *ScalarRetTy = CI->getType();
6393 SmallVector<Type *, 4> Tys, ScalarTys;
6394 for (auto &ArgOp : CI->args())
6395 ScalarTys.push_back(ArgOp->getType());
6396
6397 // Estimate cost of scalarized vector call. The source operands are
6398 // assumed to be vectors, so we need to extract individual elements from
6399 // there, execute VF scalar calls, and then gather the result into the
6400 // vector return value.
6401 InstructionCost ScalarCallCost =
6402 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6403
6404 // Compute costs of unpacking argument values for the scalar calls and
6405 // packing the return values to a vector.
6406 InstructionCost ScalarizationCost =
6407 getScalarizationOverhead(CI, VF, CostKind);
6408
6409 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6410 // Honor ForcedScalars and UniformAfterVectorization decisions.
6411 // TODO: For calls, it might still be more profitable to widen. Use
6412 // VPlan-based cost model to compare different options.
6413 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6414 ForcedScalar->second.contains(CI)) ||
6415 isUniformAfterVectorization(CI, VF))) {
6416 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6417 Intrinsic::not_intrinsic, std::nullopt,
6418 ScalarCost);
6419 continue;
6420 }
6421
6422 bool MaskRequired = Legal->isMaskRequired(CI);
6423 // Compute corresponding vector type for return value and arguments.
6424 Type *RetTy = toVectorTy(ScalarRetTy, VF);
6425 for (Type *ScalarTy : ScalarTys)
6426 Tys.push_back(toVectorTy(ScalarTy, VF));
6427
6428 // An in-loop reduction using an fmuladd intrinsic is a special case;
6429 // we don't want the normal cost for that intrinsic.
6431 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6434 std::nullopt, *RedCost);
6435 continue;
6436 }
6437
6438 // Find the cost of vectorizing the call, if we can find a suitable
6439 // vector variant of the function.
6440 bool UsesMask = false;
6441 VFInfo FuncInfo;
6442 Function *VecFunc = nullptr;
6443 // Search through any available variants for one we can use at this VF.
6444 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6445 // Must match requested VF.
6446 if (Info.Shape.VF != VF)
6447 continue;
6448
6449 // Must take a mask argument if one is required
6450 if (MaskRequired && !Info.isMasked())
6451 continue;
6452
6453 // Check that all parameter kinds are supported
6454 bool ParamsOk = true;
6455 for (VFParameter Param : Info.Shape.Parameters) {
6456 switch (Param.ParamKind) {
6458 break;
6460 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6461 // Make sure the scalar parameter in the loop is invariant.
6462 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6463 TheLoop))
6464 ParamsOk = false;
6465 break;
6466 }
6468 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6469 // Find the stride for the scalar parameter in this loop and see if
6470 // it matches the stride for the variant.
6471 // TODO: do we need to figure out the cost of an extract to get the
6472 // first lane? Or do we hope that it will be folded away?
6473 ScalarEvolution *SE = PSE.getSE();
6474 const auto *SAR =
6475 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6476
6477 if (!SAR || SAR->getLoop() != TheLoop) {
6478 ParamsOk = false;
6479 break;
6480 }
6481
6482 const SCEVConstant *Step =
6483 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6484
6485 if (!Step ||
6486 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6487 ParamsOk = false;
6488
6489 break;
6490 }
6492 UsesMask = true;
6493 break;
6494 default:
6495 ParamsOk = false;
6496 break;
6497 }
6498 }
6499
6500 if (!ParamsOk)
6501 continue;
6502
6503 // Found a suitable candidate, stop here.
6504 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6505 FuncInfo = Info;
6506 break;
6507 }
6508
6509 // Add in the cost of synthesizing a mask if one wasn't required.
6510 InstructionCost MaskCost = 0;
6511 if (VecFunc && UsesMask && !MaskRequired)
6512 MaskCost = TTI.getShuffleCost(
6515 VecFunc->getFunctionType()->getContext()),
6516 VF));
6517
6518 if (TLI && VecFunc && !CI->isNoBuiltin())
6519 VectorCost =
6520 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6521
6522 // Find the cost of an intrinsic; some targets may have instructions that
6523 // perform the operation without needing an actual call.
6525 if (IID != Intrinsic::not_intrinsic)
6526 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6527
6528 InstructionCost Cost = ScalarCost;
6529 InstWidening Decision = CM_Scalarize;
6530
6531 if (VectorCost <= Cost) {
6532 Cost = VectorCost;
6533 Decision = CM_VectorCall;
6534 }
6535
6536 if (IntrinsicCost <= Cost) {
6537 Cost = IntrinsicCost;
6538 Decision = CM_IntrinsicCall;
6539 }
6540
6541 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6543 }
6544 }
6545}
6546
6548 if (!Legal->isInvariant(Op))
6549 return false;
6550 // Consider Op invariant only if neither it nor any of its operands is a
6551 // predicated instruction in the loop; such an instruction is not trivially hoistable.
6552 auto *OpI = dyn_cast<Instruction>(Op);
6553 return !OpI || !TheLoop->contains(OpI) ||
6554 (!isPredicatedInst(OpI) &&
6555 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6556 all_of(OpI->operands(),
6557 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6558}
6559
6562 ElementCount VF) {
6563 // If we know that this instruction will remain uniform, check the cost of
6564 // the scalar version.
6566 VF = ElementCount::getFixed(1);
6567
6568 if (VF.isVector() && isProfitableToScalarize(I, VF))
6569 return InstsToScalarize[VF][I];
6570
6571 // Forced scalars do not have any scalarization overhead.
6572 auto ForcedScalar = ForcedScalars.find(VF);
6573 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6574 auto InstSet = ForcedScalar->second;
6575 if (InstSet.count(I))
6577 VF.getKnownMinValue();
6578 }
6579
6580 Type *RetTy = I->getType();
6582 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6583 auto *SE = PSE.getSE();
6585
6586 auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6587 ElementCount VF) -> bool {
6588 if (VF.isScalar())
6589 return true;
6590
6591 auto Scalarized = InstsToScalarize.find(VF);
6592 assert(Scalarized != InstsToScalarize.end() &&
6593 "VF not yet analyzed for scalarization profitability");
6594 return !Scalarized->second.count(I) &&
6595 llvm::all_of(I->users(), [&](User *U) {
6596 auto *UI = cast<Instruction>(U);
6597 return !Scalarized->second.count(UI);
6598 });
6599 };
6600 (void)HasSingleCopyAfterVectorization;
6601
6602 Type *VectorTy;
6603 if (isScalarAfterVectorization(I, VF)) {
6604 // With the exception of GEPs and PHIs, after scalarization there should
6605 // only be one copy of the instruction generated in the loop. This is
6606 // because the VF is either 1, or any instructions that need scalarizing
6607 // have already been dealt with by the time we get here. As a result,
6608 // it means we don't have to multiply the instruction cost by VF.
6609 assert(I->getOpcode() == Instruction::GetElementPtr ||
6610 I->getOpcode() == Instruction::PHI ||
6611 (I->getOpcode() == Instruction::BitCast &&
6612 I->getType()->isPointerTy()) ||
6613 HasSingleCopyAfterVectorization(I, VF));
6614 VectorTy = RetTy;
6615 } else
6616 VectorTy = toVectorTy(RetTy, VF);
6617
6618 if (VF.isVector() && VectorTy->isVectorTy() &&
6619 !TTI.getNumberOfParts(VectorTy))
6621
6622 // TODO: We need to estimate the cost of intrinsic calls.
6623 switch (I->getOpcode()) {
6624 case Instruction::GetElementPtr:
6625 // We mark this instruction as zero-cost because the cost of GEPs in
6626 // vectorized code depends on whether the corresponding memory instruction
6627 // is scalarized or not. Therefore, we handle GEPs with the memory
6628 // instruction cost.
6629 return 0;
6630 case Instruction::Br: {
6631 // In cases of scalarized and predicated instructions, there will be VF
6632 // predicated blocks in the vectorized loop. Each branch around these
6633 // blocks requires also an extract of its vector compare i1 element.
6634 // Note that the conditional branch from the loop latch will be replaced by
6635 // a single branch controlling the loop, so there is no extra overhead from
6636 // scalarization.
6637 bool ScalarPredicatedBB = false;
6638 BranchInst *BI = cast<BranchInst>(I);
6639 if (VF.isVector() && BI->isConditional() &&
6640 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6641 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6642 BI->getParent() != TheLoop->getLoopLatch())
6643 ScalarPredicatedBB = true;
6644
6645 if (ScalarPredicatedBB) {
6646 // Not possible to scalarize scalable vector with predicated instructions.
6647 if (VF.isScalable())
6649 // Return cost for branches around scalarized and predicated blocks.
6650 auto *VecI1Ty =
6651 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6652 return (
6654 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6655 /*Insert*/ false, /*Extract*/ true, CostKind) +
6656 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6657 }
6658
6659 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6660 // The back-edge branch will remain, as will all scalar branches.
6661 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6662
6663 // This branch will be eliminated by if-conversion.
6664 return 0;
6665 // Note: We currently assume zero cost for an unconditional branch inside
6666 // a predicated block since it will become a fall-through, although we
6667 // may decide in the future to call TTI for all branches.
6668 }
6669 case Instruction::Switch: {
6670 if (VF.isScalar())
6671 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6672 auto *Switch = cast<SwitchInst>(I);
6673 return Switch->getNumCases() *
6675 Instruction::ICmp,
6676 toVectorTy(Switch->getCondition()->getType(), VF),
6677 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6679 }
6680 case Instruction::PHI: {
6681 auto *Phi = cast<PHINode>(I);
6682
6683 // First-order recurrences are replaced by vector shuffles inside the loop.
6684 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6685 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6686 // penultimate value of the recurrence.
6687 // TODO: Consider vscale_range info.
6688 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6691 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6693 cast<VectorType>(VectorTy), Mask, CostKind,
6694 VF.getKnownMinValue() - 1);
6695 }
6696
6697 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6698 // converted into select instructions. We require N - 1 selects per phi
6699 // node, where N is the number of incoming values.
6700 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6701 Type *ResultTy = Phi->getType();
6702
6703 // All instructions in an Any-of reduction chain are narrowed to bool.
6704 // Check if that is the case for this phi node.
6705 auto *HeaderUser = cast_if_present<PHINode>(
6706 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6707 auto *Phi = dyn_cast<PHINode>(U);
6708 if (Phi && Phi->getParent() == TheLoop->getHeader())
6709 return Phi;
6710 return nullptr;
6711 }));
6712 if (HeaderUser) {
6713 auto &ReductionVars = Legal->getReductionVars();
6714 auto Iter = ReductionVars.find(HeaderUser);
6715 if (Iter != ReductionVars.end() &&
6717 Iter->second.getRecurrenceKind()))
6718 ResultTy = Type::getInt1Ty(Phi->getContext());
6719 }
6720 return (Phi->getNumIncomingValues() - 1) *
6722 Instruction::Select, toVectorTy(ResultTy, VF),
6723 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6725 }
6726
6727 // When tail folding with EVL, if the phi is part of an out of loop
6728 // reduction then it will be transformed into a wide vp_merge.
6729 if (VF.isVector() && foldTailWithEVL() &&
6732 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6733 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6734 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6735 }
6736
6737 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6738 }
6739 case Instruction::UDiv:
6740 case Instruction::SDiv:
6741 case Instruction::URem:
6742 case Instruction::SRem:
6743 if (VF.isVector() && isPredicatedInst(I)) {
6744 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6745 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6746 ScalarCost : SafeDivisorCost;
6747 }
6748 // We've proven all lanes safe to speculate, fall through.
6749 [[fallthrough]];
6750 case Instruction::Add:
6751 case Instruction::Sub: {
6752 auto Info = Legal->getHistogramInfo(I);
6753 if (Info && VF.isVector()) {
6754 const HistogramInfo *HGram = Info.value();
6755 // Assume that a non-constant update value (or a constant != 1) requires
6756 // a multiply, and add that into the cost.
6758 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6759 if (!RHS || RHS->getZExtValue() != 1)
6760 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6761
6762 // Find the cost of the histogram operation itself.
6763 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6764 Type *ScalarTy = I->getType();
6765 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6766 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6767 Type::getVoidTy(I->getContext()),
6768 {PtrTy, ScalarTy, MaskTy});
6769
6770 // Add the costs together with the add/sub operation.
6773 MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6774 }
6775 [[fallthrough]];
6776 }
6777 case Instruction::FAdd:
6778 case Instruction::FSub:
6779 case Instruction::Mul:
6780 case Instruction::FMul:
6781 case Instruction::FDiv:
6782 case Instruction::FRem:
6783 case Instruction::Shl:
6784 case Instruction::LShr:
6785 case Instruction::AShr:
6786 case Instruction::And:
6787 case Instruction::Or:
6788 case Instruction::Xor: {
6789 // If we're speculating on the stride being 1, the multiplication may
6790 // fold away. We can generalize this for all operations using the notion
6791 // of neutral elements. (TODO)
6792 if (I->getOpcode() == Instruction::Mul &&
6793 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6794 PSE.getSCEV(I->getOperand(1))->isOne()))
6795 return 0;
6796
6797 // Detect reduction patterns
6798 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6799 return *RedCost;
6800
6801 // Certain instructions can be cheaper to vectorize if they have a constant
6802 // second vector operand. One example of this are shifts on x86.
6803 Value *Op2 = I->getOperand(1);
6804 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6805 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6806 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6807 }
6808 auto Op2Info = TTI.getOperandInfo(Op2);
6809 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6812
6813 SmallVector<const Value *, 4> Operands(I->operand_values());
6815 I->getOpcode(), VectorTy, CostKind,
6816 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6817 Op2Info, Operands, I, TLI);
6818 }
6819 case Instruction::FNeg: {
6821 I->getOpcode(), VectorTy, CostKind,
6822 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6823 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6824 I->getOperand(0), I);
6825 }
6826 case Instruction::Select: {
6827 SelectInst *SI = cast<SelectInst>(I);
6828 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6829 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6830
6831 const Value *Op0, *Op1;
6832 using namespace llvm::PatternMatch;
6833 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6834 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6835 // select x, y, false --> x & y
6836 // select x, true, y --> x | y
6837 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6838 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6839 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6840 Op1->getType()->getScalarSizeInBits() == 1);
6841
6844 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6845 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6846 }
6847
6848 Type *CondTy = SI->getCondition()->getType();
6849 if (!ScalarCond)
6850 CondTy = VectorType::get(CondTy, VF);
6851
6853 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6854 Pred = Cmp->getPredicate();
6855 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6856 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6857 {TTI::OK_AnyValue, TTI::OP_None}, I);
6858 }
6859 case Instruction::ICmp:
6860 case Instruction::FCmp: {
6861 Type *ValTy = I->getOperand(0)->getType();
6862
6864 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6865 (void)Op0AsInstruction;
6866 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6867 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6868 "if both the operand and the compare are marked for "
6869 "truncation, they must have the same bitwidth");
6870 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6871 }
6872
6873 VectorTy = toVectorTy(ValTy, VF);
6874 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6875 cast<CmpInst>(I)->getPredicate(), CostKind,
6876 {TTI::OK_AnyValue, TTI::OP_None},
6877 {TTI::OK_AnyValue, TTI::OP_None}, I);
6878 }
6879 case Instruction::Store:
6880 case Instruction::Load: {
6881 ElementCount Width = VF;
6882 if (Width.isVector()) {
6883 InstWidening Decision = getWideningDecision(I, Width);
6884 assert(Decision != CM_Unknown &&
6885 "CM decision should be taken at this point");
6888 if (Decision == CM_Scalarize)
6889 Width = ElementCount::getFixed(1);
6890 }
6891 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6892 return getMemoryInstructionCost(I, VF);
6893 }
6894 case Instruction::BitCast:
6895 if (I->getType()->isPointerTy())
6896 return 0;
6897 [[fallthrough]];
6898 case Instruction::ZExt:
6899 case Instruction::SExt:
6900 case Instruction::FPToUI:
6901 case Instruction::FPToSI:
6902 case Instruction::FPExt:
6903 case Instruction::PtrToInt:
6904 case Instruction::IntToPtr:
6905 case Instruction::SIToFP:
6906 case Instruction::UIToFP:
6907 case Instruction::Trunc:
6908 case Instruction::FPTrunc: {
6909 // Computes the CastContextHint from a Load/Store instruction.
6910 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6911 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6912 "Expected a load or a store!");
6913
6914 if (VF.isScalar() || !TheLoop->contains(I))
6916
6917 switch (getWideningDecision(I, VF)) {
6929 llvm_unreachable("Instr did not go through cost modelling?");
6932 llvm_unreachable_internal("Instr has invalid widening decision");
6933 }
6934
6935 llvm_unreachable("Unhandled case!");
6936 };
6937
6938 unsigned Opcode = I->getOpcode();
6940 // For Trunc, the context is the only user, which must be a StoreInst.
6941 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6942 if (I->hasOneUse())
6943 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6944 CCH = ComputeCCH(Store);
6945 }
6946 // For Z/Sext, the context is the operand, which must be a LoadInst.
6947 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6948 Opcode == Instruction::FPExt) {
6949 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6950 CCH = ComputeCCH(Load);
6951 }
6952
6953 // We optimize the truncation of induction variables having constant
6954 // integer steps. The cost of these truncations is the same as the scalar
6955 // operation.
6956 if (isOptimizableIVTruncate(I, VF)) {
6957 auto *Trunc = cast<TruncInst>(I);
6958 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6959 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6960 }
6961
6962 // Detect reduction patterns
6963 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6964 return *RedCost;
6965
6966 Type *SrcScalarTy = I->getOperand(0)->getType();
6967 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6968 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6969 SrcScalarTy =
6970 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6971 Type *SrcVecTy =
6972 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6973
6975 // If the result type is <= the source type, there will be no extend
6976 // after truncating the users to the minimal required bitwidth.
6977 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6978 (I->getOpcode() == Instruction::ZExt ||
6979 I->getOpcode() == Instruction::SExt))
6980 return 0;
6981 }
6982
6983 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6984 }
6985 case Instruction::Call:
6986 return getVectorCallCost(cast<CallInst>(I), VF);
6987 case Instruction::ExtractValue:
6989 case Instruction::Alloca:
6990 // We cannot easily widen alloca to a scalable alloca, as
6991 // the result would need to be a vector of pointers.
6992 if (VF.isScalable())
6994 [[fallthrough]];
6995 default:
6996 // This opcode is unknown. Assume that it is the same as 'mul'.
6997 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6998 } // end of switch.
6999}
7000
// Collects instructions whose cost should not be counted. Entries are added to
// ValuesToIgnore (ignored for both scalar and vector versions) and to
// VecValuesToIgnore (ignored for vector versions only).
// NOTE(review): this listing has gaps in its embedded line numbers (e.g. 7001,
// 7003, 7006, 7017, 7037, 7121), so the function's declaration and a few
// statements are not visible here.
7002 // Ignore ephemeral values.
7004
7005 SmallVector<Value *, 4> DeadInterleavePointerOps;
7007
7008 // If a scalar epilogue is required, users outside the loop won't use
7009 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
7010 // that is the case.
7011 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
7012 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
7013 return RequiresScalarEpilogue &&
7014 !TheLoop->contains(cast<Instruction>(U)->getParent());
7015 };
7016
7018 DFS.perform(LI);
// Maps each invariant store address to the values stored to it, in the order
// the stores are visited below.
7019 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
// Blocks are visited in the reverse of the DFS reverse-post-order, and the
// instructions of each block back to front.
7020 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
7021 for (Instruction &I : reverse(*BB)) {
7022 // Find all stores to invariant variables. Since they are going to sink
7023 // outside the loop we do not need to calculate the cost for them.
7024 StoreInst *SI;
7025 if ((SI = dyn_cast<StoreInst>(&I)) &&
7026 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
7027 ValuesToIgnore.insert(&I);
7028 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
7029 SI->getValueOperand());
7030 }
7031
7032 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
7033 continue;
7034
7035 // Add instructions that would be trivially dead and are only used by
7036 // values already ignored to DeadOps to seed worklist.
7038 all_of(I.users(), [this, IsLiveOutDead](User *U) {
7039 return VecValuesToIgnore.contains(U) ||
7040 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
7041 }))
7042 DeadOps.push_back(&I);
7043
7044 // For interleave groups, we only create a pointer for the start of the
7045 // interleave group. Queue up addresses of group members except the insert
7046 // position for further processing.
7047 if (isAccessInterleaved(&I)) {
7048 auto *Group = getInterleavedAccessGroup(&I);
7049 if (Group->getInsertPos() == &I)
7050 continue;
7051 Value *PointerOp = getLoadStorePointerOperand(&I);
7052 DeadInterleavePointerOps.push_back(PointerOp);
7053 }
7054
7055 // Queue branches for analysis. They are dead, if their successors only
7056 // contain dead instructions.
7057 if (auto *Br = dyn_cast<BranchInst>(&I)) {
7058 if (Br->isConditional())
7059 DeadOps.push_back(&I);
7060 }
7061 }
7062
7063 // Mark ops feeding interleave group members as free, if they are only used
7064 // by other dead computations.
// Index-based loop: the worklist grows while it is being processed.
7065 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7066 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7067 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7068 Instruction *UI = cast<Instruction>(U);
7069 return !VecValuesToIgnore.contains(U) &&
7070 (!isAccessInterleaved(UI) ||
7071 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7072 }))
7073 continue;
7074 VecValuesToIgnore.insert(Op);
7075 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7076 }
7077
// For every invariant store address, queue all recorded stored values except
// the last one recorded as candidates for dead operations.
7078 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7079 for (Value *Op : ArrayRef(Ops).drop_back())
7080 DeadOps.push_back(Op);
7081 }
7082 // Mark ops that would be trivially dead and are only used by ignored
7083 // instructions as free.
7084 BasicBlock *Header = TheLoop->getHeader();
7085
7086 // Returns true if the block contains only dead instructions. Such blocks will
7087 // be removed by VPlan-to-VPlan transforms and won't be considered by the
7088 // VPlan-based cost model, so skip them in the legacy cost-model as well.
7089 auto IsEmptyBlock = [this](BasicBlock *BB) {
7090 return all_of(*BB, [this](Instruction &I) {
7091 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7092 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7093 });
7094 };
// Index-based loop: DeadOps grows as operands of dead ops are appended.
7095 for (unsigned I = 0; I != DeadOps.size(); ++I) {
7096 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7097
7098 // Check if the branch should be considered dead.
7099 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7100 BasicBlock *ThenBB = Br->getSuccessor(0);
7101 BasicBlock *ElseBB = Br->getSuccessor(1);
7102 // Don't consider branches leaving the loop for simplification.
7103 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7104 continue;
7105 bool ThenEmpty = IsEmptyBlock(ThenBB);
7106 bool ElseEmpty = IsEmptyBlock(ElseBB);
// The branch is treated as dead if both successors are empty, or if one
// successor is empty and falls through to the other, which has no phis.
7107 if ((ThenEmpty && ElseEmpty) ||
7108 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7109 ElseBB->phis().empty()) ||
7110 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7111 ThenBB->phis().empty())) {
7112 VecValuesToIgnore.insert(Br);
7113 DeadOps.push_back(Br->getCondition());
7114 }
7115 continue;
7116 }
7117
7118 // Skip any op that shouldn't be considered dead.
7119 if (!Op || !TheLoop->contains(Op) ||
7120 (isa<PHINode>(Op) && Op->getParent() == Header) ||
7122 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7123 return !VecValuesToIgnore.contains(U) &&
7124 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7125 }))
7126 continue;
7127
// NOTE(review): looks redundant with the TheLoop->contains(Op) check above --
// confirm.
7128 if (!TheLoop->contains(Op->getParent()))
7129 continue;
7130
7131 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
7132 // which applies for both scalar and vector versions. Otherwise it is only
7133 // dead in vector versions, so only add it to VecValuesToIgnore.
7134 if (all_of(Op->users(),
7135 [this](User *U) { return ValuesToIgnore.contains(U); }))
7136 ValuesToIgnore.insert(Op);
7137
7138 VecValuesToIgnore.insert(Op);
7139 DeadOps.append(Op->op_begin(), Op->op_end());
7140 }
7141
7142 // Ignore type-promoting instructions we identified during reduction
7143 // detection.
7144 for (const auto &Reduction : Legal->getReductionVars()) {
7145 const RecurrenceDescriptor &RedDes = Reduction.second;
7146 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7147 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7148 }
7149 // Ignore type-casting instructions we identified during induction
7150 // detection.
7151 for (const auto &Induction : Legal->getInductionVars()) {
7152 const InductionDescriptor &IndDes = Induction.second;
7153 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7154 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7155 }
7156}
7157
// For every reduction reported by Legal->getReductionVars(), decides whether it
// is performed in-loop. In-loop reduction phis are recorded in
// InLoopReductions, and each operation of the reduction chain is mapped to its
// predecessor in InLoopReductionImmediateChains.
// NOTE(review): the function's declaration (7158) and line 7172 are not
// visible in this listing.
7159 for (const auto &Reduction : Legal->getReductionVars()) {
7160 PHINode *Phi = Reduction.first;
7161 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7162
7163 // We don't collect reductions that are type promoted (yet).
7164 if (RdxDesc.getRecurrenceType() != Phi->getType())
7165 continue;
7166
7167 // If the target would prefer this reduction to happen "in-loop", then we
7168 // want to record it as such.
7169 unsigned Opcode = RdxDesc.getOpcode();
7170 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7171 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7173 continue;
7174
7175 // Check that we can correctly put the reductions into the loop, by
7176 // finding the chain of operations that leads from the phi to the loop
7177 // exit value.
7178 SmallVector<Instruction *, 4> ReductionOperations =
7179 RdxDesc.getReductionOpChain(Phi, TheLoop);
// An empty chain means the reduction is kept out of the loop.
7180 bool InLoop = !ReductionOperations.empty();
7181
7182 if (InLoop) {
7183 InLoopReductions.insert(Phi);
7184 // Add the elements to InLoopReductionImmediateChains for cost modelling.
// Each chain operation maps to the previous link; the first maps to the phi.
7185 Instruction *LastChain = Phi;
7186 for (auto *I : ReductionOperations) {
7187 InLoopReductionImmediateChains[I] = LastChain;
7188 LastChain = I;
7189 }
7190 }
7191 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7192 << " reduction for phi: " << *Phi << "\n");
7193 }
7194}
7195
7196// This function will select a scalable VF if the target supports scalable
7197// vectors and a fixed one otherwise.
7198// TODO: we could return a pair of values that specify the max VF and
7199// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7200// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7201// doesn't have a cost model that can choose which plan to execute if
7202// more than one is generated.
// NOTE(review): the declaration and the computation of RegSize (lines
// 7203-7204, 7208-7211, 7213) are not visible in this listing.
// Only the widest type reported by the cost model is used here.
7205 unsigned WidestType;
7206 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7207
7212
// The VF is the known-minimum register size divided by the widest type; it is
// scalable exactly when RegSize is scalable.
7214 unsigned N = RegSize.getKnownMinValue() / WidestType;
7215 return ElementCount::get(N, RegSize.isScalable());
7216}
7217
// Plans vectorization of a non-innermost loop: picks a VF (the user-provided
// one, or one computed via determineVPlanVF), builds VPlans for exactly that
// VF and returns it with zero cost and zero scalar cost. For innermost loops
// only a debug message is printed in the visible code.
// NOTE(review): the declaration (7218-7219) and several statements (7239,
// 7242, 7248, 7251, 7258-7259, 7267) are not visible in this listing.
7220 ElementCount VF = UserVF;
7221 // Outer loop handling: They may require CFG and instruction level
7222 // transformations before even evaluating whether vectorization is profitable.
7223 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7224 // the vectorization pipeline.
7225 if (!OrigLoop->isInnermost()) {
7226 // If the user doesn't provide a vectorization factor, determine a
7227 // reasonable one.
7228 if (UserVF.isZero()) {
7229 VF = determineVPlanVF(TTI, CM);
7230 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7231
7232 // Make sure we have a VF > 1 for stress testing.
7233 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7234 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7235 << "overriding computed VF.\n");
7236 VF = ElementCount::getFixed(4);
7237 }
7238 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7240 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7241 << "not supported by the target.\n");
7243 "Scalable vectorization requested but not supported by the target",
7244 "the scalable user-specified vectorization width for outer-loop "
7245 "vectorization cannot be used because the target does not support "
7246 "scalable vectors.",
7247 "ScalableVFUnfeasible", ORE, OrigLoop);
7249 }
7250 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7252 "VF needs to be a power of two");
7253 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7254 << "VF " << VF << " to build VPlans.\n");
// Build plans for the single chosen VF only.
7255 buildVPlans(VF, VF);
7256
7257 // For VPlan build stress testing, we bail out after VPlan construction.
7260
7261 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7262 }
7263
7264 LLVM_DEBUG(
7265 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7266 "VPlan-native path.\n");
7268}
7269
// Plans vectorization of an innermost loop. Computes the maximum feasible VFs
// via the cost model, tries a user-provided VF first if one is given, and
// otherwise builds VPlans with recipes for the fixed and scalable VF ranges
// from 1 up to the respective maximum.
// \p UserVF is the user-requested vectorization factor (may be zero);
// \p UserIC is forwarded to CM.computeMaxVF.
// NOTE(review): several statements (e.g. 7272-7273, 7281, 7286, 7290, 7294,
// 7300, 7304, 7308, 7312, 7329, 7332, 7337, 7343) are not visible in this
// listing.
7270void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7271 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7274
7275 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7276 if (!MaxFactors) // Cases that should be neither vectorized nor interleaved.
7277 return;
7278
7279 // Invalidate interleave groups if all blocks of loop will be predicated.
7280 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7282 LLVM_DEBUG(
7283 dbgs()
7284 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7285 "which requires masked-interleaved support.\n");
7287 // Invalidating interleave groups also requires invalidating all decisions
7288 // based on them, which includes widening decisions and uniform and scalar
7289 // values.
7291 }
7292
7293 if (CM.foldTailByMasking())
7295
// Compare the user VF against the maximum of the matching kind (scalable or
// fixed).
7296 ElementCount MaxUserVF =
7297 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7298 if (UserVF) {
7299 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7301 "UserVF ignored because it may be larger than the maximal safe VF",
7302 "InvalidUserVF", ORE, OrigLoop);
7303 } else {
7305 "VF needs to be a power of two");
7306 // Collect the instructions (and their associated costs) that will be more
7307 // profitable to scalarize.
7309 if (CM.selectUserVectorizationFactor(UserVF)) {
7310 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7311 buildVPlansWithVPRecipes(UserVF, UserVF);
7313 return;
7314 }
7315 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7316 "InvalidCost", ORE, OrigLoop);
7317 }
7318 }
7319
7320 // Collect the Vectorization Factor Candidates.
// Candidates are powers of two: 1, 2, 4, ... up to the fixed maximum, followed
// by the scalable counterparts up to the scalable maximum.
7321 SmallVector<ElementCount> VFCandidates;
7322 for (auto VF = ElementCount::getFixed(1);
7323 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7324 VFCandidates.push_back(VF);
7325 for (auto VF = ElementCount::getScalable(1);
7326 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7327 VFCandidates.push_back(VF);
7328
7330 for (const auto &VF : VFCandidates) {
7331 // Collect Uniform and Scalar instructions after vectorization with VF.
7333
7334 // Collect the instructions (and their associated costs) that will be more
7335 // profitable to scalarize.
7336 if (VF.isVector())
7338 }
7339
7340 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7341 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7342
7344}
7345
7347 ElementCount VF) const {
7348 if (ForceTargetInstructionCost.getNumOccurrences())
7349 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7350 return CM.getInstructionCost(UI, VF);
7351}
7352
7353bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7354 return CM.ValuesToIgnore.contains(UI) ||
7355 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7356 SkipCostComputation.contains(UI);
7357}
7358
// Pre-computes, using the legacy cost model, the costs of instructions that the
// VPlan-based cost model should not cost itself: induction phis and increments,
// exit conditions, in-loop reduction patterns, branches other than the latch
// terminator, and forced-scalar / profitable-to-scalarize instructions. Every
// instruction costed here is added to CostCtx.SkipCostComputation. Returns the
// accumulated cost.
// NOTE(review): the return type line (7359), the declaration of Cost (7362)
// and lines 7402 and 7422 are not visible in this listing.
7360LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7361 VPCostContext &CostCtx) const {
7363 // Cost modeling for inductions is inaccurate in the legacy cost model
7364 // compared to the recipes that are generated. To match here initially during
7365 // VPlan cost model bring up directly use the induction costs from the legacy
7366 // cost model. Note that we do this as pre-processing; the VPlan may not have
7367 // any recipes associated with the original induction increment instruction
7368 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7369 // the cost of induction phis and increments (both that are represented by
7370 // recipes and those that are not), to avoid distinguishing between them here,
7371 // and skip all recipes that represent induction phis and increments (the
7372 // former case) later on, if they exist, to avoid counting them twice.
7373 // Similarly we pre-compute the cost of any optimized truncates.
7374 // TODO: Switch to more accurate costing based on VPlan.
7375 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7376 Instruction *IVInc = cast<Instruction>(
7377 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
// Starting from the increment, collect in-loop single-use operand
// instructions (other than the IV itself); the list grows while iterating.
7378 SmallVector<Instruction *> IVInsts = {IVInc};
7379 for (unsigned I = 0; I != IVInsts.size(); I++) {
7380 for (Value *Op : IVInsts[I]->operands()) {
7381 auto *OpI = dyn_cast<Instruction>(Op);
7382 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7383 continue;
7384 IVInsts.push_back(OpI);
7385 }
7386 }
7387 IVInsts.push_back(IV);
// Also include users of the IV that are optimizable IV truncates.
7388 for (User *U : IV->users()) {
7389 auto *CI = cast<Instruction>(U);
7390 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7391 continue;
7392 IVInsts.push_back(CI);
7393 }
7394
7395 // If the vector loop gets executed exactly once with the given VF, ignore
7396 // the costs of comparison and induction instructions, as they'll get
7397 // simplified away.
7398 // TODO: Remove this code after stepping away from the legacy cost model and
7399 // adding code to simplify VPlans before calculating their costs.
7400 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7401 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7403 CostCtx.SkipCostComputation);
7404
7405 for (Instruction *IVInst : IVInsts) {
7406 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7407 continue;
7408 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7409 LLVM_DEBUG({
7410 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7411 << ": induction instruction " << *IVInst << "\n";
7412 });
7413 Cost += InductionCost;
7414 CostCtx.SkipCostComputation.insert(IVInst);
7415 }
7416 }
7417
7418 /// Compute the cost of all exiting conditions of the loop using the legacy
7419 /// cost model. This is to match the legacy behavior, which adds the cost of
7420 /// all exit conditions. Note that this over-estimates the cost, as there will
7421 /// be a single condition to control the vector loop.
7423 CM.TheLoop->getExitingBlocks(Exiting);
7424 SetVector<Instruction *> ExitInstrs;
7425 // Collect all exit conditions.
7426 for (BasicBlock *EB : Exiting) {
7427 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7428 if (!Term)
7429 continue;
7430 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7431 ExitInstrs.insert(CondI);
7432 }
7433 }
7434 // Compute the cost of all instructions only feeding the exit conditions.
// ExitInstrs doubles as the worklist and grows while it is processed.
7435 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7436 Instruction *CondI = ExitInstrs[I];
// Skip instructions outside the loop and ones already marked as skipped; the
// insert() both tests and marks.
7437 if (!OrigLoop->contains(CondI) ||
7438 !CostCtx.SkipCostComputation.insert(CondI).second)
7439 continue;
7440 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7441 LLVM_DEBUG({
7442 dbgs() << "Cost of " << CondICost << " for VF " << VF
7443 << ": exit condition instruction " << *CondI << "\n";
7444 });
7445 Cost += CondICost;
// Queue operands whose in-loop users are all already in ExitInstrs.
7446 for (Value *Op : CondI->operands()) {
7447 auto *OpI = dyn_cast<Instruction>(Op);
7448 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7449 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7450 !ExitInstrs.contains(cast<Instruction>(U));
7451 }))
7452 continue;
7453 ExitInstrs.insert(OpI);
7454 }
7455 }
7456
7457 // The legacy cost model has special logic to compute the cost of in-loop
7458 // reductions, which may be smaller than the sum of all instructions involved
7459 // in the reduction.
7460 // TODO: Switch to costing based on VPlan once the logic has been ported.
7461 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
// Reduction pattern costs are not pre-computed when a target instruction cost
// is forced.
7462 if (ForceTargetInstructionCost.getNumOccurrences())
7463 continue;
7464
7465 if (!CM.isInLoopReduction(RedPhi))
7466 continue;
7467
7468 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7469 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7470 ChainOps.end());
7471 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7472 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7473 };
7474 // Also include the operands of instructions in the chain, as the cost-model
7475 // may mark extends as free.
7476 //
7477 // For ARM, some of the instructions can be folded into the reduction
7478 // instruction. So we need to mark all folded instructions free.
7479 // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7480 // instruction.
7481 for (auto *ChainOp : ChainOps) {
7482 for (Value *Op : ChainOp->operands()) {
7483 if (auto *I = dyn_cast<Instruction>(Op)) {
7484 ChainOpsAndOperands.insert(I);
7485 if (I->getOpcode() == Instruction::Mul) {
7486 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7487 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
// Both multiply operands must be extends of the same kind.
7488 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7489 Ext0->getOpcode() == Ext1->getOpcode()) {
7490 ChainOpsAndOperands.insert(Ext0);
7491 ChainOpsAndOperands.insert(Ext1);
7492 }
7493 }
7494 }
7495 }
7496 }
7497
7498 // Pre-compute the cost for I, if it has a reduction pattern cost.
7499 for (Instruction *I : ChainOpsAndOperands) {
7500 auto ReductionCost = CM.getReductionPatternCost(
7501 I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7502 if (!ReductionCost)
7503 continue;
7504
7505 assert(!CostCtx.SkipCostComputation.contains(I) &&
7506 "reduction op visited multiple times");
7507 CostCtx.SkipCostComputation.insert(I);
7508 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7509 << ":\n in-loop reduction " << *I << "\n");
7510 Cost += *ReductionCost;
7511 }
7512 }
7513
7514 // Pre-compute the costs for branches except for the backedge, as the number
7515 // of replicate regions in a VPlan may not directly match the number of
7516 // branches, which would lead to different decisions.
7517 // TODO: Compute cost of branches for each replicate region in the VPlan,
7518 // which is more accurate than the legacy cost model.
7519 for (BasicBlock *BB : OrigLoop->blocks()) {
7520 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7521 continue;
// The latch terminator is marked as skipped too, but its cost is not added.
7522 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7523 if (BB == OrigLoop->getLoopLatch())
7524 continue;
7525 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7526 Cost += BranchCost;
7527 }
7528
7529 // Pre-compute costs for instructions that are forced-scalar or profitable to
7530 // scalarize. Their costs will be computed separately in the legacy cost
7531 // model.
7532 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7533 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7534 continue;
7535 CostCtx.SkipCostComputation.insert(ForcedScalar);
7536 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7537 LLVM_DEBUG({
7538 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7539 << ": forced scalar " << *ForcedScalar << "\n";
7540 });
7541 Cost += ForcedCost;
7542 }
// For instructions recorded in InstsToScalarize, the stored scalar cost is
// added directly instead of querying the legacy cost model again.
7543 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7544 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7545 continue;
7546 CostCtx.SkipCostComputation.insert(Scalarized);
7547 LLVM_DEBUG({
7548 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7549 << ": profitable to scalarize " << *Scalarized << "\n";
7550 });
7551 Cost += ScalarCost;
7552 }
7553
7554 return Cost;
7555}
7556
7557InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7558 ElementCount VF) const {
7559 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7560 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7561
7562 // Now compute and add the VPlan-based cost.
7563 Cost += Plan.cost(VF, CostCtx);
7564#ifndef NDEBUG
7565 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7566 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7567 << " (Estimated cost per lane: ");
7568 if (Cost.isValid()) {
7569 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7570 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7571 } else /* No point dividing an invalid cost - it will still be invalid */
7572 LLVM_DEBUG(dbgs() << "Invalid");
7573 LLVM_DEBUG(dbgs() << ")\n");
7574#endif
7575 return Cost;
7576}
7577
7578#ifndef NDEBUG
7579/// Return true if the original loop \p TheLoop contains any instructions that do
7580/// not have corresponding recipes in \p Plan and are not marked to be ignored
7581/// in \p CostCtx. This means the VPlan contains simplification that the legacy
7582/// cost-model did not account for.
// NOTE(review): the first line of the declaration (7583) is not visible in
// this listing.
7584 VPCostContext &CostCtx,
7585 Loop *TheLoop) {
7586 // First collect all instructions for the recipes in Plan.
// Maps a recipe to the IR instruction it was created from, or nullptr if none
// can be determined.
7587 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7588 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7589 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7590 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7591 return &WidenMem->getIngredient();
7592 return nullptr;
7593 };
7594
7595 DenseSet<Instruction *> SeenInstrs;
7596 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7597 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7598 for (VPRecipeBase &R : *VPBB) {
// An interleave recipe stands for every non-null member of its group.
7599 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7600 auto *IG = IR->getInterleaveGroup();
7601 unsigned NumMembers = IG->getNumMembers();
7602 for (unsigned I = 0; I != NumMembers; ++I) {
7603 if (Instruction *M = IG->getMember(I))
7604 SeenInstrs.insert(M);
7605 }
7606 continue;
7607 }
7608 if (Instruction *UI = GetInstructionForCost(&R))
7609 SeenInstrs.insert(UI);
7610 }
7611 }
7612
7613 // Return true if the loop contains any instructions that are neither part of
7614 // the VPlan nor skipped for VPlan-based cost computations. This indicates
7615 // that the VPlan contains extra simplifications.
7616 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7617 TheLoop](BasicBlock *BB) {
7618 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
// Phis in the loop header are never reported.
7619 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7620 return false;
7621 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7622 });
7623 });
7624}
7625#endif
7626
// Selects the most profitable vectorization factor among all VFs of the
// available VPlans, using the VPlan-based cost (cost()) for vector VFs and the
// legacy expectedCost() for the scalar baseline. VFs that beat the scalar
// baseline are also appended to ProfitableVFs. In builds with assertions
// enabled the result is cross-checked against the legacy cost model.
// NOTE(review): the declaration (7627) and lines 7629, 7635, 7691 and 7693 are
// not visible in this listing.
7628 if (VPlans.empty())
7630 // If there is a single VPlan with a single VF, return it directly.
7631 VPlan &FirstPlan = *VPlans[0];
7632 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7633 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7634
7636 assert(hasPlanWithVF(ScalarVF) &&
7637 "More than a single plan/VF w/o any plan having scalar VF");
7638
7639 // TODO: Compute scalar cost using VPlan-based cost model.
7640 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7641 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
// The scalar factor is the baseline every vector candidate is compared to.
7642 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7643 VectorizationFactor BestFactor = ScalarFactor;
7644
7645 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7646 if (ForceVectorization) {
7647 // Ignore scalar width, because the user explicitly wants vectorization.
7648 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7649 // evaluation.
7650 BestFactor.Cost = InstructionCost::getMax();
7651 }
7652
7653 for (auto &P : VPlans) {
7654 for (ElementCount VF : P->vectorFactors()) {
7655 if (VF.isScalar())
7656 continue;
7657 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7658 LLVM_DEBUG(
7659 dbgs()
7660 << "LV: Not considering vector loop of width " << VF
7661 << " because it will not generate any vector instructions.\n");
7662 continue;
7663 }
7664
7665 InstructionCost Cost = cost(*P, VF);
7666 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7667 if (isMoreProfitable(CurrentFactor, BestFactor))
7668 BestFactor = CurrentFactor;
7669
7670 // If profitable add it to ProfitableVF list.
7671 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7672 ProfitableVFs.push_back(CurrentFactor);
7673 }
7674 }
7675
7676#ifndef NDEBUG
7677 // Select the optimal vectorization factor according to the legacy cost-model.
7678 // This is now only used to verify the decisions by the new VPlan-based
7679 // cost-model and will be retired once the VPlan-based cost-model is
7680 // stabilized.
7681 VectorizationFactor LegacyVF = selectVectorizationFactor();
7682 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7683
7684 // Pre-compute the cost and use it to check if BestPlan contains any
7685 // simplifications not accounted for in the legacy cost model. If that's the
7686 // case, don't trigger the assertion, as the extra simplifications may cause a
7687 // different VF to be picked by the VPlan-based cost model.
7688 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
// Called for its effect on CostCtx; the returned cost is not used here.
7689 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7690 assert((BestFactor.Width == LegacyVF.Width ||
7692 CostCtx, OrigLoop) ||
7694 CostCtx, OrigLoop)) &&
7695 " VPlan cost model and legacy cost model disagreed");
7696 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7697 "when vectorizing, the scalar cost must be computed.");
7698#endif
7699
7700 return BestFactor;
7701}
7702
// Rebuilds the loop ID of loop L with an extra
// "llvm.loop.unroll.runtime.disable" node appended after the operands of any
// existing loop ID, unless the scan below decides that unroll-disable metadata
// is already present.
// NOTE(review): the function signature and the declaration of MDs are missing
// from this listing (the embedded numbering jumps from 7702 to 7705).
7705 // Reserve first location for self reference to the LoopID metadata node.
7706 MDs.push_back(nullptr);
7707 bool IsUnrollMetadata = false;
7708 MDNode *LoopID = L->getLoopID();
7709 if (LoopID) {
7710 // First find existing loop unrolling disable metadata.
     // The scan starts at operand 1; slot 0 of MDs was reserved above for the
     // self reference. Every scanned operand is copied into MDs.
7711 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7712 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
     // NOTE(review): the flag is assigned (not OR-ed) for every MDNode operand,
     // so the last MDNode operand alone decides the result -- confirm intended.
     // NOTE(review): the prefix test matches "llvm.loop.unroll.disable*" only;
     // it does not match the "llvm.loop.unroll.runtime.disable" string added
     // below.
7713 if (MD) {
7714 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7715 IsUnrollMetadata =
7716 S && S->getString().starts_with("llvm.loop.unroll.disable");
7717 }
7718 MDs.push_back(LoopID->getOperand(I));
7719 }
7720 }
7721
7722 if (!IsUnrollMetadata) {
7723 // Add runtime unroll disable metadata.
7724 LLVMContext &Context = L->getHeader()->getContext();
7725 SmallVector<Metadata *, 1> DisableOperands;
7726 DisableOperands.push_back(
7727 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7728 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7729 MDs.push_back(DisableNode);
7730 MDNode *NewLoopID = MDNode::get(Context, MDs);
7731 // Set operand 0 to refer to the loop id itself.
7732 NewLoopID->replaceOperandWith(0, NewLoopID);
7733 L->setLoopID(NewLoopID);
7734 }
7735}
7736
7737// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7738// fix the reduction's scalar PHI node by adding the incoming value from the
7739// main vector loop.
// Any other recipe is ignored (early return). \p BypassBlock is the block for
// which the incoming value is copied from the main loop's resume phi onto the
// epilogue's resume phi. \p LoopMiddleBlock is not referenced in the visible
// body.
// NOTE(review): the first signature line (7740) and the opening of the
// condition at 7754 are missing from this listing.
7741 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7742 BasicBlock *BypassBlock) {
7743 auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7744 if (!EpiRedResult ||
7745 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7746 return;
7747
     // Operand 0 of the reduction result is the reduction header phi; the IR
     // value underlying its start value is the candidate main-loop resume value.
7748 auto *EpiRedHeaderPhi =
7749 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7750 const RecurrenceDescriptor &RdxDesc =
7751 EpiRedHeaderPhi->getRecurrenceDescriptor();
7752 Value *MainResumeValue =
7753 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
     // In the (partially missing) special case below, the start value is an
     // "icmp ne <resume>, <original start>"; look through it to the resume
     // value itself.
7755 RdxDesc.getRecurrenceKind())) {
7756 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7757 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7758 "AnyOf expected to start with ICMP_NE");
7759 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7760 "AnyOf expected to start by comparing main resume value to original "
7761 "start value");
7762 MainResumeValue = Cmp->getOperand(0);
7763 }
7764 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7765
7766 // When fixing reductions in the epilogue loop we should already have
7767 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7768 // over the incoming values correctly.
7769 using namespace VPlanPatternMatch;
7770 auto IsResumePhi = [](VPUser *U) {
7771 return match(
7772 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7773 };
     // Exactly one user of the reduction result is expected to be a ResumePhi.
7774 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7775 "ResumePhi must have a single user");
7776 auto *EpiResumePhiVPI =
7777 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7778 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7779 EpiResumePhi->setIncomingValueForBlock(
7780 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7781}
7782
// Executes \p BestVPlan for the chosen \p BestVF / \p BestUF: unrolls and
// optimizes the plan, builds a VPTransformState, generates entry-block code
// and the trip count, runs the plan, and then fixes up resume values (when
// \p VectorizingEpilogue), loop metadata and the middle-block branch weights.
// \p ExpandedSCEVs must be non-null exactly when \p VectorizingEpilogue is
// true (asserted below). Returns State.ExpandedSCEVs.
// NOTE(review): several lines, including the first signature line, are missing
// from this listing (gaps in the embedded numbering, e.g. 7783, 7801, 7815,
// 7826, 7829, 7836); comments here describe the visible code only.
7784 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7785 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7786 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7787 assert(BestVPlan.hasVF(BestVF) &&
7788 "Trying to execute plan with unsupported VF");
7789 assert(BestVPlan.hasUF(BestUF) &&
7790 "Trying to execute plan with unsupported UF");
7791 assert(
7792 ((VectorizingEpilogue && ExpandedSCEVs) ||
7793 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7794 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7795
7796 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7797 // cost model is complete for better cost estimates.
7798 VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7799 OrigLoop->getHeader()->getContext());
7800 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7802
7803 // Perform the actual loop transformation.
7804 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7805 &BestVPlan, Legal->getWidestInductionType());
7806
7807#ifdef EXPENSIVE_CHECKS
7808 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7809#endif
7810
7811 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7812 // making any changes to the CFG.
7813 if (!BestVPlan.getEntry()->empty()) {
7814 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7816 BestVPlan.getEntry()->execute(&State);
7817 }
     // Only set the trip count if ILV does not already have one; an existing
     // one is expected only when vectorizing the epilogue.
7818 if (!ILV.getTripCount())
7819 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7820 else
7821 assert(VectorizingEpilogue && "should only re-use the existing trip "
7822 "count during epilogue vectorization");
7823
7824 // 1. Set up the skeleton for vectorization, including vector pre-header and
7825 // middle block. The vector loop is created during VPlan execution.
7827 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7828 if (VectorizingEpilogue)
7830
7831 // Only use noalias metadata when using memory checks guaranteeing no overlap
7832 // across all iterations.
7833 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7834 std::unique_ptr<LoopVersioning> LVer = nullptr;
7835 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7837
7838 // We currently don't use LoopVersioning for the actual loop cloning but we
7839 // still use it to add the noalias metadata.
7840 // TODO: Find a better way to re-use LoopVersioning functionality to add
7841 // metadata.
7842 LVer = std::make_unique<LoopVersioning>(
7843 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7844 PSE.getSE());
7845 State.LVer = &*LVer;
7847 }
7848
7850
7851 //===------------------------------------------------===//
7852 //
7853 // Notice: any optimization or new instruction that go
7854 // into the code below should also be implemented in
7855 // the cost-model.
7856 //
7857 //===------------------------------------------------===//
7858
7859 // 2. Copy and widen instructions from the old loop into the new loop.
7860 BestVPlan.prepareToExecute(
7861 ILV.getTripCount(),
7863
7864 BestVPlan.execute(&State);
7865
7866 auto *ExitVPBB = BestVPlan.getMiddleBlock();
7867 // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7868 // values from the additional bypass block.
7869 if (VectorizingEpilogue) {
7871 "Epilogue vectorisation not yet supported with early exits");
7872 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7873 for (VPRecipeBase &R : *ExitVPBB) {
7875 &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
7876 }
     // For each induction, patch the phi feeding it from the original
     // preheader with a value for BypassBlock. The definition of V (7880) is
     // missing from this listing.
7877 BasicBlock *PH = OrigLoop->getLoopPreheader();
7878 for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7879 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7881 Inc->setIncomingValueForBlock(BypassBlock, V);
7882 }
7883 }
7884
7885 // 2.6. Maintain Loop Hints
7886 // Keep all loop hints from the original loop on the vector loop (we'll
7887 // replace the vectorizer-specific hints below).
7888 MDNode *OrigLoopID = OrigLoop->getLoopID();
7889
7890 std::optional<MDNode *> VectorizedLoopID =
7893
7894 VPBasicBlock *HeaderVPBB =
7896 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7897 if (VectorizedLoopID)
7898 L->setLoopID(*VectorizedLoopID);
7899 else {
7900 // Keep all loop hints from the original loop on the vector loop (we'll
7901 // replace the vectorizer-specific hints below).
7902 if (MDNode *LID = OrigLoop->getLoopID())
7903 L->setLoopID(LID);
7904
7905 LoopVectorizeHints Hints(L, true, *ORE);
7906 Hints.setAlreadyVectorized();
7907 }
7909 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7910 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7912
7913 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7914 // predication, updating analyses.
7915 ILV.fixVectorizedLoop(State);
7916
7918
7919 // 4. Adjust branch weight of the branch in the middle block.
     // Weights are only attached when the middle-block branch is conditional
     // and the original latch terminator already carries branch weights.
7920 auto *MiddleTerm =
7921 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7922 if (MiddleTerm->isConditional() &&
7923 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7924 // Assume that `Count % VectorTripCount` is equally distributed.
     // Note: despite its name, TripCount here holds UF * (known minimum VF).
7925 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7926 assert(TripCount > 0 && "trip count should not be zero");
7927 const uint32_t Weights[] = {1, TripCount - 1};
7928 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7929 }
7930
7931 return State.ExpandedSCEVs;
7932}
7933
7934//===--------------------------------------------------------------------===//
7935// EpilogueVectorizerMainLoop
7936//===--------------------------------------------------------------------===//
7937
7938/// This function is partially responsible for generating the control flow
7939/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Takes the already expanded SCEVs in \p ExpandedSCEVs and returns
/// LoopVectorPreHeader.
// NOTE(review): the statements that perform most of the steps described by the
// comments below are missing from this listing (gaps in the embedded
// numbering, e.g. 7940, 7942, 7946-7948, 7952, 7957, 7965-7966, 7969); only
// their explanatory comments survive.
7941 const SCEV2ValueTy &ExpandedSCEVs) {
7943
7944 // Generate the code to check the minimum iteration count of the vector
7945 // epilogue (see below).
7949
7950 // Generate the code to check any assumptions that we've made for SCEV
7951 // expressions.
7953
7954 // Generate the code that checks at runtime if arrays overlap. We put the
7955 // checks into a separate block to make the more common case of few elements
7956 // faster.
7958
7959 // Generate the iteration count check for the main loop, *after* the check
7960 // for the epilogue loop, so that the path-length is shorter for the case
7961 // that goes directly through the vector epilogue. The longer-path length for
7962 // the main loop is compensated for, by the gain from vectorizing the larger
7963 // trip count. Note: the branch will get updated later on when we vectorize
7964 // the epilogue.
7967
7968 // Generate the induction variable.
7970
7971 // Generate VPValues and ResumePhi recipes for wide inductions in the epilogue
7972 // plan only. Other inductions only need a resume value for the canonical
7973 // induction, which will get created during epilogue skeleton construction.
     // WideIVs collects the IR phi of every VPWidenInductionRecipe visited by
     // the loop below; its declaration (7974) and the iterated range (7976) are
     // missing from this listing.
7975 for (VPRecipeBase &H :
7977 if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&H))
7978 WideIVs.insert(WideIV->getPHINode());
7979 }
7980 createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs);
7981
7982 return LoopVectorPreHeader;
7983}
7984
// Debug-only trace emitted before the first-pass skeleton is built: prints the
// main-loop and epilogue-loop VF/UF stored in EPI.
// NOTE(review): the signature line (7985) is missing from this listing.
7986 LLVM_DEBUG({
7987 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7988 << "Main Loop VF:" << EPI.MainLoopVF
7989 << ", Main Loop UF:" << EPI.MainLoopUF
7990 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7991 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7992 });
7993}
7994
// Debug trace: prints the function that contains OrigLoop's header under the
// "intermediate fn:" banner.
// NOTE(review): the signature and the opening of the wrapper closed by "});"
// (7995-7996) are missing from this listing; presumably LLVM_DEBUG({ as in the
// sibling trace function -- confirm against the real source.
7997 dbgs() << "intermediate fn:\n"
7998 << *OrigLoop->getHeader()->getParent() << "\n";
7999 });
8000}
8001
// Emits a "min.iters.check" comparing the trip count against VF * UF and
// branches to \p Bypass when the check is true, otherwise to a freshly split
// "vector.ph" preheader. \p ForEpilogue selects the epilogue VF/UF from EPI
// instead of the main-loop VF/UF and additionally records the check block as a
// bypass block and saves the trip count in EPI. Returns the block holding the
// check.
// NOTE(review): the line naming the function (8003) and the two predicate
// alternatives (8018-8019) are missing from this listing.
8002BasicBlock *
8004 bool ForEpilogue) {
8005 assert(Bypass && "Expected valid bypass basic block.");
8006 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8007 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8008 Value *Count = getTripCount();
8009 // Reuse existing vector loop preheader for TC checks.
8010 // Note that new preheader block is generated for vector loop.
8011 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8012 IRBuilder<> Builder(TCCheckBlock->getTerminator());
8013
8014 // Generate code to check if the loop's trip count is less than VF * UF of the
8015 // main vector loop.
     // The compare predicate P depends on whether a scalar epilogue is
     // required for the selected VF.
8016 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
8017 : VF.isVector())
8020
8021 Value *CheckMinIters = Builder.CreateICmp(
8022 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8023 "min.iters.check");
8024
8025 if (!ForEpilogue)
8026 TCCheckBlock->setName("vector.main.loop.iter.check");
8027
8028 // Create new preheader for vector loop.
8029 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8030 DT, LI, nullptr, "vector.ph");
8031
8032 if (ForEpilogue) {
8033 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8034 DT->getNode(Bypass)->getIDom()) &&
8035 "TC check is expected to dominate Bypass");
8036
8037 LoopBypassBlocks.push_back(TCCheckBlock);
8038
8039 // Save the trip count so we don't have to regenerate it in the
8040 // vec.epilog.iter.check. This is safe to do because the trip count
8041 // generated here dominates the vector epilog iter check.
8042 EPI.TripCount = Count;
8043 }
8044
     // Replace the check block's terminator with the conditional bypass
     // branch. Line 8047, which precedes the setBranchWeights call, is missing
     // from this listing (presumably a guard -- confirm).
8045 BranchInst &BI =
8046 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8048 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
8049 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
8050
8051 introduceCheckBlockInVPlan(Plan, TCCheckBlock);
8052 return TCCheckBlock;
8053}
8054
8055//===--------------------------------------------------------------------===//
8056// EpilogueVectorizerEpilogueLoop
8057//===--------------------------------------------------------------------===//
8058
8059/// This function is partially responsible for generating the control flow
8060/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Builds the "vec.epilog." skeleton, records the new
/// "vec.epilog.iter.check" block as AdditionalBypassBlock, moves phis out of
/// that block into the vector preheader, and returns LoopVectorPreHeader.
// NOTE(review): the line naming the function (8062) and most of the statements
// that rewire the saved check blocks (e.g. 8070, 8072, 8078, 8080, 8083, 8087,
// 8090, 8093-8094, 8098, 8100-8101, 8137) are missing from this listing.
8061BasicBlock *
8063 const SCEV2ValueTy &ExpandedSCEVs) {
8064 createVectorLoopSkeleton("vec.epilog.");
8065
8066 // Now, compare the remaining count and if there aren't enough iterations to
8067 // execute the vectorized epilogue skip to the scalar part.
8068 LoopVectorPreHeader->setName("vec.epilog.ph");
8069 BasicBlock *VecEpilogueIterationCountCheck =
8071 nullptr, "vec.epilog.iter.check", true);
8073 VecEpilogueIterationCountCheck);
8074 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8075
8076 // Adjust the control flow taking the state info from the main loop
8077 // vectorization into account.
8079 "expected this to be saved from the previous pass.");
8081 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8082
8084 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8085
8086 if (EPI.SCEVSafetyCheck)
8088 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8089 if (EPI.MemSafetyCheck)
8091 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8092
8095 // Keep track of bypass blocks, as they feed start values to the induction and
8096 // reduction phis in the scalar loop preheader.
8097 if (EPI.SCEVSafetyCheck)
8099 if (EPI.MemSafetyCheck)
8102
8103 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8104 // reductions which merge control-flow from the latch block and the middle
8105 // block. Update the incoming values here and move the Phi into the preheader.
     // The phis are collected first because the loop below moves them out of
     // the block being iterated.
8106 SmallVector<PHINode *, 4> PhisInBlock;
8107 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8108 PhisInBlock.push_back(&Phi);
8109
8110 for (PHINode *Phi : PhisInBlock) {
8111 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8112 Phi->replaceIncomingBlockWith(
8113 VecEpilogueIterationCountCheck->getSinglePredecessor(),
8114 VecEpilogueIterationCountCheck);
8115
8116 // If the phi doesn't have an incoming value from the
8117 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8118 // value and also those from other check blocks. This is needed for
8119 // reduction phis only.
8120 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8121 return EPI.EpilogueIterationCountCheck == IncB;
8122 }))
8123 continue;
8124 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8125 if (EPI.SCEVSafetyCheck)
8126 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8127 if (EPI.MemSafetyCheck)
8128 Phi->removeIncomingValue(EPI.MemSafetyCheck);
8129 }
8130
8131 // Generate induction resume values. These variables save the new starting
8132 // indexes for the scalar loop. They are used to test if there are any tail
8133 // iterations left once the vector loop has completed.
8134 // Note that when the vectorized epilogue is skipped due to iteration count
8135 // check, then the resume value for the induction variable comes from
8136 // the trip count of the main vector loop, passed as the second argument.
8138
8139 return LoopVectorPreHeader;
8140}
8141
// Emits "min.epilog.iters.check" in \p Insert: compares the iterations left
// after the main vector loop (EPI.TripCount - EPI.VectorTripCount) against the
// epilogue step and branches to \p Bypass when the check is true, otherwise to
// LoopVectorPreHeader. Also installs a new entry block in the epilogue VPlan.
// Returns \p Insert.
// NOTE(review): the line naming the function (8143) and several interior lines
// (e.g. 8146, 8159-8160, 8164-8165, 8170, 8173, 8188, 8194) are missing from
// this listing.
8142BasicBlock *
8144 BasicBlock *Bypass, BasicBlock *Insert) {
8145
8147 "Expected trip count to have been saved in the first pass.");
8148 assert(
8149 (!isa<Instruction>(EPI.TripCount) ||
8150 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8151 "saved trip count does not dominate insertion point.");
8152 Value *TC = EPI.TripCount;
8153 IRBuilder<> Builder(Insert->getTerminator());
8154 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8155
8156 // Generate code to check if the loop's trip count is less than VF * UF of the
8157 // vector epilogue loop.
8158 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8161
8162 Value *CheckMinIters =
8163 Builder.CreateICmp(P, Count,
8166 "min.epilog.iters.check");
8167
8168 BranchInst &BI =
8169 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
     // The opening of the block closed at 8182 (line 8170) is missing from this
     // listing; the visible part derives branch weights from the main and
     // epilogue step sizes.
8171 unsigned MainLoopStep = UF * VF.getKnownMinValue();
8172 unsigned EpilogueLoopStep =
8174 // We assume the remaining `Count` is equally distributed in
8175 // [0, MainLoopStep)
8176 // So the probability for `Count < EpilogueLoopStep` should be
8177 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8178 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8179 const uint32_t Weights[] = {EstimatedSkipCount,
8180 MainLoopStep - EstimatedSkipCount};
8181 setBranchWeights(BI, Weights, /*IsExpected=*/false);
8182 }
8183 ReplaceInstWithInst(Insert->getTerminator(), &BI);
8184 LoopBypassBlocks.push_back(Insert);
8185
8186 // A new entry block has been created for the epilogue VPlan. Hook it in, as
8187 // otherwise we would try to modify the entry to the main vector loop.
     // The creation of NewEntry (8188) is missing from this listing. The old
     // entry is deleted only after its connections were moved to NewEntry and
     // the plan's entry was redirected.
8189 VPBasicBlock *OldEntry = Plan.getEntry();
8190 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8191 Plan.setEntry(NewEntry);
8192 delete OldEntry;
8193
8195 return Insert;
8196}
8197
// Debug-only trace emitted before the second-pass skeleton is built: prints
// the epilogue-loop VF/UF stored in EPI.
// NOTE(review): the signature line (8198) is missing from this listing.
8199 LLVM_DEBUG({
8200 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8201 << "Epilogue Loop VF:" << EPI.EpilogueVF
8202 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8203 });
8204}
8205
// Debug trace: prints the function that contains OrigLoop's header under the
// "final fn:" banner.
// NOTE(review): the signature and the opening of the wrapper closed by "});"
// (8206-8207) are missing from this listing; presumably LLVM_DEBUG({ as in the
// sibling trace function -- confirm against the real source.
8208 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8209 });
8210}
8211
// Returns a lazily mapped range over Operands in which each IR Value is
// translated through getVPValueOrAddLiveIn().
// NOTE(review): the line naming the function and its parameter (8213) is
// missing from this listing.
8212iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8214 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8215 return getVPValueOrAddLiveIn(Op);
8216 };
8217 return map_range(Operands, Fn);
8218}
8219
// Creates and caches (in EdgeMaskCache) the masks of all outgoing edges of the
// block terminated by switch SI: one mask per distinct non-default
// destination, plus one for the default destination.
// NOTE(review): the signature (8220) and the declaration of Dst2Compares
// (8233) are missing from this listing.
8221 BasicBlock *Src = SI->getParent();
8222 assert(!OrigLoop->isLoopExiting(Src) &&
8223 all_of(successors(Src),
8224 [this](BasicBlock *Succ) {
8225 return OrigLoop->getHeader() != Succ;
8226 }) &&
8227 "unsupported switch either exiting loop or continuing to header");
8228 // Create masks where the terminator in Src is a switch. We create mask for
8229 // all edges at the same time. This is more efficient, as we can create and
8230 // collect compares for all cases once.
8231 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8232 BasicBlock *DefaultDst = SI->getDefaultDest();
8234 for (auto &C : SI->cases()) {
8235 BasicBlock *Dst = C.getCaseSuccessor();
8236 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8237 // Cases whose destination is the same as default are redundant and can be
8238 // ignored - they will get there anyhow.
8239 if (Dst == DefaultDst)
8240 continue;
8241 auto &Compares = Dst2Compares[Dst];
8242 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8243 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8244 }
8245
8246 // We need to handle 2 separate cases below for all entries in Dst2Compares,
8247 // which excludes destinations matching the default destination.
8248 VPValue *SrcMask = getBlockInMask(Src);
8249 VPValue *DefaultMask = nullptr;
8250 for (const auto &[Dst, Conds] : Dst2Compares) {
8251 // 1. Dst is not the default destination. Dst is reached if any of the cases
8252 // with destination == Dst are taken. Join the conditions for each case
8253 // whose destination == Dst using an OR.
8254 VPValue *Mask = Conds[0];
8255 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8256 Mask = Builder.createOr(Mask, V);
8257 if (SrcMask)
8258 Mask = Builder.createLogicalAnd(SrcMask, Mask);
8259 EdgeMaskCache[{Src, Dst}] = Mask;
8260
8261 // 2. Create the mask for the default destination, which is reached if none
8262 // of the cases with destination != default destination are taken. Join the
8263 // conditions for each case where the destination is != Dst using an OR and
8264 // negate it.
8265 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8266 }
8267
     // If no case targets a non-default block, DefaultMask stays nullptr and
     // that nullptr is what gets cached for the default edge.
8268 if (DefaultMask) {
8269 DefaultMask = Builder.createNot(DefaultMask);
8270 if (SrcMask)
8271 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8272 }
8273 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8274}
8275
// Returns the mask for the CFG edge Src -> Dst, creating and caching it in
// EdgeMaskCache on first use. A nullptr result is a valid cached value (the
// code below treats a null block in-mask as all-one).
// NOTE(review): the signature (8276) and the statement at 8286 (presumably the
// call that populates the switch edge masks -- confirm) are missing from this
// listing.
8277 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8278
8279 // Look for cached value.
8280 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8281 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8282 if (ECEntryIt != EdgeMaskCache.end())
8283 return ECEntryIt->second;
8284
8285 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8287 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8288 return EdgeMaskCache[Edge];
8289 }
8290
8291 VPValue *SrcMask = getBlockInMask(Src);
8292
8293 // The terminator has to be a branch inst!
8294 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8295 assert(BI && "Unexpected terminator found");
     // Unconditional branches, and conditional ones whose two successors
     // coincide, simply propagate the source block's mask.
8296 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8297 return EdgeMaskCache[Edge] = SrcMask;
8298
8299 // If source is an exiting block, we know the exit edge is dynamically dead
8300 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8301 // adding uses of an otherwise potentially dead instruction unless we are
8302 // vectorizing a loop with uncountable exits. In that case, we always
8303 // materialize the mask.
8304 if (OrigLoop->isLoopExiting(Src) &&
8305 Src != Legal->getUncountableEarlyExitingBlock())
8306 return EdgeMaskCache[Edge] = SrcMask;
8307
8308 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8309 assert(EdgeMask && "No Edge Mask found for condition");
8310
     // Successor 0 is taken when the condition holds; negate for the other
     // edge.
8311 if (BI->getSuccessor(0) != Dst)
8312 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8313
8314 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8315 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8316 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8317 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8318 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8319 }
8320
8321 return EdgeMaskCache[Edge] = EdgeMask;
8322}
8323
// Lookup-only counterpart of the edge-mask creation: returns the cached mask
// for the edge Src -> Dst and asserts that it was created beforehand.
// NOTE(review): the signature (8324) is missing from this listing.
8325 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8326
8327 // Look for cached value.
8328 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8329 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8330 assert(ECEntryIt != EdgeMaskCache.end() &&
8331 "looking up mask for edge which has not been created");
8332 return ECEntryIt->second;
8333}
8334
// Computes and caches (in BlockMaskCache) the mask of the original loop's
// header: nullptr when the tail is not folded by masking, otherwise an
// "IV <= BTC" compare on a widened canonical IV inserted into the vector loop
// header.
// NOTE(review): the signature (8335) and the definition of BTC (8357) are
// missing from this listing.
8336 BasicBlock *Header = OrigLoop->getHeader();
8337
8338 // When not folding the tail, use nullptr to model all-true mask.
8339 if (!CM.foldTailByMasking()) {
8340 BlockMaskCache[Header] = nullptr;
8341 return;
8342 }
8343
8344 // Introduce the early-exit compare IV <= BTC to form header block mask.
8345 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8346 // constructing the desired canonical IV in the header block as its first
8347 // non-phi instructions.
8348
8349 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8350 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8351 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8352 HeaderVPBB->insert(IV, NewInsertionPoint);
8353
     // The guard restores Builder's previous insert point when this function
     // returns.
8354 VPBuilder::InsertPointGuard Guard(Builder);
8355 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8356 VPValue *BlockMask = nullptr;
8358 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8359 BlockMaskCache[Header] = BlockMask;
8360}
8361
// Returns the cached in-mask of BB from BlockMaskCache; asserts that an entry
// exists. The cached value itself may be nullptr.
// NOTE(review): the signature (8362) is missing from this listing.
8363 // Return the cached value.
8364 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8365 assert(BCEntryIt != BlockMaskCache.end() &&
8366 "Trying to access mask for block without one.");
8367 return BCEntryIt->second;
8368}
8369
// Computes and caches (in BlockMaskCache) the in-mask of the non-header loop
// block BB as the OR of the masks of its incoming edges; a nullptr edge mask
// (all-one) short-circuits the result to nullptr.
// NOTE(review): the signature (8370) and the range expression of the
// predecessor loop (8381) are missing from this listing.
8371 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8372 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8373 assert(OrigLoop->getHeader() != BB &&
8374 "Loop header must have cached block mask");
8375
8376 // All-one mask is modelled as no-mask following the convention for masked
8377 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8378 VPValue *BlockMask = nullptr;
8379 // This is the block mask. We OR all unique incoming edges.
8380 for (auto *Predecessor :
8382 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8383 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8384 BlockMaskCache[BB] = EdgeMask;
8385 return;
8386 }
8387
8388 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8389 BlockMask = EdgeMask;
8390 continue;
8391 }
8392
8393 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8394 }
8395
8396 BlockMaskCache[BB] = BlockMask;
8397}
8398
// Tries to build a widened load/store recipe for \p I (which must be a
// LoadInst or StoreInst). The visible code returns nullptr at 8419 (under a
// condition that is missing from this listing) and otherwise returns a new
// VPWidenLoadRecipe or VPWidenStoreRecipe, masked with the block in-mask when
// Legal reports that a mask is required. For consecutive accesses a
// vector-pointer recipe is appended to the current insert block and used as
// the address.
// NOTE(review): the return-type line (8399) and several interior lines (e.g.
// 8406, 8408, 8410, 8415, 8418, 8427, 8429, 8431, 8442, 8447) are missing from
// this listing, including the definitions of Reverse and Consecutive's
// initializer.
8400VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8401 VFRange &Range) {
8402 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8403 "Must be called with either a load or store");
8404
     // Per-VF predicate built from the cost model's widening decision for I.
8405 auto WillWiden = [&](ElementCount VF) -> bool {
8407 CM.getWideningDecision(I, VF);
8409 "CM decision should be taken at this point.");
8411 return true;
8412 if (CM.isScalarAfterVectorization(I, VF) ||
8413 CM.isProfitableToScalarize(I, VF))
8414 return false;
8416 };
8417
8419 return nullptr;
8420
8421 VPValue *Mask = nullptr;
8422 if (Legal->isMaskRequired(I))
8423 Mask = getBlockInMask(I->getParent());
8424
8425 // Determine if the pointer operand of the access is either consecutive or
8426 // reverse consecutive.
8428 CM.getWideningDecision(I, Range.Start);
8430 bool Consecutive =
8432
     // The address is operand 0 of a load and operand 1 of a store.
8433 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8434 if (Consecutive) {
8435 auto *GEP = dyn_cast<GetElementPtrInst>(
8436 Ptr->getUnderlyingValue()->stripPointerCasts());
8437 VPSingleDefRecipe *VectorPtr;
8438 if (Reverse)
8439 VectorPtr = new VPReverseVectorPointerRecipe(
8440 Ptr, &Plan.getVF(), getLoadStoreType(I),
8441 GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
8443 I->getDebugLoc());
8444 else
8445 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8446 GEP ? GEP->getNoWrapFlags()
8448 I->getDebugLoc());
8449 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8450 Ptr = VectorPtr;
8451 }
8452 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8453 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8454 I->getDebugLoc());
8455
     // Not a load, so I is a store; operand 0 is the stored value.
8456 StoreInst *Store = cast<StoreInst>(I);
8457 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8458 Reverse, I->getDebugLoc());
8459}
8460
8461/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8462/// insert a recipe to expand the step for the induction recipe.
/// \p PhiOrTrunc is either \p Phi itself or a TruncInst; in the latter case
/// the recipe is built with the truncate and the truncate's debug location.
/// \p Start must correspond to the induction's start value, i.e. the value
/// incoming to \p Phi from the preheader of \p OrigLoop (asserted below).
// NOTE(review): the first signature lines (8463-8464) and the initializer of
// Step (8473) are missing from this listing.
8465 VPValue *Start, const InductionDescriptor &IndDesc,
8466 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8467 assert(IndDesc.getStartValue() ==
8468 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8469 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8470 "step must be loop invariant");
8471
8472 VPValue *Step =
8474 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8475 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8476 IndDesc, TruncI,
8477 TruncI->getDebugLoc());
8478 }
8479 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8480 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8481 IndDesc, Phi->getDebugLoc());
8482}
8483
// Returns an induction header-phi recipe for Phi when Legal describes it as an
// integer/FP induction or a pointer induction, using Operands[0] as the start
// value; returns nullptr for any other phi.
// NOTE(review): the parameter line (8485) and parts of the pointer-induction
// return statement (8497, 8499) are missing from this listing.
8484VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8486
8487 // Check if this is an integer or fp induction. If so, build the recipe that
8488 // produces its scalar and vector values.
8489 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8490 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8491 *PSE.getSE(), *OrigLoop);
8492
8493 // Check if this is pointer induction. If so, build the recipe for it.
8494 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8495 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8496 *PSE.getSE());
8498 Phi, Operands[0], Step, *II,
8500 [&](ElementCount VF) {
8501 return CM.isScalarAfterVectorization(Phi, VF);
8502 },
8503 Range),
8504 Phi->getDebugLoc());
8505 }
8506 return nullptr;
8507}
8508
// Returns a widened induction recipe for the truncate I when the cost model
// reports it as an optimizable IV truncate for the VFs in Range; otherwise
// returns nullptr. The truncated phi is operand 0 of I.
// NOTE(review): the parameter line (8510), the opening of the condition
// (8525) and the definition of II (8529) are missing from this listing.
8509VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8511 // Optimize the special case where the source is a constant integer
8512 // induction variable. Notice that we can only optimize the 'trunc' case
8513 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8514 // (c) other casts depend on pointer size.
8515
8516 // Determine whether \p K is a truncation based on an induction variable that
8517 // can be optimized.
     // The inner lambda captures by copy ([=]) since it is returned from the
     // outer lambda wrapped in a std::function.
8518 auto IsOptimizableIVTruncate =
8519 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8520 return [=](ElementCount VF) -> bool {
8521 return CM.isOptimizableIVTruncate(K, VF);
8522 };
8523 };
8524
8526 IsOptimizableIVTruncate(I), Range)) {
8527
8528 auto *Phi = cast<PHINode>(I->getOperand(0));
8530 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8531 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8532 *OrigLoop);
8533 }
8534 return nullptr;
8535}
8536
// Builds a VPBlendRecipe for the non-header phi Phi. The recipe's operand list
// interleaves each incoming value with the mask of its incoming edge; if the
// first edge has no mask (nullptr), only that first incoming value is used.
// NOTE(review): the second signature line (8538) and the opening of the
// assertion at 8554 are missing from this listing.
8537VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8539 unsigned NumIncoming = Phi->getNumIncomingValues();
8540
8541 // We know that all PHIs in non-header blocks are converted into selects, so
8542 // we don't have to worry about the insertion order and we can just use the
8543 // builder. At this point we generate the predication tree. There may be
8544 // duplications since this is a simple recursive scan, but future
8545 // optimizations will clean it up.
8546 SmallVector<VPValue *, 2> OperandsWithMask;
8547
8548 for (unsigned In = 0; In < NumIncoming; In++) {
8549 OperandsWithMask.push_back(Operands[In]);
8550 VPValue *EdgeMask =
8551 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
     // A null edge mask is only expected for the very first incoming edge; the
     // scan stops there without pushing a mask.
8552 if (!EdgeMask) {
8553 assert(In == 0 && "Both null and non-null edge masks found");
8555 "Distinct incoming values with one having a full mask");
8556 break;
8557 }
8558 OperandsWithMask.push_back(EdgeMask);
8559 }
8560 return new VPBlendRecipe(Phi, OperandsWithMask);
8561}
8562
8563VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8565 VFRange &Range) {
8567 [this, CI](ElementCount VF) {
8568 return CM.isScalarWithPredication(CI, VF);
8569 },
8570 Range);
8571
8572 if (IsPredicated)
8573 return nullptr;
8574
8576 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8577 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8578 ID == Intrinsic::pseudoprobe ||
8579 ID == Intrinsic::experimental_noalias_scope_decl))
8580 return nullptr;
8581
8582 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8583
8584 // Is it beneficial to perform intrinsic call compared to lib call?
8585 bool ShouldUseVectorIntrinsic =
8587 [&](ElementCount VF) -> bool {
8588 return CM.getCallWideningDecision(CI, VF).Kind ==
8590 },
8591 Range);
8592 if (ShouldUseVectorIntrinsic)
8593 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8594 CI->getDebugLoc());
8595
8596 Function *Variant = nullptr;
8597 std::optional<unsigned> MaskPos;
8598 // Is better to call a vectorized version of the function than to to scalarize
8599 // the call?
8600 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8601 [&](ElementCount VF) -> bool {
8602 // The following case may be scalarized depending on the VF.
8603 // The flag shows whether we can use a usual Call for vectorized
8604 // version of the instruction.
8605
8606 // If we've found a variant at a previous VF, then stop looking. A
8607 // vectorized variant of a function expects input in a certain shape
8608 // -- basically the number of input registers, the number of lanes
8609 // per register, and whether there's a mask required.
8610 // We store a pointer to the variant in the VPWidenCallRecipe, so
8611 // once we have an appropriate variant it's only valid for that VF.
8612 // This will force a different vplan to be generated for each VF that
8613 // finds a valid variant.
8614 if (Variant)
8615 return false;
8617 CM.getCallWideningDecision(CI, VF);
8619 Variant = Decision.Variant;
8620 MaskPos = Decision.MaskPos;
8621 return true;
8622 }
8623
8624 return false;
8625 },
8626 Range);
8627 if (ShouldUseVectorCall) {
8628 if (MaskPos.has_value()) {
8629 // We have 2 cases that would require a mask:
8630 // 1) The block needs to be predicated, either due to a conditional
8631 // in the scalar loop or use of an active lane mask with
8632 // tail-folding, and we use the appropriate mask for the block.
8633 // 2) No mask is required for the block, but the only available
8634 // vector variant at this VF requires a mask, so we synthesize an
8635 // all-true mask.
8636 VPValue *Mask = nullptr;
8637 if (Legal->isMaskRequired(CI))
8638 Mask = getBlockInMask(CI->getParent());
8639 else
8640 Mask = Plan.getOrAddLiveIn(
8642
8643 Ops.insert(Ops.begin() + *MaskPos, Mask);
8644 }
8645
8646 Ops.push_back(Operands.back());
8647 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8648 }
8649
8650 return nullptr;
8651}
8652
8653bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8654 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8655 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8656 // Instruction should be widened, unless it is scalar after vectorization,
8657 // scalarization is profitable or it is predicated.
8658 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8659 return CM.isScalarAfterVectorization(I, VF) ||
8660 CM.isProfitableToScalarize(I, VF) ||
8661 CM.isScalarWithPredication(I, VF);
8662 };
8664 Range);
8665}
8666
8667VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8669 VPBasicBlock *VPBB) {
8670 switch (I->getOpcode()) {
8671 default:
8672 return nullptr;
8673 case Instruction::SDiv:
8674 case Instruction::UDiv:
8675 case Instruction::SRem:
8676 case Instruction::URem: {
8677 // If not provably safe, use a select to form a safe divisor before widening the
8678 // div/rem operation itself. Otherwise fall through to general handling below.
8679 if (CM.isPredicatedInst(I)) {
8681 VPValue *Mask = getBlockInMask(I->getParent());
8682 VPValue *One =
8683 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8684 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8685 Ops[1] = SafeRHS;
8686 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8687 }
8688 [[fallthrough]];
8689 }
8690 case Instruction::Add:
8691 case Instruction::And:
8692 case Instruction::AShr:
8693 case Instruction::FAdd:
8694 case Instruction::FCmp:
8695 case Instruction::FDiv:
8696 case Instruction::FMul:
8697 case Instruction::FNeg:
8698 case Instruction::FRem:
8699 case Instruction::FSub:
8700 case Instruction::ICmp:
8701 case Instruction::LShr:
8702 case Instruction::Mul:
8703 case Instruction::Or:
8704 case Instruction::Select:
8705 case Instruction::Shl:
8706 case Instruction::Sub:
8707 case Instruction::Xor:
8708 case Instruction::Freeze:
8710 if (Instruction::isBinaryOp(I->getOpcode())) {
8711 // The legacy cost model uses SCEV to check if some of the operands are
8712 // constants. To match the legacy cost model's behavior, use SCEV to try
8713 // to replace operands with constants.
8714 ScalarEvolution &SE = *PSE.getSE();
8715 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8716 Value *V = Op->getUnderlyingValue();
8717 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8718 return Op;
8719 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8720 if (!C)
8721 return Op;
8722 return Plan.getOrAddLiveIn(C->getValue());
8723 };
8724 // For Mul, the legacy cost model checks both operands.
8725 if (I->getOpcode() == Instruction::Mul)
8726 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8727 // For other binops, the legacy cost model only checks the second operand.
8728 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8729 }
8730 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8731 };
8732}
8733
8735VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8737 // FIXME: Support other operations.
8738 unsigned Opcode = HI->Update->getOpcode();
8739 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8740 "Histogram update operation must be an Add or Sub");
8741
8743 // Bucket address.
8744 HGramOps.push_back(Operands[1]);
8745 // Increment value.
8746 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8747
8748 // In case of predicated execution (due to tail-folding, or conditional
8749 // execution, or both), pass the relevant mask.
8750 if (Legal->isMaskRequired(HI->Store))
8751 HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8752
8753 return new VPHistogramRecipe(Opcode,
8754 make_range(HGramOps.begin(), HGramOps.end()),
8755 HI->Store->getDebugLoc());
8756}
8757
8759 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8760 for (VPHeaderPHIRecipe *R : PhisToFix) {
8761 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8762 VPRecipeBase *IncR =
8763 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8764 R->addOperand(IncR->getVPSingleValue());
8765 }
8766}
8767
8769 VFRange &Range) {
8771 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8772 Range);
8773
8774 bool IsPredicated = CM.isPredicatedInst(I);
8775
8776 // Even if the instruction is not marked as uniform, there are certain
8777 // intrinsic calls that can be effectively treated as such, so we check for
8778 // them here. Conservatively, we only do this for scalable vectors, since
8779 // for fixed-width VFs we can always fall back on full scalarization.
8780 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8781 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8782 case Intrinsic::assume:
8783 case Intrinsic::lifetime_start:
8784 case Intrinsic::lifetime_end:
8785 // For scalable vectors if one of the operands is variant then we still
8786 // want to mark as uniform, which will generate one instruction for just
8787 // the first lane of the vector. We can't scalarize the call in the same
8788 // way as for fixed-width vectors because we don't know how many lanes
8789 // there are.
8790 //
8791 // The reasons for doing it this way for scalable vectors are:
8792 // 1. For the assume intrinsic generating the instruction for the first
8793 // lane is still be better than not generating any at all. For
8794 // example, the input may be a splat across all lanes.
8795 // 2. For the lifetime start/end intrinsics the pointer operand only
8796 // does anything useful when the input comes from a stack object,
8797 // which suggests it should always be uniform. For non-stack objects
8798 // the effect is to poison the object, which still allows us to
8799 // remove the call.
8800 IsUniform = true;
8801 break;
8802 default:
8803 break;
8804 }
8805 }
8806 VPValue *BlockInMask = nullptr;
8807 if (!IsPredicated) {
8808 // Finalize the recipe for Instr, first if it is not predicated.
8809 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8810 } else {
8811 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8812 // Instructions marked for predication are replicated and a mask operand is
8813 // added initially. Masked replicate recipes will later be placed under an
8814 // if-then construct to prevent side-effects. Generate recipes to compute
8815 // the block mask for this region.
8816 BlockInMask = getBlockInMask(I->getParent());
8817 }
8818
8819 // Note that there is some custom logic to mark some intrinsics as uniform
8820 // manually above for scalable vectors, which this assert needs to account for
8821 // as well.
8822 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8823 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8824 "Should not predicate a uniform recipe");
8825 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8826 IsUniform, BlockInMask);
8827 return Recipe;
8828}
8829
8833 VFRange &Range, VPBasicBlock *VPBB) {
8834 // First, check for specific widening recipes that deal with inductions, Phi
8835 // nodes, calls and memory operations.
8836 VPRecipeBase *Recipe;
8837 if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8838 if (Phi->getParent() != OrigLoop->getHeader())
8839 return tryToBlend(Phi, Operands);
8840
8841 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8842 return Recipe;
8843
8844 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8845 assert((Legal->isReductionVariable(Phi) ||
8846 Legal->isFixedOrderRecurrence(Phi)) &&
8847 "can only widen reductions and fixed-order recurrences here");
8848 VPValue *StartV = Operands[0];
8849 if (Legal->isReductionVariable(Phi)) {
8850 const RecurrenceDescriptor &RdxDesc =
8851 Legal->getReductionVars().find(Phi)->second;
8852 assert(RdxDesc.getRecurrenceStartValue() ==
8853 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8854 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8855 CM.isInLoopReduction(Phi),
8856 CM.useOrderedReductions(RdxDesc));
8857 } else {
8858 // TODO: Currently fixed-order recurrences are modeled as chains of
8859 // first-order recurrences. If there are no users of the intermediate
8860 // recurrences in the chain, the fixed order recurrence should be modeled
8861 // directly, enabling more efficient codegen.
8862 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8863 }
8864
8865 PhisToFix.push_back(PhiRecipe);
8866 return PhiRecipe;
8867 }
8868
8869 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8870 cast<TruncInst>(Instr), Operands, Range)))
8871 return Recipe;
8872
8873 // All widen recipes below deal only with VF > 1.
8875 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8876 return nullptr;
8877
8878 if (auto *CI = dyn_cast<CallInst>(Instr))
8879 return tryToWidenCall(CI, Operands, Range);
8880
8881 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8882 if (auto HistInfo = Legal->getHistogramInfo(SI))
8883 return tryToWidenHistogram(*HistInfo, Operands);
8884
8885 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8886 return tryToWidenMemory(Instr, Operands, Range);
8887
8888 if (!shouldWiden(Instr, Range))
8889 return nullptr;
8890
8891 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8892 return new VPWidenGEPRecipe(GEP,
8893 make_range(Operands.begin(), Operands.end()));
8894
8895 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8896 return new VPWidenSelectRecipe(
8897 *SI, make_range(Operands.begin(), Operands.end()));
8898 }
8899
8900 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8901 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8902 *CI);
8903 }
8904
8905 return tryToWiden(Instr, Operands, VPBB);
8906}
8907
8908void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8909 ElementCount MaxVF) {
8910 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8911
8912 auto MaxVFTimes2 = MaxVF * 2;
8913 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8914 VFRange SubRange = {VF, MaxVFTimes2};
8915 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8916 // Now optimize the initial VPlan.
8917 if (!Plan->hasVF(ElementCount::getFixed(1)))
8919 CM.getMinimalBitwidths());
8921 // TODO: try to put it close to addActiveLaneMask().
8922 // Discard the plan if it is not EVL-compatible
8924 *Plan, CM.getMaxSafeElements()))
8925 break;
8926 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8927 VPlans.push_back(std::move(Plan));
8928 }
8929 VF = SubRange.End;
8930 }
8931}
8932
8933// Add the necessary canonical IV and branch recipes required to control the
8934// loop.
8935static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8936 DebugLoc DL) {
8937 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8938 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8939
8940 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8941 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8942 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8943 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8944 Header->insert(CanonicalIVPHI, Header->begin());
8945
8946 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8947 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8948 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8949 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8950 "index.next");
8951 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8952
8953 // Add the BranchOnCount VPInstruction to the latch.
8955 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8956}
8957
8958/// Create resume phis in the scalar preheader for first-order recurrences and
8959/// reductions and update the VPIRInstructions wrapping the original phis in the
8960/// scalar header.
8961static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8962 auto *ScalarPH = Plan.getScalarPreheader();
8963 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
8964 VPBuilder ScalarPHBuilder(ScalarPH);
8965 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8966 VPValue *OneVPV = Plan.getOrAddLiveIn(
8967 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8968 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
8969 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
8970 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
8971 if (!ScalarPhiI)
8972 break;
8973 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
8974 if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
8975 continue;
8976 // The backedge value provides the value to resume coming out of a loop,
8977 // which for FORs is a vector whose last element needs to be extracted. The
8978 // start value provides the value if the loop is bypassed.
8979 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
8980 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
8981 if (IsFOR)
8982 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
8983 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
8984 "vector.recur.extract");
8985 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
8986 auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
8988 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
8989 ScalarPhiIRI->addOperand(ResumePhiR);
8990 }
8991}
8992
8993// Collect VPIRInstructions for phis in the exit blocks that are modeled
8994// in VPlan and add the exiting VPValue as operand. Some exiting values are not
8995// modeled explicitly yet and won't be included. Those are un-truncated
8996// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
8997// increments.
8999 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
9001 auto *MiddleVPBB = Plan.getMiddleBlock();
9002 SetVector<VPIRInstruction *> ExitUsersToFix;
9003 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9004 for (VPRecipeBase &R : *ExitVPBB) {
9005 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9006 if (!ExitIRI)
9007 continue;
9008 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9009 if (!ExitPhi)
9010 break;
9011 for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9012 BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9013 if (PredVPBB != MiddleVPBB) {
9014 SmallVector<BasicBlock *> ExitingBlocks;
9015 OrigLoop->getExitingBlocks(ExitingBlocks);
9016 assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9017 ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9018 : ExitingBlocks[0];
9019 }
9020 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9021 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9022 // Exit values for inductions are computed and updated outside of VPlan
9023 // and independent of induction recipes.
9024 // TODO: Compute induction exit values in VPlan.
9025 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
9026 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
9027 isa<VPWidenPointerInductionRecipe>(V) ||
9028 (isa<Instruction>(IncomingValue) &&
9029 OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
9030 any_of(IncomingValue->users(), [&Inductions](User *U) {
9031 auto *P = dyn_cast<PHINode>(U);
9032 return P && Inductions.contains(P);
9033 }))) {
9034 if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9035 continue;
9036 }
9037 ExitUsersToFix.insert(ExitIRI);
9038 ExitIRI->addOperand(V);
9039 }
9040 }
9041 }
9042 return ExitUsersToFix;
9043}
9044
9045// Add exit values to \p Plan. Extracts are added for each entry in \p
9046// ExitUsersToFix if needed and their operands are updated. Returns true if all
9047// exit users can be handled, otherwise return false.
9048static bool
9050 const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9051 if (ExitUsersToFix.empty())
9052 return true;
9053
9054 auto *MiddleVPBB = Plan.getMiddleBlock();
9055 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9056
9057 // Introduce extract for exiting values and update the VPIRInstructions
9058 // modeling the corresponding LCSSA phis.
9059 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9060 for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9061 // Pass live-in values used by exit phis directly through to their users
9062 // in the exit block.
9063 if (Op->isLiveIn())
9064 continue;
9065
9066 // Currently only live-ins can be used by exit values from blocks not
9067 // exiting via the vector latch through to the middle block.
9068 if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9069 return false;
9070
9071 LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9072 VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9073 {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9074 IntegerType::get(Ctx, 32), 1))});
9075 ExitIRI->setOperand(Idx, Ext);
9076 }
9077 }
9078 return true;
9079}
9080
9081/// Handle users in the exit block for first order reductions in the original
9082/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9083/// users in the original exit block using the VPIRInstruction wrapping to the
9084/// LCSSA phi.
9086 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9087 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9088 auto *ScalarPHVPBB = Plan.getScalarPreheader();
9089 auto *MiddleVPBB = Plan.getMiddleBlock();
9090 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9091 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9092 VPValue *TwoVPV = Plan.getOrAddLiveIn(
9093 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9094
9095 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9096 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9097 if (!FOR)
9098 continue;
9099
9100 // This is the second phase of vectorizing first-order recurrences, creating
9101 // extract for users outside the loop. An overview of the transformation is
9102 // described below. Suppose we have the following loop with some use after
9103 // the loop of the last a[i-1],
9104 //
9105 // for (int i = 0; i < n; ++i) {
9106 // t = a[i - 1];
9107 // b[i] = a[i] - t;
9108 // }
9109 // use t;
9110 //
9111 // There is a first-order recurrence on "a". For this loop, the shorthand
9112 // scalar IR looks like:
9113 //
9114 // scalar.ph:
9115 // s.init = a[-1]
9116 // br scalar.body
9117 //
9118 // scalar.body:
9119 // i = phi [0, scalar.ph], [i+1, scalar.body]
9120 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9121 // s2 = a[i]
9122 // b[i] = s2 - s1
9123 // br cond, scalar.body, exit.block
9124 //
9125 // exit.block:
9126 // use = lcssa.phi [s1, scalar.body]
9127 //
9128 // In this example, s1 is a recurrence because it's value depends on the
9129 // previous iteration. In the first phase of vectorization, we created a
9130 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9131 // for users in the scalar preheader and exit block.
9132 //
9133 // vector.ph:
9134 // v_init = vector(..., ..., ..., a[-1])
9135 // br vector.body
9136 //
9137 // vector.body
9138 // i = phi [0, vector.ph], [i+4, vector.body]
9139 // v1 = phi [v_init, vector.ph], [v2, vector.body]
9140 // v2 = a[i, i+1, i+2, i+3]
9141 // b[i] = v2 - v1
9142 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9143 // b[i, i+1, i+2, i+3] = v2 - v1
9144 // br cond, vector.body, middle.block
9145 //
9146 // middle.block:
9147 // vector.recur.extract.for.phi = v2(2)
9148 // vector.recur.extract = v2(3)
9149 // br cond, scalar.ph, exit.block
9150 //
9151 // scalar.ph:
9152 // scalar.recur.init = phi [vector.recur.extract, middle.block],
9153 // [s.init, otherwise]
9154 // br scalar.body
9155 //
9156 // scalar.body:
9157 // i = phi [0, scalar.ph], [i+1, scalar.body]
9158 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9159 // s2 = a[i]
9160 // b[i] = s2 - s1
9161 // br cond, scalar.body, exit.block
9162 //
9163 // exit.block:
9164 // lo = lcssa.phi [s1, scalar.body],
9165 // [vector.recur.extract.for.phi, middle.block]
9166 //
9167 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9168 // Extract the penultimate value of the recurrence and use it as operand for
9169 // the VPIRInstruction modeling the phi.
9170 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9171 if (ExitIRI->getOperand(0) != FOR)
9172 continue;
9173 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9174 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9175 "vector.recur.extract.for.phi");
9176 ExitIRI->setOperand(0, PenultimateElement);
9177 ExitUsersToFix.remove(ExitIRI);
9178 }
9179 }
9180}
9181
// Builds the initial VPlan for the VFs in Range: adds the canonical IV
// recipes, creates one recipe per instruction of the original loop (blocks
// visited in reverse post-order), fixes up header phis, adds scalar resume
// phis and exit-block users, and then applies the reduction, interleave-group,
// versioned-stride and (when requested by the tail-folding style) active-lane-
// mask adjustments. Returns nullptr when addUsersInExitBlocks() reports that
// some exit value cannot be handled, and at the second `return nullptr` below
// (presumably when the fixed-order-recurrence adjustment fails; its condition
// is not visible here).
//
// NOTE(review): this listing is incomplete. The numeric prefixes skip source
// lines 9182, 9185, 9199, 9204, 9216, 9236, 9240, 9289, 9338, 9352-9353, 9357,
// 9361, 9378, 9421, 9427 and 9435; the statements on those lines (including
// the return type and the declarations of InterleaveGroups, Plan, DL, Operands
// and ExitUsersToFix) are not shown and must be restored from the upstream
// file before this block can compile.
9183LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9184
9186
9187 // ---------------------------------------------------------------------------
9188 // Build initial VPlan: Scan the body of the loop in a topological order to
9189 // visit each basic block after having visited its predecessor basic blocks.
9190 // ---------------------------------------------------------------------------
9191
9192 // Create initial VPlan skeleton, having a basic block for the pre-header
9193 // which contains SCEV expansions that need to happen before the CFG is
9194 // modified; a basic block for the vector pre-header, followed by a region for
9195 // the vector loop, followed by the middle basic block. The skeleton vector
9196 // loop region contains a header and latch basic blocks.
9197
9198 bool RequiresScalarEpilogueCheck =
9200 [this](ElementCount VF) {
9201 return !CM.requiresScalarEpilogue(VF.isVector());
9202 },
9203 Range);
9205 PSE, RequiresScalarEpilogueCheck,
9206 CM.foldTailByMasking(), OrigLoop);
9207
9208 // Don't use getDecisionAndClampRange here, because we don't know the UF
9209 // so this function is better to be conservative, rather than to split
9210 // it up into different VPlans.
9211 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9212 bool IVUpdateMayOverflow = false;
9213 for (ElementCount VF : Range)
9214 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9215
9217 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9218 // Use NUW for the induction increment if we proved that it won't overflow in
9219 // the vector loop or when not folding the tail. In the latter case, we know
9220 // that the canonical induction increment will not overflow as the vector trip
9221 // count is >= increment and a multiple of the increment.
9222 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9223 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9224
9225 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
9226
9227 // ---------------------------------------------------------------------------
9228 // Pre-construction: record ingredients whose recipes we'll need to further
9229 // process after constructing the initial VPlan.
9230 // ---------------------------------------------------------------------------
9231
9232 // For each interleave group which is relevant for this (possibly trimmed)
9233 // Range, add it to the set of groups to be later applied to the VPlan and add
9234 // placeholders for its members' Recipes which we'll be replacing with a
9235 // single VPInterleaveRecipe.
// NOTE(review): the loop header introducing IG (source line 9236) is missing
// from this listing.
9237 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9238 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9239 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9241 // For scalable vectors, the only interleave factor currently supported
9242 // must be power of 2 since we require the (de)interleave2 intrinsics
9243 // instead of shufflevectors.
9244 assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
9245 "Unsupported interleave factor for scalable vectors");
9246 return Result;
9247 };
9248 if (!getDecisionAndClampRange(ApplyIG, Range))
9249 continue;
9250 InterleaveGroups.insert(IG);
9251 }
9252
9253 // ---------------------------------------------------------------------------
9254 // Construct recipes for the instructions in the loop
9255 // ---------------------------------------------------------------------------
9256
9257 // Scan the body of the loop in a topological order to visit each basic block
9258 // after having visited its predecessor basic blocks.
9259 LoopBlocksDFS DFS(OrigLoop);
9260 DFS.perform(LI);
9261
9262 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9263 VPBasicBlock *VPBB = HeaderVPBB;
9264 BasicBlock *HeaderBB = OrigLoop->getHeader();
// Block-in masks are only created when the tail is folded by masking, or when
// some block needs predication or a non-header block contains phis (blends).
9265 bool NeedsMasks =
9266 CM.foldTailByMasking() ||
9267 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9268 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9269 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9270 });
9271 auto *MiddleVPBB = Plan->getMiddleBlock();
9272 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9273 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9274 // Relevant instructions from basic block BB will be grouped into VPRecipe
9275 // ingredients and fill a new VPBasicBlock.
9276 if (VPBB != HeaderVPBB)
9277 VPBB->setName(BB->getName());
9278 Builder.setInsertPoint(VPBB);
9279
9280 if (VPBB == HeaderVPBB)
9281 RecipeBuilder.createHeaderMask();
9282 else if (NeedsMasks)
9283 RecipeBuilder.createBlockInMask(BB);
9284
9285 // Introduce each ingredient into VPlan.
9286 // TODO: Model and preserve debug intrinsics in VPlan.
9287 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9288 Instruction *Instr = &I;
// For header phis only the value incoming from the preheader is passed as
// operand here; other instructions get all of their operands mapped to
// VPValues.
9290 auto *Phi = dyn_cast<PHINode>(Instr);
9291 if (Phi && Phi->getParent() == HeaderBB) {
9292 Operands.push_back(Plan->getOrAddLiveIn(
9293 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9294 } else {
9295 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9296 Operands = {OpRange.begin(), OpRange.end()};
9297 }
9298
9299 // The stores with invariant address inside the loop will be deleted, and
9300 // in the exit block, a uniform store recipe will be created for the final
9301 // invariant store of the reduction.
9302 StoreInst *SI;
9303 if ((SI = dyn_cast<StoreInst>(&I)) &&
9304 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9305 // Only create recipe for the final invariant store of the reduction.
9306 if (!Legal->isInvariantStoreOfReduction(SI))
9307 continue;
9308 auto *Recipe = new VPReplicateRecipe(
9309 SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9310 true /* IsUniform */);
9311 Recipe->insertBefore(*MiddleVPBB, MBIP);
9312 continue;
9313 }
9314
// Fall back to replication when no widened recipe could be created.
9315 VPRecipeBase *Recipe =
9316 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9317 if (!Recipe)
9318 Recipe = RecipeBuilder.handleReplication(Instr, Range);
9319
9320 RecipeBuilder.setRecipe(Instr, Recipe);
9321 if (isa<VPHeaderPHIRecipe>(Recipe)) {
9322 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9323 // the following cases, VPHeaderPHIRecipes may be created after non-phi
9324 // recipes and need to be moved to the phi section of HeaderVPBB:
9325 // * tail-folding (non-phi recipes computing the header mask are
9326 // introduced earlier than regular header phi recipes, and should appear
9327 // after them)
9328 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9329
9330 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9331 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9332 "unexpected recipe needs moving");
9333 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9334 } else
9335 VPBB->appendRecipe(Recipe);
9336 }
9337
// NOTE(review): source line 9338 is missing here; presumably it appends a new
// successor block after VPBB, which the next statement then advances to --
// confirm against upstream.
9339 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9340 }
9341
9342 // After here, VPBB should not be used.
9343 VPBB = nullptr;
9344
9345 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9346 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9347 "entry block must be set to a VPRegionBlock having a non-empty entry "
9348 "VPBasicBlock");
9349 RecipeBuilder.fixHeaderPhis();
9350
9351 if (auto *UncountableExitingBlock =
9354 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9355 }
9356 addScalarResumePhis(RecipeBuilder, *Plan);
9358 OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9359 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9360 if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9362 "Some exit values in loop with uncountable exit not supported yet",
9363 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9364 return nullptr;
9365 }
9366
9367 // ---------------------------------------------------------------------------
9368 // Transform initial VPlan: Apply previously taken decisions, in order, to
9369 // bring the VPlan to its final state.
9370 // ---------------------------------------------------------------------------
9371
9372 // Adjust the recipes for any inloop reductions.
9373 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9374
9375 // Interleave memory: for each Interleave Group we marked earlier as relevant
9376 // for this VPlan, replace the Recipes widening its memory instructions with a
9377 // single VPInterleaveRecipe at its insertion point.
9379 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9380
9381 for (ElementCount VF : Range)
9382 Plan->addVF(VF);
9383 Plan->setName("Initial VPlan");
9384
9385 // Replace VPValues for known constant strides guaranteed by predicate scalar
9386 // evolution.
// Only uses by recipes nested in a region, or located in the block that is
// the single predecessor of the vector loop region, are rewritten.
9387 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9388 auto *R = cast<VPRecipeBase>(&U);
9389 return R->getParent()->getParent() ||
9390 R->getParent() ==
9391 Plan->getVectorLoopRegion()->getSinglePredecessor();
9392 };
9393 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9394 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9395 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9396 // Only handle constant strides for now.
9397 if (!ScevStride)
9398 continue;
9399
9400 auto *CI = Plan->getOrAddLiveIn(
9401 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9402 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9403 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9404
9405 // The versioned value may not be used in the loop directly but through a
9406 // sext/zext. Add new live-ins in those cases.
9407 for (Value *U : StrideV->users()) {
9408 if (!isa<SExtInst, ZExtInst>(U))
9409 continue;
9410 VPValue *StrideVPV = Plan->getLiveIn(U);
9411 if (!StrideVPV)
9412 continue;
9413 unsigned BW = U->getType()->getScalarSizeInBits();
9414 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9415 : ScevStride->getAPInt().zext(BW);
9416 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9417 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9418 }
9419 }
9420
9422 return Legal->blockNeedsPredication(BB);
9423 });
9424
9425 // Sink users of fixed-order recurrence past the recipe defining the previous
9426 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9428 return nullptr;
9429
9430 if (useActiveLaneMask(Style)) {
9431 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9432 // TailFoldingStyle is visible there.
9433 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9434 bool WithoutRuntimeCheck =
9436 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9437 WithoutRuntimeCheck);
9438 }
9439 return Plan;
9440}
9441
/// Build the initial VPlan for an outer loop on the VPlan-native path.
///
/// Steps visible in this function: create an empty plan, build the
/// hierarchical CFG from OrigLoop, register every VF in \p Range with the
/// plan, erase the terminator of the top-most region's exiting block, and add
/// the canonical IV recipes.
///
/// \param Range the vectorization factors the plan is registered for.
/// \returns the newly built plan.
9442VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9443 // Outer loop handling: They may require CFG and instruction level
9444 // transformations before even evaluating whether vectorization is profitable.
9445 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9446 // the vectorization pipeline.
9447 assert(!OrigLoop->isInnermost());
9448 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9449
9450 // Create new empty VPlan
// NOTE(review): the two boolean arguments are positional; their meaning is
// defined by VPlan::createInitialVPlan, which is not visible here.
9451 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9452 true, false, OrigLoop);
9453
9454 // Build hierarchical CFG
9455 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9456 HCFGBuilder.buildHierarchicalCFG();
9457
9458 for (ElementCount VF : Range)
9459 Plan->addVF(VF);
9460
// NOTE(review): listing line 9461 is missing from this extract; it names the
// callee that receives the arguments on lines 9462-9464.
9462 Plan,
9463 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9464 *PSE.getSE(), *TLI);
9465
9466 // Remove the existing terminator of the exiting block of the top-most region.
9467 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9468 auto *Term =
9469 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9470 Term->eraseFromParent();
9471
9472 // Tail folding is not supported for outer loops, so the induction increment
9473 // is guaranteed to not wrap.
9474 bool HasNUW = true;
9475 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9476 DebugLoc());
9477 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9478 return Plan;
9479}
9480
9481// Adjust the recipes for reductions. For in-loop reductions the chain of
9482// instructions leading from the loop exit instr to the phi need to be converted
9483// to reductions, with one operand being vector and the other being the scalar
9484// reduction chain. For other reductions, a select is introduced between the phi
9485// and users outside the vector region when folding the tail.
9486//
9487// A ComputeReductionResult recipe is added to the middle block, also for
9488// in-loop reductions which compute their result in-loop, because generating
9489// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9490//
9491// Adjust AnyOf reductions; replace the reduction phi for the selected value
9492// with a boolean reduction phi node to check if the condition is true in any
9493// iteration. The final value is selected by the final ComputeReductionResult.
//
// The function makes two passes over the header phis: the first rewrites the
// link chain of each in-loop reduction into VPReductionRecipes, the second
// handles the exit value of every reduction phi. With a scalar \p MinVF only
// ordered in-loop reductions are rewritten by the first pass.
//
// NOTE(review): several listing lines inside this function are missing from
// this extract; each gap is flagged where it occurs.
9494void LoopVectorizationPlanner::adjustRecipesForReductions(
9495 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9496 using namespace VPlanPatternMatch;
9497 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9498 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9499 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9500 for (VPRecipeBase &R : Header->phis()) {
9501 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9502 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9503 continue;
9504
9505 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9506 RecurKind Kind = RdxDesc.getRecurrenceKind();
// NOTE(review): listing lines 9508-9509 (the asserted condition) are missing
// from this extract.
9507 assert(
9510 "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9511
9512 // Collect the chain of "link" recipes for the reduction starting at PhiR.
// NOTE(review): listing line 9513 (the declaration of Worklist) is missing
// from this extract. The loop below indexes Worklist while inserting into it,
// so the container grows during the traversal.
9514 Worklist.insert(PhiR);
9515 for (unsigned I = 0; I != Worklist.size(); ++I) {
9516 VPSingleDefRecipe *Cur = Worklist[I];
9517 for (VPUser *U : Cur->users()) {
9518 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
// Users outside any loop region are not part of the chain; they are only
// checked to live in the middle block or the scalar preheader.
9519 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9520 assert((UserRecipe->getParent() == MiddleVPBB ||
9521 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9522 "U must be either in the loop region, the middle block or the "
9523 "scalar preheader.");
9524 continue;
9525 }
9526 Worklist.insert(UserRecipe);
9527 }
9528 }
9529
9530 // Visit operation "Links" along the reduction chain top-down starting from
9531 // the phi until LoopExitValue. We keep track of the previous item
9532 // (PreviousLink) to tell which of the two operands of a Link will remain
9533 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9534 // the select instructions. Blend recipes of in-loop reduction phi's will
9535 // get folded to their non-phi operand, as the reduction recipe handles the
9536 // condition directly.
9537 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9538 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9539 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9540
9541 // Index of the first operand which holds a non-mask vector operand.
9542 unsigned IndexOfFirstOperand;
9543 // Recognize a call to the llvm.fmuladd intrinsic.
9544 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9545 VPValue *VecOp;
9546 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9547 if (IsFMulAdd) {
// NOTE(review): listing line 9549 (the asserted condition) is missing from
// this extract.
9548 assert(
9550 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9551 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9552 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9553 CurrentLink->getOperand(2) == PreviousLink &&
9554 "expected a call where the previous link is the added operand");
9555
9556 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9557 // need to create an fmul recipe (multiplying the first two operands of
9558 // the fmuladd together) to use as the vector operand for the fadd
9559 // reduction.
9560 VPInstruction *FMulRecipe = new VPInstruction(
9561 Instruction::FMul,
9562 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9563 CurrentLinkI->getFastMathFlags());
9564 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9565 VecOp = FMulRecipe;
9566 } else {
9567 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
// PhiR->isInLoop() always holds here: phis that are not in-loop were skipped
// by the `continue` at the top of the enclosing loop.
9568 if (PhiR->isInLoop() && Blend) {
9569 assert(Blend->getNumIncomingValues() == 2 &&
9570 "Blend must have 2 incoming values");
9571 if (Blend->getIncomingValue(0) == PhiR)
9572 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9573 else {
9574 assert(Blend->getIncomingValue(1) == PhiR &&
9575 "PhiR must be an operand of the blend");
9576 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9577 }
9578 continue;
9579 }
9580
// NOTE(review): listing line 9581 is missing from this extract; it opens the
// branch whose `else` is on line 9590.
9582 if (isa<VPWidenRecipe>(CurrentLink)) {
9583 assert(isa<CmpInst>(CurrentLinkI) &&
9584 "need to have the compare of the select");
9585 continue;
9586 }
9587 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9588 "must be a select recipe");
9589 IndexOfFirstOperand = 1;
9590 } else {
9591 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9592 "Expected to replace a VPWidenSC");
9593 IndexOfFirstOperand = 0;
9594 }
9595 // Note that for non-commutable operands (cmp-selects), the semantics of
9596 // the cmp-select are captured in the recurrence kind.
// The vector operand is whichever of the two candidate operands is not
// PreviousLink.
9597 unsigned VecOpId =
9598 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9599 ? IndexOfFirstOperand + 1
9600 : IndexOfFirstOperand;
9601 VecOp = CurrentLink->getOperand(VecOpId);
9602 assert(VecOp != PreviousLink &&
9603 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9604 (VecOpId - IndexOfFirstOperand)) ==
9605 PreviousLink &&
9606 "PreviousLink must be the operand other than VecOp");
9607 }
9608
9609 BasicBlock *BB = CurrentLinkI->getParent();
9610 VPValue *CondOp = nullptr;
// NOTE(review): listing line 9611 is missing from this extract; presumably it
// is the condition guarding the assignment below, so CondOp can stay null.
9612 CondOp = RecipeBuilder.getBlockInMask(BB);
9613
9614 auto *RedRecipe = new VPReductionRecipe(
9615 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9616 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9617 // Append the recipe to the end of the VPBasicBlock because we need to
9618 // ensure that it comes after all of its inputs, including CondOp.
9619 // Note that this transformation may leave over dead recipes (including
9620 // CurrentLink), which will be cleaned by a later VPlan transform.
9621 LinkVPBB->appendRecipe(RedRecipe);
9622 CurrentLink->replaceAllUsesWith(RedRecipe);
9623 PreviousLink = RedRecipe;
9624 }
9625 }
// Second pass: handle the exit value of every reduction phi, in-loop or not.
// IP is captured once, so each ComputeReductionResult is inserted before the
// same recipe of the middle block.
9626 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9627 Builder.setInsertPoint(&*LatchVPBB->begin());
9628 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9629 for (VPRecipeBase &R :
9630 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9631 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9632 if (!PhiR)
9633 continue;
9634
9635 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9636 // If tail is folded by masking, introduce selects between the phi
9637 // and the users outside the vector region of each reduction, at the
9638 // beginning of the dedicated latch block.
// Both values start out as the backedge value; NewExitingVPV is replaced
// below when a select or a trunc/ext pair is introduced.
9639 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9640 auto *NewExitingVPV = PhiR->getBackedgeValue();
9641 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9642 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9643 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9644 "reduction recipe must be defined before latch");
9645 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9646 std::optional<FastMathFlags> FMFs =
9647 PhiTy->isFloatingPointTy()
9648 ? std::make_optional(RdxDesc.getFastMathFlags())
9649 : std::nullopt;
9650 NewExitingVPV =
9651 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
// NOTE(review): listing lines 9655 and 9657 are missing from this extract:
// the opcode compared against in the lambda, and the start of the condition
// that guards the setOperand call on line 9659.
9652 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9653 return isa<VPInstruction>(&U) &&
9654 cast<VPInstruction>(&U)->getOpcode() ==
9656 });
9658 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9659 PhiR->setOperand(1, NewExitingVPV);
9660 }
9661
9662 // If the vector reduction can be performed in a smaller type, we truncate
9663 // then extend the loop exit value to enable InstCombine to evaluate the
9664 // entire expression in the smaller type.
9665 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
// NOTE(review): listing line 9667 (part of this condition) is missing from
// this extract.
9666 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9668 RdxDesc.getRecurrenceKind())) {
9669 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9670 Type *RdxTy = RdxDesc.getRecurrenceType();
9671 auto *Trunc =
9672 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9673 auto *Extnd =
9674 RdxDesc.isSigned()
9675 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9676 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9677
9678 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9679 Extnd->insertAfter(Trunc);
9680 if (PhiR->getOperand(1) == NewExitingVPV)
9681 PhiR->setOperand(1, Extnd->getVPSingleValue());
9682 NewExitingVPV = Extnd;
9683 }
9684
9685 // We want code in the middle block to appear to execute on the location of
9686 // the scalar loop's latch terminator because: (a) it is all compiler
9687 // generated, (b) these instructions are always executed after evaluating
9688 // the latch conditional branch, and (c) other passes may add new
9689 // predecessors which terminate on this line. This is the easiest way to
9690 // ensure we don't accidentally cause an extra step back into the loop while
9691 // debugging.
9692 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9693
9694 // TODO: At the moment ComputeReductionResult also drives creation of the
9695 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9696 // even for in-loop reductions, until the reduction resume value handling is
9697 // also modeled in VPlan.
9698 auto *FinalReductionResult = new VPInstruction(
9699 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9700 // Update all users outside the vector region.
// The predicate selects users that have a parent block which itself has no
// parent.
9701 OrigExitingVPV->replaceUsesWithIf(
9702 FinalReductionResult, [](VPUser &User, unsigned) {
9703 auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9704 return Parent && !Parent->getParent();
9705 });
9706 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9707
9708 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9709 // with a boolean reduction phi node to check if the condition is true in
9710 // any iteration. The final value is selected by the final
9711 // ComputeReductionResult.
// NOTE(review): listing line 9712 (the start of this `if` condition) is
// missing from this extract.
9713 RdxDesc.getRecurrenceKind())) {
// NOTE(review): the find_if result is dereferenced without comparing it to
// the end iterator, so a select user of PhiR is assumed to exist.
9714 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9715 return isa<VPWidenSelectRecipe>(U) ||
9716 (isa<VPReplicateRecipe>(U) &&
9717 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9718 Instruction::Select);
9719 }));
9720 VPValue *Cmp = Select->getOperand(0);
9721 // If the compare is checking the reduction PHI node, adjust it to check
9722 // the start value.
9723 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9724 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9725 if (CmpR->getOperand(I) == PhiR)
9726 CmpR->setOperand(I, PhiR->getStartValue());
9727 }
// The guard restores the builder's insert point when this scope ends.
9728 VPBuilder::InsertPointGuard Guard(Builder);
9729 Builder.setInsertPoint(Select);
9730
9731 // If the true value of the select is the reduction phi, the new value is
9732 // selected if the negated condition is true in any iteration.
9733 if (Select->getOperand(1) == PhiR)
9734 Cmp = Builder.createNot(Cmp);
9735 VPValue *Or = Builder.createOr(PhiR, Cmp);
9736 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9737
9738 // Convert the reduction phi to operate on bools.
9739 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9740 OrigLoop->getHeader()->getContext())));
9741 continue;
9742 }
9743
// NOTE(review): listing line 9744 (the start of this `if` condition) is
// missing from this extract.
9745 RdxDesc.getRecurrenceKind())) {
9746 // Adjust the start value for FindLastIV recurrences to use the sentinel
9747 // value after generating the ResumePhi recipe, which uses the original
9748 // start value.
9749 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9750 }
9751 }
9752
// NOTE(review): listing line 9753 (the last statement of the function) is
// missing from this extract.
9754}
9755
// NOTE(review): listing line 9756 (the function signature) is missing from
// this extract. Judging by the assert message and the use of `State`, this is
// presumably the body of VPDerivedIVRecipe's execute method; confirm against
// the full file.
//
// Computes the derived induction value for lane 0 from the canonical IV
// (operand 1), the start value and the step, and records it in State.
9757 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9758
9759 // Fast-math-flags propagate from the original induction instruction.
// NOTE(review): listing line 9760 is missing from this extract.
9761 if (FPBinOp)
9762 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9763
9764 Value *Step = State.get(getStepValue(), VPLane(0));
9765 Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
9766 Value *DerivedIV = emitTransformedIndex(
9767 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9768 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9769 DerivedIV->setName(Name);
// A derived IV identical to the canonical IV would mean this recipe was not
// needed.
9770 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9771
9772 State.set(this, DerivedIV, VPLane(0));
9773}
9774
// NOTE(review): listing lines 9775-9776 (the function signature and,
// presumably, the definition of `UI`) are missing from this extract. This
// looks like the execute method of the replicating recipe; confirm against
// the full file.
//
// Emits scalar copies of UI through State.ILV->scalarizeInstruction. There are
// four cases: a single requested lane, a uniform recipe (lane 0 only), a store
// handled via the last lane, and otherwise one copy per lane of a fixed VF.
9777 if (State.Lane) { // Generate a single instance.
9778 assert((State.VF.isScalar() || !isUniform()) &&
9779 "uniform recipe shouldn't be predicated");
9780 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9781 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9782 // Insert scalar instance packing it into a vector.
9783 if (State.VF.isVector() && shouldPack()) {
9784 // If we're constructing lane 0, initialize to start from poison.
9785 if (State.Lane->isFirstLane()) {
9786 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
// NOTE(review): listing line 9787 (the start of the statement that defines
// `Poison`) is missing from this extract.
9788 VectorType::get(UI->getType(), State.VF));
9789 State.set(this, Poison);
9790 }
9791 State.packScalarIntoVectorValue(this, *State.Lane);
9792 }
9793 return;
9794 }
9795
9796 if (IsUniform) {
9797 // Uniform within VL means we need to generate lane 0.
9798 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9799 return;
9800 }
9801
9802 // A store of a loop varying value to a uniform address only needs the last
9803 // copy of the store.
// NOTE(review): listing line 9805 (the rest of this condition) is missing
// from this extract.
9804 if (isa<StoreInst>(UI) &&
9806 auto Lane = VPLane::getLastLaneForVF(State.VF);
9807 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9808 return;
9809 }
9810
9811 // Generate scalar instances for all VF lanes.
9812 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9813 const unsigned EndLane = State.VF.getKnownMinValue();
9814 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9815 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9816}
9817
9818// Determine how to lower the scalar epilogue, which depends on 1) optimising
9819// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9820// predication, and 4) a TTI hook that analyses whether the loop is suitable
9821// for predication.
//
// The four criteria are checked in that order; the first that applies decides.
//
// NOTE(review): most of this function is missing from this extract: listing
// lines 9822-9825 (the signature), 9835-9837, 9841-9847, 9853-9856, 9861-9862
// and 9864. Only the comments and the opening line of each step remain, so
// the value returned by each step cannot be confirmed from here.
9826 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9827 // don't look at hints or options, and don't request a scalar epilogue.
9828 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9829 // LoopAccessInfo (due to code dependency and not being able to reliably get
9830 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9831 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9832 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9833 // back to the old way and vectorize with versioning when forced. See D81345.)
9834 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9838
9839 // 2) If set, obey the directives
// getNumOccurrences() is non-zero only when the option was given explicitly.
9840 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9848 };
9849 }
9850
9851 // 3) If set, obey the hints
9852 switch (Hints.getPredicate()) {
9857 };
9858
9859 // 4) if the TTI hook indicates this is profitable, request predication.
9860 TailFoldingInfo TFI(TLI, &LVL, IAI);
9863
9865}
9866
9867// Process the loop in the VPlan-native vectorization path. This path builds
9868// VPlan upfront in the vectorization pipeline, which allows to apply
9869// VPlan-to-VPlan transformations from the very beginning without modifying the
9870// input LLVM IR.
//
// Returns false when the backedge-taken count cannot be computed (a further
// early `return false` below has its condition missing from this extract) and
// true after vector code has been generated and the loop has been marked as
// already vectorized.
//
// NOTE(review): listing lines 9871-9876 (the function name and all but the
// last parameter) are missing from this extract.
9877 LoopVectorizationRequirements &Requirements) {
9878
9879 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9880 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9881 return false;
9882 }
9883 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9884 Function *F = L->getHeader()->getParent();
9885 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9886
// NOTE(review): listing line 9887 is missing from this extract; presumably it
// declares `SEL`, which is passed to the cost model below.
9888 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9889
9890 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9891 &Hints, IAI);
9892 // Use the planner for outer loop vectorization.
9893 // TODO: CM is not used at this point inside the planner. Turn CM into an
9894 // optional argument if we don't need it in the future.
9895 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9896 ORE);
9897
9898 // Get user vectorization factor.
9899 ElementCount UserVF = Hints.getWidth();
9900
// NOTE(review): listing line 9901 is missing from this extract.
9902
9903 // Plan how to best vectorize, return the best VF and its cost.
9904 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9905
9906 // If we are stress testing VPlan builds, do not attempt to generate vector
9907 // code. Masked vector code generation support will follow soon.
9908 // Also, do not attempt to vectorize if no vector code will be produced.
// NOTE(review): listing line 9909 (the condition guarding this return) is
// missing from this extract.
9910 return false;
9911
9912 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9913
// The nested scope ends the lifetime of Checks and LB before the
// vectorization is reported.
9914 {
9915 bool AddBranchWeights =
9916 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9917 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9918 AddBranchWeights);
9919 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9920 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
9921 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9922 << L->getHeader()->getParent()->getName() << "\"\n");
9923 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9924 }
9925
9926 reportVectorization(ORE, L, VF, 1);
9927
9928 // Mark the loop as already vectorized to avoid vectorizing again.
9929 Hints.setAlreadyVectorized();
9930 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9931 return true;
9932}
9933
9934// Emit a remark if there are stores to floats that required a floating point
9935// extension. If the vectorized loop was generated with floating point there
9936// will be a performance penalty from the conversion overhead and the change in
9937// the vector width.
//
// Starting from every store of a float-typed value in the loop, the operands
// are walked backwards through instructions inside the loop; each FPExtInst
// reached gets one "VectorMixedPrecision" analysis remark.
//
// NOTE(review): listing lines 9938-9939 (the signature and, presumably, the
// declaration of `Worklist`) are missing from this extract.
9940 for (BasicBlock *BB : L->getBlocks()) {
9941 for (Instruction &Inst : *BB) {
9942 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9943 if (S->getValueOperand()->getType()->isFloatTy())
9944 Worklist.push_back(S);
9945 }
9946 }
9947 }
9948
9949 // Traverse the floating point stores upwards, searching for floating point
9950 // conversions.
// NOTE(review): listing lines 9951-9952 (presumably the declarations of
// `Visited` and `EmittedRemark`) are missing from this extract.
9953 while (!Worklist.empty()) {
9954 auto *I = Worklist.pop_back_val();
// Instructions outside the loop end the walk along this path.
9955 if (!L->contains(I))
9956 continue;
// Each instruction is expanded at most once.
9957 if (!Visited.insert(I).second)
9958 continue;
9959
9960 // Emit a remark if the floating point store required a floating
9961 // point conversion.
9962 // TODO: More work could be done to identify the root cause such as a
9963 // constant or a function return type and point the user to it.
9964 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9965 ORE->emit([&]() {
9966 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9967 I->getDebugLoc(), L->getHeader())
9968 << "floating point conversion changes vector width. "
9969 << "Mixed floating point precision requires an up/down "
9970 << "cast that will negatively impact performance.";
9971 });
9972
9973 for (Use &Op : I->operands())
9974 if (auto *OpI = dyn_cast<Instruction>(Op))
9975 Worklist.push_back(OpI);
9976 }
9977}
9978
/// Decide whether generating the runtime checks in \p Checks is worthwhile
/// when vectorizing \p L with factor \p VF.
///
/// Returns false when the check cost is invalid, when an interleave-only
/// (scalar VF) plan has a check cost above VectorizeMemoryCheckThreshold, or
/// on the "expected trip count < minimum" path near the end; returns true
/// otherwise.
///
/// NOTE(review): listing lines 9982-9983 (the remaining parameters; the body
/// uses `PSE` and `SEL`) are missing from this extract.
9979static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9980 VectorizationFactor &VF, Loop *L,
9981 const TargetTransformInfo &TTI,
9984 InstructionCost CheckCost = Checks.getCost();
9985 if (!CheckCost.isValid())
9986 return false;
9987
9988 // When interleaving only scalar and vector cost will be equal, which in turn
9989 // would lead to a divide by 0. Fall back to hard threshold.
9990 if (VF.Width.isScalar()) {
9991 if (CheckCost > VectorizeMemoryCheckThreshold) {
9992 LLVM_DEBUG(
9993 dbgs()
9994 << "LV: Interleaving only is not profitable due to runtime checks\n");
9995 return false;
9996 }
9997 return true;
9998 }
9999
10000 // The scalar cost should only be 0 when vectorizing with a user specified
// VF/IC. In those cases, runtime checks should always be generated.
10001 uint64_t ScalarC = *VF.ScalarCost.getValue();
10002 if (ScalarC == 0)
10003 return true;
10004
10005 // First, compute the minimum iteration count required so that the vector
10006 // loop outperforms the scalar loop.
10007 // The total cost of the scalar loop is
10008 // ScalarC * TC
10009 // where
10010 // * TC is the actual trip count of the loop.
10011 // * ScalarC is the cost of a single scalar iteration.
10012 //
10013 // The total cost of the vector loop is
10014 // RtC + VecC * (TC / VF) + EpiC
10015 // where
10016 // * RtC is the cost of the generated runtime checks
10017 // * VecC is the cost of a single vector iteration.
10018 // * TC is the actual trip count of the loop
10019 // * VF is the vectorization factor
10020 // * EpiC is the cost of the generated epilogue, including the cost
10021 // of the remaining scalar operations.
10022 //
10023 // Vectorization is profitable once the total vector cost is less than the
10024 // total scalar cost:
10025 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10026 //
10027 // Now we can compute the minimum required trip count TC as
10028 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10029 //
10030 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10031 // the computations are performed on unsigned integers and the result is
10032 // rounded up (divideCeil), hence we get an upper estimate of the TC.
10033 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10034 uint64_t RtC = *CheckCost.getValue();
// NOTE(review): this is an unsigned subtraction, so it wraps if the vector
// cost exceeds ScalarC * IntVF; presumably earlier profitability checks rule
// that out, but confirm. A zero divisor yields MinTC1 == 0.
10035 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10036 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10037
10038 // Second, compute a minimum iteration count so that the cost of the
10039 // runtime checks is only a fraction of the total scalar loop cost. This
10040 // adds a loop-dependent bound on the overhead incurred if the runtime
10041 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10042 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10043 // cost, compute
10044 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
// X is hard-coded to 10 here.
10045 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10046
10047 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10048 // epilogue is allowed, choose the next closest multiple of VF. This should
10049 // partly compensate for ignoring the epilogue cost.
10050 uint64_t MinTC = std::max(MinTC1, MinTC2);
10051 if (SEL == CM_ScalarEpilogueAllowed)
10052 MinTC = alignTo(MinTC, IntVF);
// NOTE(review): listing line 10053 is missing from this extract; presumably
// it stores MinTC into VF.MinProfitableTripCount, which is printed below.
10054
10055 LLVM_DEBUG(
10056 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10057 << VF.MinProfitableTripCount << "\n");
10058
10059 // Skip vectorization if the expected trip count is less than the minimum
10060 // required trip count.
10061 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
// NOTE(review): listing lines 10062-10063 (the comparison that guards the
// block below) are missing from this extract.
10064 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10065 "trip count < minimum profitable VF ("
10066 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10067 << ")\n");
10068
10069 return false;
10070 }
10071 }
10072 return true;
10073}
10074
// NOTE(review): this is a fragment of a constructor's member-initializer
// list. Listing lines 10075 (the constructor signature), 10077 and 10079 (the
// right-hand sides of both `||` expressions and the constructor body) are
// missing from this extract. What remains shows that each flag is seeded from
// the matching field of `Opts` OR-ed with another operand that is not shown.
10076 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10078 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10080
10081/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10082/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
///
/// Two passes are made: first every VPExpandSCEVRecipe in the plan's entry
/// block is replaced by a live-in taken from \p ExpandedSCEVs and erased;
/// then the start value of each header phi is reset (canonical IV, reduction
/// phis and wide inductions are handled separately).
///
/// NOTE(review): listing line 10084 (the function name and its leading
/// parameters; the body uses `Plan` and `L`) is missing from this extract.
10083static void
10085 const SCEV2ValueTy &ExpandedSCEVs,
10086 const EpilogueLoopVectorizationInfo &EPI) {
10087 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10088 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10089 Header->setName("vec.epilog.vector.body");
10090
10091 // Re-use the trip count and steps expanded for the main loop, as
10092 // skeleton creation needs it as a value that dominates both the scalar
10093 // and vector epilogue loops
10094 // TODO: This is a workaround needed for epilogue vectorization and it
10095 // should be removed once induction resume value creation is done
10096 // directly in VPlan.
// make_early_inc_range lets the loop erase the current recipe safely.
10097 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10098 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10099 if (!ExpandR)
10100 continue;
// NOTE(review): the find() result is dereferenced unchecked, so every SCEV
// expanded here is assumed to be present in ExpandedSCEVs.
10101 auto *ExpandedVal =
10102 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10103 ExpandR->replaceAllUsesWith(ExpandedVal);
10104 if (Plan.getTripCount() == ExpandR)
10105 Plan.resetTripCount(ExpandedVal);
10106 ExpandR->eraseFromParent();
10107 }
10108
10109 // Ensure that the start values for all header phi recipes are updated before
10110 // vectorizing the epilogue loop.
10111 for (VPRecipeBase &R : Header->phis()) {
10112 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10113 // When vectorizing the epilogue loop, the canonical induction start
10114 // value needs to be changed from zero to the value after the main
10115 // vector loop. Find the resume value created during execution of the main
10116 // VPlan.
10117 // FIXME: Improve modeling for canonical IV start values in the epilogue
10118 // loop.
// MainMiddle is the single predecessor of the loop preheader that is none of
// the check blocks recorded in EPI.
10119 BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10120 predecessors(L->getLoopPreheader()),
10121 [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10122 if (BB != EPI.MainLoopIterationCountCheck &&
10123 BB != EPI.EpilogueIterationCountCheck &&
10124 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10125 return BB;
10126 return nullptr;
10127 });
10128 using namespace llvm::PatternMatch;
10129 Type *IdxTy = IV->getScalarType();
// The resume phi is the single preheader phi of the IV's type whose incoming
// value is EPI.VectorTripCount from MainMiddle and 0 from the main loop's
// iteration-count check.
10130 PHINode *EPResumeVal = find_singleton<PHINode>(
10131 L->getLoopPreheader()->phis(),
10132 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10133 if (P.getType() == IdxTy &&
10134 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10135 match(
10136 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10137 m_SpecificInt(0)))
10138 return &P;
10139 return nullptr;
10140 });
10141 assert(EPResumeVal && "must have a resume value for the canonical IV");
10142 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
// NOTE(review): the predicate also accepts VPScalarCastRecipe and
// VPDerivedIVRecipe users, which the assert message does not mention.
10143 assert(all_of(IV->users(),
10144 [](const VPUser *U) {
10145 return isa<VPScalarIVStepsRecipe>(U) ||
10146 isa<VPScalarCastRecipe>(U) ||
10147 isa<VPDerivedIVRecipe>(U) ||
10148 cast<VPInstruction>(U)->getOpcode() ==
10149 Instruction::Add;
10150 }) &&
10151 "the canonical IV should only be used by its increment or "
10152 "ScalarIVSteps when resetting the start value");
10153 IV->setOperand(0, VPV);
10154 continue;
10155 }
10156
10157 Value *ResumeV = nullptr;
10158 // TODO: Move setting of resume values to prepareToExecute.
10159 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10160 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10161 ->getIncomingValueForBlock(L->getLoopPreheader());
10162 const RecurrenceDescriptor &RdxDesc =
10163 ReductionPhi->getRecurrenceDescriptor();
10164 RecurKind RK = RdxDesc.getRecurrenceKind();
// NOTE(review): listing line 10165 is missing from this extract; it opens the
// conditional block that is closed on line 10173.
10166 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10167 // start value; compare the final value from the main vector loop
10168 // to the start value.
10169 IRBuilder<> Builder(
10170 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10171 ResumeV =
10172 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10173 }
10174 } else {
10175 // Retrieve the induction resume values for wide inductions from
10176 // their original phi nodes in the scalar loop.
// The cast<> asserts that any remaining header phi is a
// VPWidenInductionRecipe.
10177 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10178 // Hook up to the PHINode generated by a ResumePhi recipe of main
10179 // loop VPlan, which feeds the scalar loop.
10180 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10181 }
10182 assert(ResumeV && "Must have a resume value");
10183 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10184 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10185 }
10186}
10187
10189 assert((EnableVPlanNativePath || L->isInnermost()) &&
10190 "VPlan-native path is not enabled. Only process inner loops.");
10191
10192 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10193 << L->getHeader()->getParent()->getName() << "' from "
10194 << L->getLocStr() << "\n");
10195
10196 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10197
10198 LLVM_DEBUG(
10199 dbgs() << "LV: Loop hints:"
10200 << " force="
10202 ? "disabled"
10204 ? "enabled"
10205 : "?"))
10206 << " width=" << Hints.getWidth()
10207 << " interleave=" << Hints.getInterleave() << "\n");
10208
10209 // Function containing loop
10210 Function *F = L->getHeader()->getParent();
10211
10212 // Looking at the diagnostic output is the only way to determine if a loop
10213 // was vectorized (other than looking at the IR or machine code), so it
10214 // is important to generate an optimization remark for each loop. Most of
10215 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10216 // generated as OptimizationRemark and OptimizationRemarkMissed are
10217 // less verbose reporting vectorized loops and unvectorized loops that may
10218 // benefit from vectorization, respectively.
10219
10220 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10221 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10222 return false;
10223 }
10224
10225 PredicatedScalarEvolution PSE(*SE, *L);
10226
10227 // Check if it is legal to vectorize the loop.
10228 LoopVectorizationRequirements Requirements;
10229 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10230 &Requirements, &Hints, DB, AC, BFI, PSI);
10232 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10233 Hints.emitRemarkWithHints();
10234 return false;
10235 }
10236
10238 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10239 "early exit is not enabled",
10240 "UncountableEarlyExitLoopsDisabled", ORE, L);
10241 return false;
10242 }
10243
10244 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10245 // here. They may require CFG and instruction level transformations before
10246 // even evaluating whether vectorization is profitable. Since we cannot modify
10247 // the incoming IR, we need to build VPlan upfront in the vectorization
10248 // pipeline.
10249 if (!L->isInnermost())
10250 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10251 ORE, BFI, PSI, Hints, Requirements);
10252
10253 assert(L->isInnermost() && "Inner loop expected.");
10254
10255 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10256 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10257
10258 // If an override option has been passed in for interleaved accesses, use it.
10259 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10260 UseInterleaved = EnableInterleavedMemAccesses;
10261
10262 // Analyze interleaved memory accesses.
10263 if (UseInterleaved)
10265
10266 if (LVL.hasUncountableEarlyExit()) {
10267 BasicBlock *LoopLatch = L->getLoopLatch();
10268 if (IAI.requiresScalarEpilogue() ||
10270 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10271 reportVectorizationFailure("Auto-vectorization of early exit loops "
10272 "requiring a scalar epilogue is unsupported",
10273 "UncountableEarlyExitUnsupported", ORE, L);
10274 return false;
10275 }
10276 }
10277
10278 // Check the function attributes and profiles to find out if this function
10279 // should be optimized for size.
10281 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10282
10283 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10284 // count by optimizing for size, to minimize overheads.
10285 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10286 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10287 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10288 << "This loop is worth vectorizing only if no scalar "
10289 << "iteration overheads are incurred.");
10291 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10292 else {
10293 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10294 LLVM_DEBUG(dbgs() << "\n");
10295 // Predicate tail-folded loops are efficient even when the loop
10296 // iteration count is low. However, setting the epilogue policy to
10297 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10298 // with runtime checks. It's more effective to let
10299 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10300 // for the loop.
10303 } else {
10304 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10305 "small to consider vectorizing.\n");
10307 "The trip count is below the minial threshold value.",
10308 "loop trip count is too low, avoiding vectorization",
10309 "LowTripCount", ORE, L);
10310 Hints.emitRemarkWithHints();
10311 return false;
10312 }
10313 }
10314 }
10315
10316 // Check the function attributes to see if implicit floats or vectors are
10317 // allowed.
10318 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10320 "Can't vectorize when the NoImplicitFloat attribute is used",
10321 "loop not vectorized due to NoImplicitFloat attribute",
10322 "NoImplicitFloat", ORE, L);
10323 Hints.emitRemarkWithHints();
10324 return false;
10325 }
10326
10327 // Check if the target supports potentially unsafe FP vectorization.
10328 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10329 // for the target we're vectorizing for, to make sure none of the
10330 // additional fp-math flags can help.
10331 if (Hints.isPotentiallyUnsafe() &&
10334 "Potentially unsafe FP op prevents vectorization",
10335 "loop not vectorized due to unsafe FP support.",
10336 "UnsafeFP", ORE, L);
10337 Hints.emitRemarkWithHints();
10338 return false;
10339 }
10340
10341 bool AllowOrderedReductions;
10342 // If the flag is set, use that instead and override the TTI behaviour.
10343 if (ForceOrderedReductions.getNumOccurrences() > 0)
10344 AllowOrderedReductions = ForceOrderedReductions;
10345 else
10346 AllowOrderedReductions = TTI->enableOrderedReductions();
10347 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10348 ORE->emit([&]() {
10349 auto *ExactFPMathInst = Requirements.getExactFPInst();
10350 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10351 ExactFPMathInst->getDebugLoc(),
10352 ExactFPMathInst->getParent())
10353 << "loop not vectorized: cannot prove it is safe to reorder "
10354 "floating-point operations";
10355 });
10356 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10357 "reorder floating-point operations\n");
10358 Hints.emitRemarkWithHints();
10359 return false;
10360 }
10361
10362 // Use the cost model.
10363 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10364 F, &Hints, IAI);
10365 // Use the planner for vectorization.
10366 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10367 ORE);
10368
10369 // Get user vectorization factor and interleave count.
10370 ElementCount UserVF = Hints.getWidth();
10371 unsigned UserIC = Hints.getInterleave();
10372
10373 // Plan how to best vectorize.
10374 LVP.plan(UserVF, UserIC);
10376 unsigned IC = 1;
10377
10380
10381 bool AddBranchWeights =
10382 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10383 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10384 AddBranchWeights);
10385 if (LVP.hasPlanWithVF(VF.Width)) {
10386 // Select the interleave count.
10387 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10388
10389 unsigned SelectedIC = std::max(IC, UserIC);
10390 // Optimistically generate runtime checks if they are needed. Drop them if
10391 // they turn out to not be profitable.
10392 if (VF.Width.isVector() || SelectedIC > 1)
10393 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10394
10395 // Check if it is profitable to vectorize with runtime checks.
10396 bool ForceVectorization =
10398 if (!ForceVectorization &&
10399 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10400 ORE->emit([&]() {
10402 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10403 L->getHeader())
10404 << "loop not vectorized: cannot prove it is safe to reorder "
10405 "memory operations";
10406 });
10407 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10408 Hints.emitRemarkWithHints();
10409 return false;
10410 }
10411 }
10412
10413 // Identify the diagnostic messages that should be produced.
10414 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10415 bool VectorizeLoop = true, InterleaveLoop = true;
10416 if (VF.Width.isScalar()) {
10417 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10418 VecDiagMsg = std::make_pair(
10419 "VectorizationNotBeneficial",
10420 "the cost-model indicates that vectorization is not beneficial");
10421 VectorizeLoop = false;
10422 }
10423
10424 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10425 // Tell the user interleaving was avoided up-front, despite being explicitly
10426 // requested.
10427 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10428 "interleaving should be avoided up front\n");
10429 IntDiagMsg = std::make_pair(
10430 "InterleavingAvoided",
10431 "Ignoring UserIC, because interleaving was avoided up front");
10432 InterleaveLoop = false;
10433 } else if (IC == 1 && UserIC <= 1) {
10434 // Tell the user interleaving is not beneficial.
10435 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10436 IntDiagMsg = std::make_pair(
10437 "InterleavingNotBeneficial",
10438 "the cost-model indicates that interleaving is not beneficial");
10439 InterleaveLoop = false;
10440 if (UserIC == 1) {
10441 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10442 IntDiagMsg.second +=
10443 " and is explicitly disabled or interleave count is set to 1";
10444 }
10445 } else if (IC > 1 && UserIC == 1) {
10446 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10447 LLVM_DEBUG(
10448 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10449 IntDiagMsg = std::make_pair(
10450 "InterleavingBeneficialButDisabled",
10451 "the cost-model indicates that interleaving is beneficial "
10452 "but is explicitly disabled or interleave count is set to 1");
10453 InterleaveLoop = false;
10454 }
10455
10456 // If there is a histogram in the loop, do not just interleave without
10457 // vectorizing. The order of operations will be incorrect without the
10458 // histogram intrinsics, which are only used for recipes with VF > 1.
10459 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10460 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10461 << "to histogram operations.\n");
10462 IntDiagMsg = std::make_pair(
10463 "HistogramPreventsScalarInterleaving",
10464 "Unable to interleave without vectorization due to constraints on "
10465 "the order of histogram operations");
10466 InterleaveLoop = false;
10467 }
10468
10469 // Override IC if user provided an interleave count.
10470 IC = UserIC > 0 ? UserIC : IC;
10471
10472 // Emit diagnostic messages, if any.
10473 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10474 if (!VectorizeLoop && !InterleaveLoop) {
10475 // Do not vectorize or interleave the loop.
10476 ORE->emit([&]() {
10477 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10478 L->getStartLoc(), L->getHeader())
10479 << VecDiagMsg.second;
10480 });
10481 ORE->emit([&]() {
10482 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10483 L->getStartLoc(), L->getHeader())
10484 << IntDiagMsg.second;
10485 });
10486 return false;
10487 }
10488
10489 if (!VectorizeLoop && InterleaveLoop) {
10490 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10491 ORE->emit([&]() {
10492 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10493 L->getStartLoc(), L->getHeader())
10494 << VecDiagMsg.second;
10495 });
10496 } else if (VectorizeLoop && !InterleaveLoop) {
10497 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10498 << ") in " << L->getLocStr() << '\n');
10499 ORE->emit([&]() {
10500 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10501 L->getStartLoc(), L->getHeader())
10502 << IntDiagMsg.second;
10503 });
10504 } else if (VectorizeLoop && InterleaveLoop) {
10505 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10506 << ") in " << L->getLocStr() << '\n');
10507 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10508 }
10509
10510 bool DisableRuntimeUnroll = false;
10511 MDNode *OrigLoopID = L->getLoopID();
10512 {
10513 using namespace ore;
10514 if (!VectorizeLoop) {
10515 assert(IC > 1 && "interleave count should not be 1 or 0");
10516 // If we decided that it is not beneficial to vectorize the loop, then
10517 // only interleave it.
10518 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10519 InnerLoopVectorizer Unroller(
10520 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10521 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10522
10523 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10524
10525 ORE->emit([&]() {
10526 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10527 L->getHeader())
10528 << "interleaved loop (interleaved count: "
10529 << NV("InterleaveCount", IC) << ")";
10530 });
10531 } else {
10532 // If we decided that it is *legal* to vectorize the loop, then do it.
10533
10534 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10535 // Consider vectorizing the epilogue too if it's profitable.
10536 VectorizationFactor EpilogueVF =
10538 if (EpilogueVF.Width.isVector()) {
10539 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10540
10541 // The first pass vectorizes the main loop and creates a scalar epilogue
10542 // to be vectorized by executing the plan (potentially with a different
10543 // factor) again shortly afterwards.
10544 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10545 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10546 BestEpiPlan);
10547 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10548 EPI, &LVL, &CM, BFI, PSI, Checks,
10549 *BestMainPlan);
10550
10551 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10552 *BestMainPlan, MainILV, DT, false);
10553 ++LoopsVectorized;
10554
10555 // Second pass vectorizes the epilogue and adjusts the control flow
10556 // edges from the first pass.
10557 EPI.MainLoopVF = EPI.EpilogueVF;
10558 EPI.MainLoopUF = EPI.EpilogueUF;
10559 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10560 ORE, EPI, &LVL, &CM, BFI, PSI,
10561 Checks, BestEpiPlan);
10562 EpilogILV.setTripCount(MainILV.getTripCount());
10563 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10564
10565 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10566 DT, true, &ExpandedSCEVs);
10567 ++LoopsEpilogueVectorized;
10568
10569 if (!MainILV.areSafetyChecksAdded())
10570 DisableRuntimeUnroll = true;
10571 } else {
10572 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10573 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10574 PSI, Checks, BestPlan);
10575 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10576 ++LoopsVectorized;
10577
10578 // Add metadata to disable runtime unrolling a scalar loop when there
10579 // are no runtime checks about strides and memory. A scalar loop that is
10580 // rarely used is not worth unrolling.
10581 if (!LB.areSafetyChecksAdded())
10582 DisableRuntimeUnroll = true;
10583 }
10584 // Report the vectorization decision.
10585 reportVectorization(ORE, L, VF, IC);
10586 }
10587
10590 }
10591
10592 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10593 "DT not preserved correctly");
10594
10595 std::optional<MDNode *> RemainderLoopID =
10598 if (RemainderLoopID) {
10599 L->setLoopID(*RemainderLoopID);
10600 } else {
10601 if (DisableRuntimeUnroll)
10603
10604 // Mark the loop as already vectorized to avoid vectorizing again.
10605 Hints.setAlreadyVectorized();
10606 }
10607
10608 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10609 return true;
10610}
10611
10613
10614 // Don't attempt if
10615 // 1. the target claims to have no vector registers, and
10616 // 2. interleaving won't help ILP.
10617 //
10618 // The second condition is necessary because, even if the target has no
10619 // vector registers, loop vectorization may still enable scalar
10620 // interleaving.
10623 return LoopVectorizeResult(false, false);
10624
10625 bool Changed = false, CFGChanged = false;
10626
10627 // The vectorizer requires loops to be in simplified form.
10628 // Since simplification may add new inner loops, it has to run before the
10629 // legality and profitability checks. This means running the loop vectorizer
10630 // will simplify all loops, regardless of whether anything ends up being
10631 // vectorized.
10632 for (const auto &L : *LI)
10633 Changed |= CFGChanged |=
10634 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10635
10636 // Build up a worklist of inner-loops to vectorize. This is necessary as
10637 // the act of vectorizing or partially unrolling a loop creates new loops
10638 // and can invalidate iterators across the loops.
10639 SmallVector<Loop *, 8> Worklist;
10640
10641 for (Loop *L : *LI)
10642 collectSupportedLoops(*L, LI, ORE, Worklist);
10643
10644 LoopsAnalyzed += Worklist.size();
10645
10646 // Now walk the identified inner loops.
10647 while (!Worklist.empty()) {
10648 Loop *L = Worklist.pop_back_val();
10649
10650 // For the inner loops we actually process, form LCSSA to simplify the
10651 // transform.
10652 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10653
10654 Changed |= CFGChanged |= processLoop(L);
10655
10656 if (Changed) {
10657 LAIs->clear();
10658
10659#ifndef NDEBUG
10660 if (VerifySCEV)
10661 SE->verify();
10662#endif
10663 }
10664 }
10665
10666 // Process each loop nest in the function.
10667 return LoopVectorizeResult(Changed, CFGChanged);
10668}
10669
10672 LI = &AM.getResult<LoopAnalysis>(F);
10673 // There are no loops in the function. Return before computing other
10674 // expensive analyses.
10675 if (LI->empty())
10676 return PreservedAnalyses::all();
10685
10686 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10687 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10688 BFI = nullptr;
10689 if (PSI && PSI->hasProfileSummary())
10691 LoopVectorizeResult Result = runImpl(F);
10692 if (!Result.MadeAnyChange)
10693 return PreservedAnalyses::all();
10695
10696 if (isAssignmentTrackingEnabled(*F.getParent())) {
10697 for (auto &BB : F)
10699 }
10700
10701 PA.preserve<LoopAnalysis>();
10705
10706 if (Result.MadeCFGChange) {
10707 // Making CFG changes likely means a loop got vectorized. Indicate that
10708 // extra simplification passes should be run.
10709 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10710 // be run if runtime checks have been added.
10713 } else {
10715 }
10716 return PA;
10717}
10718
10720 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10721 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10722 OS, MapClassName2PassName);
10723
10724 OS << '<';
10725 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10726 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10727 OS << '>';
10728}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
std::string Name
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addRuntimeUnrollDisableMetaData(Loop *L)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector< PHINode *, InductionDescriptor > &Inductions)
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL)
const char LLVMLoopVectorizeFollowupAll[]
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order recurrences in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static unsigned getEstimatedRuntimeVF(const Loop *L, const TargetTransformInfo &TTI, ElementCount VF)
This function attempts to return a value that represents the vectorization factor at runtime.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static bool addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan)
Create resume phis in the scalar preheader for first-order recurrences and reductions and update the ...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define H(x, y, z)
Definition: MD5.cpp:57
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: Instruction.h:45
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:460
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1873
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1285
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, BasicBlock *MiddleBlock, VPTransformState &State) override
Set up the values of the IVs correctly when exiting the vector loop.
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:338
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:105
param_iterator param_begin() const
Definition: DerivedTypes.h:130
param_iterator param_end() const
Definition: DerivedTypes.h:131
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:707
FunctionType * getFunctionType() const
Returns the FunctionType of this function.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:704
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1048
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1367
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1350
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2383
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1427
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, BasicBlock *MiddleBlock, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitability analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount=nullptr, SmallPtrSetImpl< PHINode * > *IVSubset=nullptr)
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
Returns the additional bypass value associated with the induction header phi OrigPhi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, VPBuilder &ScalarPHBuilder, Value *MainVectorTripCount=nullptr)
Create a ResumePHI VPInstruction for the induction InductionPhiIRI to resume iteration count in the s...
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:68
bool isBinaryOp() const
Definition: Instruction.h:279
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:276
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:480
uint32_t getFactor() const
Definition: VectorUtils.h:496
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:550
InstTy * getInsertPos() const
Definition: VectorUtils.h:566
uint32_t getNumMembers() const
Definition: VectorUtils.h:498
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:622
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:667
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:678
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:659
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:642
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:672
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Type * getPointerOperandType() const
Definition: Instructions.h:258
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is allowed, i.e. it is not prohibited due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has an uncountable early exit, i.e.
bool hasHistograms() const
Returns true if there is at least one known histogram operation in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block.
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1634
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1622
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1603
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1648
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1543
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool contains(const KeyT &Key) const
Definition: MapVector.h:163
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:692
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:77
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:252
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:234
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:280
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
op_iterator op_end()
Definition: User.h:282
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3470
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3542
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3494
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:477
iterator end()
Definition: VPlan.h:3504
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3502
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3555
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:213
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3533
bool empty() const
Definition: VPlan.h:3513
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2428
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:396
VPRegionBlock * getParent()
Definition: VPlan.h:488
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:178
void setName(const Twine &newName)
Definition: VPlan.h:481
size_t getNumSuccessors() const
Definition: VPlan.h:534
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:627
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:519
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:530
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:158
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:524
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:513
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:4117
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlan.h:4233
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlan.h:4171
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlan.h:4198
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3167
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3198
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:387
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:3400
VPValue * getStartValue() const
Definition: VPlan.h:3399
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:2026
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:2074
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:2063
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1775
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3616
static VPIRBasicBlock * fromBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:843
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
Definition: VPlan.h:1382
Instruction & getInstruction() const
Definition: VPlan.h:1406
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1197
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1215
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2495
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlan.h:153
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:194
static VPLane getFirstLane()
Definition: VPlan.h:178
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:720
VPBasicBlock * getParent()
Definition: VPlan.h:745
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
Helper class to create VPRecipes from IR instructions.
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for Instr if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1111
A recipe for handling reduction phis.
Definition: VPlan.h:2369
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2423
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2415
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2590
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3657
const VPBlockBase * getEntry() const
Definition: VPlan.h:3696
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3728
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2711
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2755
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
Definition: VPlan.h:1903
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:847
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:916
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:40
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:200
operand_range operands()
Definition: VPlanValue.h:257
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:242
unsigned getNumOperands() const
Definition: VPlanValue.h:236
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:237
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:231
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1417
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1421
user_range users()
Definition: VPlanValue.h:132
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1956
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1719
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3308
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1529
A recipe for handling GEP instructions.
Definition: VPlan.h:1854
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2132
A recipe for widening vector intrinsics.
Definition: VPlan.h:1627
A common base class for widening memory operations.
Definition: VPlan.h:2884
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:2292
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:2331
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:2328
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1431
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3761
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:930
VPBasicBlock * getEntry()
Definition: VPlan.h:3870
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3928
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3934
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:3931
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3907
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3921
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3951
unsigned getUF() const
Definition: VPlan.h:3959
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
Definition: VPlan.cpp:851
bool hasVF(ElementCount VF)
Definition: VPlan.h:3944
bool hasUF(unsigned UF) const
Definition: VPlan.h:3957
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
Definition: VPlanCFG.h:309
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1079
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1073
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:3885
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3914
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:3840
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3977
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:3893
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:977
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4011
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:3898
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:4020
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region.
Definition: VPlan.h:3874
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1219
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldRep...
Definition: Value.cpp:542
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlanUtils.h:39
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:26
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:65
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1954
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:850
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instruction I.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:144
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and returns a new filter_iterator range.
Definition: STLExtras.h:573
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2298
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1761
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding optimization remark.
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1945
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2012
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime overflow check can be avoided and such that the scalar epilogue loop can always be removed.
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead implements this with a splat/stepvector/cmp.
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:92
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given value type T.
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane in a vector might be updating the same element as another lane.
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:97
ElementCount End
Definition: VPlan.h:102
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:688
LoopVectorizationCostModel & CM
Definition: VPlan.h:693
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g. because it has already been pre-computed.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of all recipes migrates to VPlan.
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:694
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2337
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:343
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:351
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the output IR.
Definition: VPlan.h:236
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:388
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:391
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:398
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the generated scalar.
Definition: VPlan.cpp:253
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:384
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:357
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlan.h:249
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:368
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:374
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:371
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
Definition: VPlan.h:244
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:376
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlan.h:278
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2964
A recipe for widening select instructions.
Definition: VPlan.h:1816
A recipe for widening store operations, using the stored value, the address to store to and an optional mask.
Definition: VPlan.h:3042
static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed)
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g. due to runtime checks.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.