1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
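//
// For example (an illustrative sketch, not code from this file): with a
// vectorization factor (VF) of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each vector iteration processes four
// consecutive elements with one wide (SIMD) operation and the induction
// variable advances by 4; any remaining iterations run in a scalar epilogue
// loop.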
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/TypeSwitch.h"
83#include "llvm/Analysis/CFG.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/ValueHandle.h"
130#include "llvm/IR/Verifier.h"
131#include "llvm/Support/Casting.h"
134#include "llvm/Support/Debug.h"
148#include <algorithm>
149#include <cassert>
150#include <cmath>
151#include <cstdint>
152#include <functional>
153#include <iterator>
154#include <limits>
155#include <map>
156#include <memory>
157#include <string>
158#include <tuple>
159#include <utility>
160
161using namespace llvm;
162
163#define LV_NAME "loop-vectorize"
164#define DEBUG_TYPE LV_NAME
165
166#ifndef NDEBUG
167const char VerboseDebug[] = DEBUG_TYPE "-verbose";
168#endif
169
170/// @{
171/// Metadata attribute names
172const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
174 "llvm.loop.vectorize.followup_vectorized";
176 "llvm.loop.vectorize.followup_epilogue";
177/// @}
178
179STATISTIC(LoopsVectorized, "Number of loops vectorized");
180STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
181STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
182
184 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
185 cl::desc("Enable vectorization of epilogue loops."));
186
188 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
189 cl::desc("When epilogue vectorization is enabled, and a value greater than "
190 "1 is specified, forces the given VF for all applicable epilogue "
191 "loops."));
192
194 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
195 cl::desc("Only loops with vectorization factor equal to or larger than "
196 "the specified value are considered for epilogue vectorization."));
197
198/// Loops with a known constant trip count below this number are vectorized only
199/// if no scalar iteration overheads are incurred.
201 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
202 cl::desc("Loops with a constant trip count that is smaller than this "
203 "value are vectorized only if no scalar iteration overheads "
204 "are incurred."));
205
207 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
208 cl::desc("The maximum allowed number of runtime memory checks"));
209
210// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
211// that predication is preferred, and this lists all options. I.e., the
212// vectorizer will try to fold the tail-loop (epilogue) into the vector body
213// and predicate the instructions accordingly. If tail-folding fails, there are
214// different fallback strategies depending on these values:
216 enum Option {
220 };
221} // namespace PreferPredicateTy
222
224 "prefer-predicate-over-epilogue",
227 cl::desc("Tail-folding and predication preferences over creating a scalar "
228 "epilogue loop."),
230 "scalar-epilogue",
231 "Don't tail-predicate loops, create scalar epilogue"),
233 "predicate-else-scalar-epilogue",
234 "prefer tail-folding, create scalar epilogue if tail "
235 "folding fails."),
237 "predicate-dont-vectorize",
238 "prefers tail-folding, don't attempt vectorization if "
239 "tail-folding fails.")));
240
242 "force-tail-folding-style", cl::desc("Force the tail folding style"),
243 cl::init(TailFoldingStyle::None),
245 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
247 TailFoldingStyle::Data, "data",
248 "Create lane mask for data only, using active.lane.mask intrinsic"),
249 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
250 "data-without-lane-mask",
251 "Create lane mask with compare/stepvector"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
253 "Create lane mask using active.lane.mask intrinsic, and use "
254 "it for both data and control flow"),
255 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
256 "data-and-control-without-rt-check",
257 "Similar to data-and-control, but remove the runtime check"),
258 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
259 "Use predicated EVL instructions for tail folding. If EVL "
260 "is unsupported, fallback to data-without-lane-mask.")));
261
263 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
264 cl::desc("Maximize bandwidth when selecting vectorization factor which "
265 "will be determined by the smallest type in loop."));
266
268 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
269 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
270
271/// An interleave-group may need masking if it resides in a block that needs
272/// predication, or in order to mask away gaps.
274 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
275 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
276
278 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of scalar registers."));
280
282 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's number of vector registers."));
284
286 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "scalar loops."));
289
291 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's max interleave factor for "
293 "vectorized loops."));
294
296 "force-target-instruction-cost", cl::init(0), cl::Hidden,
297 cl::desc("A flag that overrides the target's expected cost for "
298 "an instruction to a single constant value. Mostly "
299 "useful for getting consistent testing."));
300
302 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
303 cl::desc(
304 "Pretend that scalable vectors are supported, even if the target does "
305 "not support them. This flag should only be used for testing."));
306
308 "small-loop-cost", cl::init(20), cl::Hidden,
309 cl::desc(
310 "The cost of a loop that is considered 'small' by the interleaver."));
311
313 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
314 cl::desc("Enable the use of the block frequency analysis to access PGO "
315 "heuristics minimizing code growth in cold regions and being more "
316 "aggressive in hot regions."));
317
318// Runtime interleave loops for load/store throughput.
320 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
321 cl::desc(
322 "Enable runtime interleaving until load/store ports are saturated"));
323
324/// The number of stores in a loop that are allowed to need predication.
326 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
327 cl::desc("Max number of stores to be predicated behind an if."));
328
330 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
331 cl::desc("Count the induction variable only once when interleaving"));
332
334 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
335 cl::desc("Enable if predication of stores during vectorization."));
336
338 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
339 cl::desc("The maximum interleave count to use when interleaving a scalar "
340 "reduction in a nested loop."));
341
342static cl::opt<bool>
343 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
345 cl::desc("Prefer in-loop vector reductions, "
346 "overriding the targets preference."));
347
349 "force-ordered-reductions", cl::init(false), cl::Hidden,
350 cl::desc("Enable the vectorisation of loops with in-order (strict) "
351 "FP reductions"));
352
354 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
355 cl::desc(
356 "Prefer predicating a reduction operation over an after loop select."));
357
358namespace llvm {
360 "enable-vplan-native-path", cl::Hidden,
361 cl::desc("Enable VPlan-native vectorization path with "
362 "support for outer loop vectorization."));
363}
364
365// This flag enables the stress testing of the VPlan H-CFG construction in the
366// VPlan-native vectorization path. It must be used in conjunction with
367// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
368// verification of the H-CFGs built.
370 "vplan-build-stress-test", cl::init(false), cl::Hidden,
371 cl::desc(
372 "Build VPlan for every supported loop nest in the function and bail "
373 "out right after the build (stress test the VPlan H-CFG construction "
374 "in the VPlan-native vectorization path)."));
375
377 "interleave-loops", cl::init(true), cl::Hidden,
378 cl::desc("Enable loop interleaving in Loop vectorization passes"));
380 "vectorize-loops", cl::init(true), cl::Hidden,
381 cl::desc("Run the Loop vectorization passes"));
382
384 "force-widen-divrem-via-safe-divisor", cl::Hidden,
385 cl::desc(
386 "Override cost based safe divisor widening for div/rem instructions"));
387
389 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
391 cl::desc("Try wider VFs if they enable the use of vector variants"));
392
393// Likelihood of bypassing the vectorized loop because assumptions about SCEV
394// variables not overflowing do not hold. See `emitSCEVChecks`.
395static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396// Likelihood of bypassing the vectorized loop because pointers overlap. See
397// `emitMemRuntimeChecks`.
398static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399// Likelihood of bypassing the vectorized loop because there are zero trips left
400// after prolog. See `emitIterationCountCheck`.
401static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
402
403/// A helper function that returns true if the given type is irregular. The
404/// type is irregular if its allocated size doesn't equal the store size of an
405/// element of the corresponding vector type.
406static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407 // Determine if an array of N elements of type Ty is "bitcast compatible"
408 // with a <N x Ty> vector.
409 // This is only true if there is no padding between the array elements.
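  // For example (illustrative): with a typical data layout, i32 has a type
  // size and an alloc size of 32 bits and is therefore "regular", whereas i1
  // (1-bit type size, 8-bit alloc size) or x86_fp80 (80-bit type size, larger
  // alloc size) is irregular.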
410 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411}
412
413/// Returns "best known" trip count for the specified loop \p L as defined by
414/// the following procedure:
415/// 1) Returns exact trip count if it is known.
416/// 2) Returns expected trip count according to profile data if any.
417/// 3) Returns upper bound estimate if it is known.
418/// 4) Returns std::nullopt if all of the above failed.
419static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
420 Loop *L) {
421 // Check if exact trip count is known.
422 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
423 return ExpectedTC;
424
425 // Check if there is an expected trip count available from profile data.
427 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
428 return *EstimatedTC;
429
430 // Check if upper bound estimate is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
432 return ExpectedTC;
433
434 return std::nullopt;
435}
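// Illustrative use (an assumption about callers, not shown in this excerpt):
// the result is typically compared against a small-trip-count threshold such
// as the -vectorizer-min-trip-count option above, e.g.
//
//   if (auto BestKnownTC = getSmallBestKnownTC(*SE, L))
//     if (*BestKnownTC < 16 /* hypothetical threshold */)
//       ; // prefer cheaper vectorization, or skip vectorization entirely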
436
437namespace {
438// Forward declare GeneratedRTChecks.
439class GeneratedRTChecks;
440
441using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
442} // namespace
443
444namespace llvm {
445
447
448/// InnerLoopVectorizer vectorizes loops which contain only one basic
449/// block to a specified vectorization factor (VF).
450/// This class performs the widening of scalars into vectors, or multiple
451/// scalars. This class also implements the following features:
452/// * It inserts an epilogue loop for handling loops that don't have iteration
453/// counts that are known to be a multiple of the vectorization factor.
454/// * It handles the code generation for reduction variables.
455/// * Scalarization (implementation using scalars) of un-vectorizable
456/// instructions.
457/// InnerLoopVectorizer does not perform any vectorization-legality
458/// checks, and relies on the caller to check for the different legality
459/// aspects. The InnerLoopVectorizer relies on the
460/// LoopVectorizationLegality class to provide information about the induction
461/// and reduction variables that were found to a given vectorization factor.
463public:
466 const TargetLibraryInfo *TLI,
470 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
472 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
473 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
474 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
475 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
477 // Query this against the original loop and save it here because the profile
478 // of the original loop header may change as the transformation happens.
481
483 this->MinProfitableTripCount = VecWidth;
484 else
485 this->MinProfitableTripCount = MinProfitableTripCount;
486 }
487
488 virtual ~InnerLoopVectorizer() = default;
489
490 /// Create a new empty loop that will contain vectorized instructions later
491 /// on, while the old loop will be used as the scalar remainder. Control flow
492 /// is generated around the vectorized (and scalar epilogue) loops consisting
493 /// of various checks and bypasses. Return the pre-header block of the new
494 /// loop and the start value for the canonical induction, if it is != 0. The
495 /// latter is the case when vectorizing the epilogue loop. In the case of
496/// epilogue vectorization, this function is overridden to handle the more
497 /// complex control flow around the loops. \p ExpandedSCEVs is used to
498 /// look up SCEV expansions for expressions needed during skeleton creation.
499 virtual std::pair<BasicBlock *, Value *>
500 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
501
502 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
503 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
504
505 // Return true if any runtime check is added.
507
508 /// A helper function to scalarize a single Instruction in the innermost loop.
509 /// Generates a sequence of scalar instances for each lane between \p MinLane
510 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
511 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
512 /// Instr's operands.
513 void scalarizeInstruction(const Instruction *Instr,
514 VPReplicateRecipe *RepRecipe,
515 const VPIteration &Instance,
516 VPTransformState &State);
517
518 /// Fix the non-induction PHIs in \p Plan.
519 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520
521 /// Create a new phi node for the induction variable \p OrigPhi to resume
522 /// iteration count in the scalar epilogue, from where the vectorized loop
523 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
524 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
525 /// and the resume values can come from an additional bypass block, the \p
526 /// AdditionalBypass pair provides information about the bypass block and the
527 /// end value on the edge from bypass to this loop.
529 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
530 ArrayRef<BasicBlock *> BypassBlocks,
531 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
532
533 /// Returns the original loop trip count.
534 Value *getTripCount() const { return TripCount; }
535
536 /// Used to set the trip count after ILV's construction and after the
537 /// preheader block has been executed. Note that this always holds the trip
538 /// count of the original loop for both main loop and epilogue vectorization.
539 void setTripCount(Value *TC) { TripCount = TC; }
540
541protected:
543
544 /// A small list of PHINodes.
546
547 /// A type for scalarized values in the new loop. Each value from the
548 /// original loop, when scalarized, is represented by UF x VF scalar values
549 /// in the new unrolled loop, where UF is the unroll factor and VF is the
550 /// vectorization factor.
552
553 /// Set up the values of the IVs correctly when exiting the vector loop.
554 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
555 Value *VectorTripCount, Value *EndValue,
556 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
557 VPlan &Plan, VPTransformState &State);
558
559 /// Iteratively sink the scalarized operands of a predicated instruction into
560 /// the block that was created for it.
561 void sinkScalarOperands(Instruction *PredInst);
562
563 /// Returns (and creates if needed) the trip count of the widened loop.
565
566 /// Emit a bypass check to see if the vector trip count is zero, including if
567 /// it overflows.
569
570 /// Emit a bypass check to see if all of the SCEV assumptions we've
571 /// had to make are correct. Returns the block containing the checks or
572 /// nullptr if no checks have been added.
574
575 /// Emit bypass checks to check any memory assumptions we may have made.
576 /// Returns the block containing the checks or nullptr if no checks have been
577 /// added.
579
580 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
581 /// vector loop preheader, middle block and scalar preheader.
583
584 /// Create new phi nodes for the induction variables to resume iteration count
585 /// in the scalar epilogue, from where the vectorized loop left off.
586 /// In cases where the loop skeleton is more complicated (eg. epilogue
587 /// vectorization) and the resume values can come from an additional bypass
588 /// block, the \p AdditionalBypass pair provides information about the bypass
589 /// block and the end value on the edge from bypass to this loop.
591 const SCEV2ValueTy &ExpandedSCEVs,
592 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
593
594 /// Complete the loop skeleton by adding debug MDs, creating appropriate
595 /// conditional branches in the middle block, preparing the builder and
596 /// running the verifier. Return the preheader of the completed vector loop.
598
599 /// Allow subclasses to override and print debug traces before/after vplan
600 /// execution, when trace information is requested.
601 virtual void printDebugTracesAtStart(){};
602 virtual void printDebugTracesAtEnd(){};
603
604 /// The original loop.
606
607 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
608 /// dynamic knowledge to simplify SCEV expressions and converts them to a
609 /// more usable form.
611
612 /// Loop Info.
614
615 /// Dominator Tree.
617
618 /// Target Library Info.
620
621 /// Target Transform Info.
623
624 /// Assumption Cache.
626
627 /// Interface to emit optimization remarks.
629
630 /// The vectorization SIMD factor to use. Each vector will have this many
631 /// vector elements.
633
635
636 /// The vectorization unroll factor to use. Each scalar is vectorized to this
637 /// many different vector instructions.
638 unsigned UF;
639
640 /// The builder that we use
642
643 // --- Vectorization state ---
644
645 /// The vector-loop preheader.
647
648 /// The scalar-loop preheader.
650
651 /// Middle Block between the vector and the scalar.
653
654 /// The unique ExitBlock of the scalar loop if one exists. Note that
655 /// there can be multiple exiting edges reaching this block.
657
658 /// The scalar loop body.
660
661 /// A list of all bypass blocks. The first block is the entry of the loop.
663
664 /// Store instructions that were predicated.
666
667 /// Trip count of the original loop.
668 Value *TripCount = nullptr;
669
670 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
672
673 /// The legality analysis.
675
676 /// The profitability analysis.
678
679 // Record whether runtime checks are added.
680 bool AddedSafetyChecks = false;
681
682 // Holds the end values for each induction variable. We save the end values
683 // so we can later fix-up the external users of the induction variables.
685
686 /// BFI and PSI are used to check for profile guided size optimizations.
689
690 // Whether this loop should be optimized for size based on profile-guided size
691 // optimizations.
693
694 /// Structure to hold information about generated runtime checks, responsible
695 /// for cleaning the checks, if vectorization turns out unprofitable.
696 GeneratedRTChecks &RTChecks;
697
698 // Holds the resume values for reductions in the loops, used to set the
699 // correct start value of reduction PHIs when vectorizing the epilogue.
702};
703
705public:
708 const TargetLibraryInfo *TLI,
710 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
713 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
715 ElementCount::getFixed(1),
716 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
717 BFI, PSI, Check) {}
718};
719
720/// Encapsulate information regarding vectorization of a loop and its epilogue.
721/// This information is meant to be updated and used across two stages of
722/// epilogue vectorization.
725 unsigned MainLoopUF = 0;
727 unsigned EpilogueUF = 0;
732 Value *TripCount = nullptr;
734
736 ElementCount EVF, unsigned EUF)
737 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
738 assert(EUF == 1 &&
739 "A high UF for the epilogue loop is likely not beneficial.");
740 }
741};
742
743/// An extension of the inner loop vectorizer that creates a skeleton for a
744/// vectorized loop that has its epilogue (residual) also vectorized.
745/// The idea is to run the vplan on a given loop twice, first to set up the
746/// skeleton and vectorize the main loop, and second to complete the skeleton
747/// from the first step and vectorize the epilogue. This is achieved by
748/// deriving two concrete strategy classes from this base class and invoking
749/// them in succession from the loop vectorizer planner.
751public:
759 GeneratedRTChecks &Checks)
761 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
762 CM, BFI, PSI, Checks),
763 EPI(EPI) {}
764
765 // Override this function to handle the more complex control flow around the
766 // three loops.
767 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
768 const SCEV2ValueTy &ExpandedSCEVs) final {
769 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
770 }
771
772 /// The interface for creating a vectorized skeleton using one of two
773 /// different strategies, each corresponding to one execution of the vplan
774 /// as described above.
775 virtual std::pair<BasicBlock *, Value *>
776 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
777
778 /// Holds and updates state information required to vectorize the main loop
779 /// and its epilogue in two separate passes. This setup helps us avoid
780 /// regenerating and recomputing runtime safety checks. It also helps us to
781 /// shorten the iteration-count-check path length for the cases where the
782 /// iteration count of the loop is so small that the main vector loop is
783 /// completely skipped.
785};
786
787/// A specialized derived class of inner loop vectorizer that performs
788/// vectorization of *main* loops in the process of vectorizing loops and their
789/// epilogues.
791public:
799 GeneratedRTChecks &Check)
801 EPI, LVL, CM, BFI, PSI, Check) {}
802 /// Implements the interface for creating a vectorized skeleton using the
803 /// *main loop* strategy (i.e. the first pass of vplan execution).
804 std::pair<BasicBlock *, Value *>
805 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
806
807protected:
808 /// Emits an iteration count bypass check once for the main loop (when \p
809 /// ForEpilogue is false) and once for the epilogue loop (when \p
810 /// ForEpilogue is true).
811 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
812 void printDebugTracesAtStart() override;
813 void printDebugTracesAtEnd() override;
814};
815
816// A specialized derived class of inner loop vectorizer that performs
817// vectorization of *epilogue* loops in the process of vectorizing loops and
818// their epilogues.
820public:
828 GeneratedRTChecks &Checks)
830 EPI, LVL, CM, BFI, PSI, Checks) {
832 }
833 /// Implements the interface for creating a vectorized skeleton using the
834 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
835 std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
837
838protected:
839 /// Emits an iteration count bypass check after the main vector loop has
840 /// finished to see if there are any iterations left to execute by either
841 /// the vector epilogue or the scalar epilogue.
843 BasicBlock *Bypass,
844 BasicBlock *Insert);
845 void printDebugTracesAtStart() override;
846 void printDebugTracesAtEnd() override;
847};
848} // end namespace llvm
849
850/// Look for a meaningful debug location on the instruction or its
851/// operands.
853 if (!I)
854 return DebugLoc();
855
857 if (I->getDebugLoc() != Empty)
858 return I->getDebugLoc();
859
860 for (Use &Op : I->operands()) {
861 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
862 if (OpInst->getDebugLoc() != Empty)
863 return OpInst->getDebugLoc();
864 }
865
866 return I->getDebugLoc();
867}
868
869/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
870/// is passed, the message relates to that particular instruction.
871#ifndef NDEBUG
872static void debugVectorizationMessage(const StringRef Prefix,
873 const StringRef DebugMsg,
874 Instruction *I) {
875 dbgs() << "LV: " << Prefix << DebugMsg;
876 if (I != nullptr)
877 dbgs() << " " << *I;
878 else
879 dbgs() << '.';
880 dbgs() << '\n';
881}
882#endif
883
884/// Create an analysis remark that explains why vectorization failed
885///
886/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
887/// RemarkName is the identifier for the remark. If \p I is passed it is an
888/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
889/// the location of the remark. If \p DL is passed, use it as debug location for
890/// the remark. \return the remark object that can be streamed to.
892createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
893 Instruction *I, DebugLoc DL = {}) {
894 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
895 // If debug location is attached to the instruction, use it. Otherwise if DL
896 // was not provided, use the loop's.
897 if (I && I->getDebugLoc())
898 DL = I->getDebugLoc();
899 else if (!DL)
900 DL = TheLoop->getStartLoc();
901
902 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
903}
904
905namespace llvm {
906
907/// Return a value for Step multiplied by VF.
909 int64_t Step) {
910 assert(Ty->isIntegerTy() && "Expected an integer step");
911 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
912}
913
914/// Return the runtime value for VF.
916 return B.CreateElementCount(Ty, VF);
917}
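// Illustrative results for the two helpers above (assumptions about the
// emitted IR, not verbatim output): for a fixed VF of 4 and Step == 2 the
// step helper returns the constant 8; for a scalable VF of <vscale x 4> and
// Step == 2 it emits a value equivalent to 'vscale * 8'. The runtime-VF
// helper is simply the Step == 1 case, i.e. 4 or 'vscale * 4' respectively.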
918
920 Loop *OrigLoop) {
921 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
922 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
923
924 ScalarEvolution &SE = *PSE.getSE();
925 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
926}
927
929 const StringRef OREMsg, const StringRef ORETag,
930 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
931 Instruction *I) {
932 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
933 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
934 ORE->emit(
935 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
936 << "loop not vectorized: " << OREMsg);
937}
938
939/// Reports an informative message: print \p Msg for debugging purposes as well
940/// as an optimization remark. Uses either \p I as location of the remark, or
941/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
942/// remark.
943static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
945 Loop *TheLoop, Instruction *I = nullptr,
946 DebugLoc DL = {}) {
948 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
949 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
950 I, DL)
951 << Msg);
952}
953
954/// Report successful vectorization of the loop. In case an outer loop is
955/// vectorized, prepend "outer" to the vectorization remark.
957 VectorizationFactor VF, unsigned IC) {
959 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
960 nullptr));
961 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
962 ORE->emit([&]() {
963 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
964 TheLoop->getHeader())
965 << "vectorized " << LoopType << "loop (vectorization width: "
966 << ore::NV("VectorizationFactor", VF.Width)
967 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
968 });
969}
970
971} // end namespace llvm
972
973namespace llvm {
974
975// Loop vectorization cost-model hints how the scalar epilogue loop should be
976// lowered.
978
979 // The default: allowing scalar epilogues.
981
982 // Vectorization with OptForSize: don't allow epilogues.
984
985 // A special case of vectorization with OptForSize: loops with a very small
986 // trip count are considered for vectorization under OptForSize, thereby
987 // making sure the cost of their loop body is dominant, free of runtime
988 // guards and scalar iteration overheads.
990
991 // Loop hint predicate indicating an epilogue is undesired.
993
994 // Directive indicating we must either tail fold or not vectorize
997
998using InstructionVFPair = std::pair<Instruction *, ElementCount>;
999
1000/// LoopVectorizationCostModel - estimates the expected speedups due to
1001/// vectorization.
1002/// In many cases vectorization is not profitable. This can happen because of
1003/// a number of reasons. In this class we mainly attempt to predict the
1004/// expected speedup/slowdowns due to the supported instruction set. We use the
1005/// TargetTransformInfo to query the different backends for the cost of
1006/// different operations.
1008public:
1012 const TargetTransformInfo &TTI,
1018 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1019 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1020 Hints(Hints), InterleaveInfo(IAI) {}
1021
1022 /// \return An upper bound for the vectorization factors (both fixed and
1023 /// scalable). If the factors are 0, vectorization and interleaving should be
1024 /// avoided up front.
1025 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1026
1027 /// \return True if runtime checks are required for vectorization, and false
1028 /// otherwise.
1029 bool runtimeChecksRequired();
1030
1031 /// Setup cost-based decisions for user vectorization factor.
1032 /// \return true if the UserVF is a feasible VF to be chosen.
1036 return expectedCost(UserVF).isValid();
1037 }
1038
1039 /// \return The size (in bits) of the smallest and widest types in the code
1040 /// that needs to be vectorized. We ignore values that remain scalar such as
1041 /// 64 bit loop indices.
1042 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1043
1044 /// \return The desired interleave count.
1045 /// If interleave count has been specified by metadata it will be returned.
1046 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1047 /// are the selected vectorization factor and the cost of the selected VF.
1048 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1049
1050 /// A memory access instruction may be vectorized in more than one way.
1051 /// The form of the instruction after vectorization depends on cost.
1052 /// This function takes cost-based decisions for Load/Store instructions
1053 /// and collects them in a map. This decisions map is used for building
1054 /// the lists of loop-uniform and loop-scalar instructions.
1055 /// The calculated cost is saved with widening decision in order to
1056 /// avoid redundant calculations.
1058
1059 /// A call may be vectorized in different ways depending on whether we have
1060 /// vectorized variants available and whether the target supports masking.
1061 /// This function analyzes all calls in the function at the supplied VF,
1062 /// makes a decision based on the costs of available options, and stores that
1063 /// decision in a map for use in planning and plan execution.
1065
1066 /// A struct that represents some properties of the register usage
1067 /// of a loop.
1069 /// Holds the number of loop invariant values that are used in the loop.
1070 /// The key is ClassID of target-provided register class.
1072 /// Holds the maximum number of concurrent live intervals in the loop.
1073 /// The key is ClassID of target-provided register class.
1075 };
1076
1077 /// \return Returns information about the register usages of the loop for the
1078 /// given vectorization factors.
1081
1082 /// Collect values we want to ignore in the cost model.
1083 void collectValuesToIgnore();
1084
1085 /// Collect all element types in the loop for which widening is needed.
1087
1088 /// Split reductions into those that happen in the loop, and those that happen
1089 /// outside. In-loop reductions are collected into InLoopReductions.
1091
1092 /// Returns true if we should use strict in-order reductions for the given
1093 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1094 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1095 /// of FP operations.
1096 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1097 return !Hints->allowReordering() && RdxDesc.isOrdered();
1098 }
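  // Illustrative example for useOrderedReductions (not from this file):
  // without reassociation (e.g. no fast-math flags), a floating-point
  // reduction such as
  //
  //   float S = 0.0f;
  //   for (int i = 0; i < n; ++i)
  //     S += A[i];
  //
  // must preserve the original evaluation order, so its RecurrenceDescriptor
  // is marked ordered and, unless reordering is allowed, this returns true.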
1099
1100 /// \returns The smallest bitwidth each instruction can be represented with.
1101 /// The vector equivalents of these instructions should be truncated to this
1102 /// type.
1104 return MinBWs;
1105 }
1106
1107 /// \returns True if it is more profitable to scalarize instruction \p I for
1108 /// vectorization factor \p VF.
1110 assert(VF.isVector() &&
1111 "Profitable to scalarize relevant only for VF > 1.");
1112 assert(
1113 TheLoop->isInnermost() &&
1114 "cost-model should not be used for outer loops (in VPlan-native path)");
1115
1116 auto Scalars = InstsToScalarize.find(VF);
1117 assert(Scalars != InstsToScalarize.end() &&
1118 "VF not yet analyzed for scalarization profitability");
1119 return Scalars->second.contains(I);
1120 }
1121
1122 /// Returns true if \p I is known to be uniform after vectorization.
1124 assert(
1125 TheLoop->isInnermost() &&
1126 "cost-model should not be used for outer loops (in VPlan-native path)");
1127 // A pseudo probe needs to be duplicated for each unrolled iteration and
1128 // vector lane so that the profiled loop trip count can be accurately
1129 // accumulated instead of being undercounted.
1130 if (isa<PseudoProbeInst>(I))
1131 return false;
1132
1133 if (VF.isScalar())
1134 return true;
1135
1136 auto UniformsPerVF = Uniforms.find(VF);
1137 assert(UniformsPerVF != Uniforms.end() &&
1138 "VF not yet analyzed for uniformity");
1139 return UniformsPerVF->second.count(I);
1140 }
1141
1142 /// Returns true if \p I is known to be scalar after vectorization.
1144 assert(
1145 TheLoop->isInnermost() &&
1146 "cost-model should not be used for outer loops (in VPlan-native path)");
1147 if (VF.isScalar())
1148 return true;
1149
1150 auto ScalarsPerVF = Scalars.find(VF);
1151 assert(ScalarsPerVF != Scalars.end() &&
1152 "Scalar values are not calculated for VF");
1153 return ScalarsPerVF->second.count(I);
1154 }
1155
1156 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1157 /// for vectorization factor \p VF.
1159 return VF.isVector() && MinBWs.contains(I) &&
1160 !isProfitableToScalarize(I, VF) &&
1162 }
1163
1164 /// Decision that was taken during cost calculation for memory instruction.
1167 CM_Widen, // For consecutive accesses with stride +1.
1168 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1175
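  // Illustrative examples for the widening decisions above (not from this
  // file):
  //
  //   for (i = 0; i < n; ++i) Sum += A[i];      // stride +1 -> CM_Widen
  //   for (i = 0; i < n; ++i) Sum += A[n-1-i];  // stride -1 -> CM_Widen_Reverse
  //                                             //   (wide load plus a reverse)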
1176 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1177 /// instruction \p I and vector width \p VF.
1180 assert(VF.isVector() && "Expected VF >=2");
1181 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1182 }
1183
1184 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1185 /// interleaving group \p Grp and vector width \p VF.
1189 assert(VF.isVector() && "Expected VF >=2");
1190 /// Broadcast this decision to all instructions inside the group.
1191 /// But the cost will be assigned to one instruction only.
1192 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1193 if (auto *I = Grp->getMember(i)) {
1194 if (Grp->getInsertPos() == I)
1195 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1196 else
1197 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1198 }
1199 }
1200 }
1201
1202 /// Return the cost model decision for the given instruction \p I and vector
1203 /// width \p VF. Return CM_Unknown if this instruction did not pass
1204 /// through the cost modeling.
1206 assert(VF.isVector() && "Expected VF to be a vector VF");
1207 assert(
1208 TheLoop->isInnermost() &&
1209 "cost-model should not be used for outer loops (in VPlan-native path)");
1210
1211 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1212 auto Itr = WideningDecisions.find(InstOnVF);
1213 if (Itr == WideningDecisions.end())
1214 return CM_Unknown;
1215 return Itr->second.first;
1216 }
1217
1218 /// Return the vectorization cost for the given instruction \p I and vector
1219 /// width \p VF.
1221 assert(VF.isVector() && "Expected VF >=2");
1222 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1223 assert(WideningDecisions.contains(InstOnVF) &&
1224 "The cost is not calculated");
1225 return WideningDecisions[InstOnVF].second;
1226 }
1227
1232 std::optional<unsigned> MaskPos;
1234 };
1235
1237 Function *Variant, Intrinsic::ID IID,
1238 std::optional<unsigned> MaskPos,
1240 assert(!VF.isScalar() && "Expected vector VF");
1241 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1242 MaskPos, Cost};
1243 }
1244
1246 ElementCount VF) const {
1247 assert(!VF.isScalar() && "Expected vector VF");
1248 return CallWideningDecisions.at(std::make_pair(CI, VF));
1249 }
1250
1251 /// Return True if instruction \p I is an optimizable truncate whose operand
1252 /// is an induction variable. Such a truncate will be removed by adding a new
1253 /// induction variable with the destination type.
1255 // If the instruction is not a truncate, return false.
1256 auto *Trunc = dyn_cast<TruncInst>(I);
1257 if (!Trunc)
1258 return false;
1259
1260 // Get the source and destination types of the truncate.
1261 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1262 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1263
1264 // If the truncate is free for the given types, return false. Replacing a
1265 // free truncate with an induction variable would add an induction variable
1266 // update instruction to each iteration of the loop. We exclude from this
1267 // check the primary induction variable since it will need an update
1268 // instruction regardless.
1269 Value *Op = Trunc->getOperand(0);
1270 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1271 return false;
1272
1273 // If the truncated value is not an induction variable, return false.
1274 return Legal->isInductionPhi(Op);
1275 }
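  // Illustrative example (not from this file): in a loop with a primary i64
  // induction variable %i, a secondary induction formed as
  //
  //   %idx = trunc i64 %i to i32
  //
  // can be replaced by a new i32 induction variable, so the truncate is
  // considered optimizable here unless the target reports it as free.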
1276
1277 /// Collects the instructions to scalarize for each predicated instruction in
1278 /// the loop.
1280
1281 /// Collect Uniform and Scalar values for the given \p VF.
1282 /// The sets depend on CM decision for Load/Store instructions
1283 /// that may be vectorized as interleave, gather-scatter or scalarized.
1284 /// Also make a decision on what to do about call instructions in the loop
1285 /// at that VF -- scalarize, call a known vector routine, or call a
1286 /// vector intrinsic.
1288 // Do the analysis once.
1289 if (VF.isScalar() || Uniforms.contains(VF))
1290 return;
1293 collectLoopUniforms(VF);
1294 collectLoopScalars(VF);
1295 }
1296
1297 /// Returns true if the target machine supports masked store operation
1298 /// for the given \p DataType and kind of access to \p Ptr.
1299 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1300 return Legal->isConsecutivePtr(DataType, Ptr) &&
1301 TTI.isLegalMaskedStore(DataType, Alignment);
1302 }
1303
1304 /// Returns true if the target machine supports masked load operation
1305 /// for the given \p DataType and kind of access to \p Ptr.
1306 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1307 return Legal->isConsecutivePtr(DataType, Ptr) &&
1308 TTI.isLegalMaskedLoad(DataType, Alignment);
1309 }
1310
1311 /// Returns true if the target machine can represent \p V as a masked gather
1312 /// or scatter operation.
1314 bool LI = isa<LoadInst>(V);
1315 bool SI = isa<StoreInst>(V);
1316 if (!LI && !SI)
1317 return false;
1318 auto *Ty = getLoadStoreType(V);
1320 if (VF.isVector())
1321 Ty = VectorType::get(Ty, VF);
1322 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1323 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1324 }
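  // Illustrative example (not from this file): an indexed access such as
  //
  //   for (i = 0; i < n; ++i) Sum += A[B[i]];
  //
  // reads from non-consecutive addresses, so the load of A[B[i]] can only be
  // widened as a (masked) gather, which is what this query checks the target
  // supports.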
1325
1326 /// Returns true if the target machine supports all of the reduction
1327 /// variables found for the given VF.
1329 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1330 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1331 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1332 }));
1333 }
1334
1335 /// Given costs for both strategies, return true if the scalar predication
1336 /// lowering should be used for div/rem. This incorporates an override
1337 /// option so it is not simply a cost comparison.
1339 InstructionCost SafeDivisorCost) const {
1340 switch (ForceSafeDivisor) {
1341 case cl::BOU_UNSET:
1342 return ScalarCost < SafeDivisorCost;
1343 case cl::BOU_TRUE:
1344 return false;
1345 case cl::BOU_FALSE:
1346 return true;
1347 };
1348 llvm_unreachable("impossible case value");
1349 }
1350
1351 /// Returns true if \p I is an instruction which requires predication and
1352 /// for which our chosen predication strategy is scalarization (i.e. we
1353 /// don't have an alternate strategy such as masking available).
1354 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1356
1357 /// Returns true if \p I is an instruction that needs to be predicated
1358 /// at runtime. The result is independent of the predication mechanism.
1359 /// Superset of instructions that return true for isScalarWithPredication.
1360 bool isPredicatedInst(Instruction *I) const;
1361
1362 /// Return the costs for our two available strategies for lowering a
1363 /// div/rem operation which requires speculating at least one lane.
1364 /// First result is for scalarization (will be invalid for scalable
1365 /// vectors); second is for the safe-divisor strategy.
1366 std::pair<InstructionCost, InstructionCost>
1368 ElementCount VF) const;
1369
1370 /// Returns true if \p I is a memory instruction with consecutive memory
1371 /// access that can be widened.
1373
1374 /// Returns true if \p I is a memory instruction in an interleaved-group
1375 /// of memory accesses that can be vectorized with wide vector loads/stores
1376 /// and shuffles.
1378
1379 /// Check if \p Instr belongs to any interleaved access group.
1381 return InterleaveInfo.isInterleaved(Instr);
1382 }
1383
1384 /// Get the interleaved access group that \p Instr belongs to.
1387 return InterleaveInfo.getInterleaveGroup(Instr);
1388 }
1389
1390 /// Returns true if we're required to use a scalar epilogue for at least
1391 /// the final iteration of the original loop.
1392 bool requiresScalarEpilogue(bool IsVectorizing) const {
1393 if (!isScalarEpilogueAllowed()) {
1394 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1395 return false;
1396 }
1397 // If we might exit from anywhere but the latch, must run the exiting
1398 // iteration in scalar form.
1400 LLVM_DEBUG(
1401 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1402 return true;
1403 }
1404 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1405 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1406 "interleaved group requires scalar epilogue\n");
1407 return true;
1408 }
1409 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1410 return false;
1411 }
1412
1413 /// Returns true if we're required to use a scalar epilogue for at least
1414 /// the final iteration of the original loop for all VFs in \p Range.
1415 /// A scalar epilogue must either be required for all VFs in \p Range or for
1416 /// none.
1418 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1419 return requiresScalarEpilogue(VF.isVector());
1420 };
1421 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1422 assert(
1423 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1424 "all VFs in range must agree on whether a scalar epilogue is required");
1425 return IsRequired;
1426 }
1427
1428 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1429 /// loop hint annotation.
1431 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1432 }
1433
1434 /// Returns the TailFoldingStyle that is best for the current loop.
1435 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1436 if (!ChosenTailFoldingStyle)
1438 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1439 : ChosenTailFoldingStyle->second;
1440 }
1441
1442 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1443 /// overflow or not.
1444 /// \param IsScalableVF true if scalable vector factors enabled.
1445 /// \param UserIC User specific interleave count.
1446 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1447 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1448 if (!Legal->canFoldTailByMasking()) {
1449 ChosenTailFoldingStyle =
1451 return;
1452 }
1453
1454 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1455 ChosenTailFoldingStyle = std::make_pair(
1456 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1457 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1458 return;
1459 }
1460
1461 // Set styles when forced.
1462 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1463 ForceTailFoldingStyle.getValue());
1465 return;
1466 // Override forced styles if needed.
1467 // FIXME: use actual opcode/data type for analysis here.
1468 // FIXME: Investigate opportunity for fixed vector factor.
1469 bool EVLIsLegal =
1470 IsScalableVF && UserIC <= 1 &&
1471 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1473 // FIXME: implement support for max safe dependency distance.
1475 if (!EVLIsLegal) {
1476 // If for some reason EVL mode is unsupported, fall back to
1477 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1478 // in a generic way.
1479 ChosenTailFoldingStyle =
1482 LLVM_DEBUG(
1483 dbgs()
1484 << "LV: Preference for VP intrinsics indicated. Will "
1485 "not try to generate VP Intrinsics "
1486 << (UserIC > 1
1487 ? "since interleave count specified is greater than 1.\n"
1488 : "due to non-interleaving reasons.\n"));
1489 }
1490 }
1491
1492 /// Returns true if all loop blocks should be masked to fold tail loop.
1493 bool foldTailByMasking() const {
1494 // TODO: check if it is possible to check for None style independent of
1495 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1497 }
1498
1499 /// Returns true if the instructions in this block requires predication
1500 /// for any reason, e.g. because tail folding now requires a predicate
1501 /// or because the block in the original loop was predicated.
1504 }
1505
1506 /// Returns true if VP intrinsics with explicit vector length support should
1507 /// be generated in the tail folded loop.
1508 bool foldTailWithEVL() const {
1510 }
1511
1512 /// Returns true if the Phi is part of an inloop reduction.
1513 bool isInLoopReduction(PHINode *Phi) const {
1514 return InLoopReductions.contains(Phi);
1515 }
1516
1517 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1518 /// with factor VF. Return the cost of the instruction, including
1519 /// scalarization overhead if it's needed.
1521
1522 /// Estimate cost of a call instruction CI if it were vectorized with factor
1523 /// VF. Return the cost of the instruction, including scalarization overhead
1524 /// if it's needed.
1526
1527 /// Invalidates decisions already taken by the cost model.
1529 WideningDecisions.clear();
1530 CallWideningDecisions.clear();
1531 Uniforms.clear();
1532 Scalars.clear();
1533 }
1534
1535 /// Returns the expected execution cost. The unit of the cost does
1536 /// not matter because we use the 'cost' units to compare different
1537 /// vector widths. The cost that is returned is *not* normalized by
1538 /// the factor width.
1540
1541 bool hasPredStores() const { return NumPredStores > 0; }
1542
1543 /// Returns true if epilogue vectorization is considered profitable, and
1544 /// false otherwise.
1545 /// \p VF is the vectorization factor chosen for the original loop.
1547
1548 /// Returns the execution time cost of an instruction for a given vector
1549 /// width. Vector width of one means scalar.
1551
1552 /// Return the cost of instructions in an inloop reduction pattern, if I is
1553 /// part of that pattern.
1554 std::optional<InstructionCost>
1557
1558private:
1559 unsigned NumPredStores = 0;
1560
1561 /// \return An upper bound for the vectorization factors for both
1562 /// fixed and scalable vectorization, where the minimum-known number of
1563 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1564 /// disabled or unsupported, then the scalable part will be equal to
1565 /// ElementCount::getScalable(0).
1566 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1567 ElementCount UserVF,
1568 bool FoldTailByMasking);
1569
1570 /// \return the maximized element count based on the targets vector
1571 /// registers and the loop trip-count, but limited to a maximum safe VF.
1572 /// This is a helper function of computeFeasibleMaxVF.
1573 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1574 unsigned SmallestType,
1575 unsigned WidestType,
1576 ElementCount MaxSafeVF,
1577 bool FoldTailByMasking);
1578
1579 /// Checks if scalable vectorization is supported and enabled. Caches the
1580 /// result to avoid repeated debug dumps for repeated queries.
1581 bool isScalableVectorizationAllowed();
1582
1583 /// \return the maximum legal scalable VF, based on the safe max number
1584 /// of elements.
1585 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1586
1587 /// Calculate vectorization cost of memory instruction \p I.
1588 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1589
1590 /// The cost computation for scalarized memory instruction.
1591 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1592
1593 /// The cost computation for interleaving group of memory instructions.
1594 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1595
1596 /// The cost computation for Gather/Scatter instruction.
1597 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1598
1599 /// The cost computation for widening instruction \p I with consecutive
1600 /// memory access.
1601 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1602
1603 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1604 /// Load: scalar load + broadcast.
1605 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1606 /// element)
1607 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1608
1609 /// Estimate the overhead of scalarizing an instruction. This is a
1610 /// convenience wrapper for the type-based getScalarizationOverhead API.
1611 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1613
1614 /// Returns true if an artificially high cost for emulated masked memrefs
1615 /// should be used.
1616 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1617
1618 /// Map of scalar integer values to the smallest bitwidth they can be legally
1619 /// represented as. The vector equivalents of these values should be truncated
1620 /// to this type.
1622
1623 /// A type representing the costs for instructions if they were to be
1624 /// scalarized rather than vectorized. The entries are Instruction-Cost
1625 /// pairs.
1626 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1627
1628 /// A set containing all BasicBlocks that are known to be present after
1629 /// vectorization as predicated blocks.
1631 PredicatedBBsAfterVectorization;
1632
1633 /// Records whether it is allowed to have the original scalar loop execute at
1634 /// least once. This may be needed as a fallback loop in case runtime
1635 /// aliasing/dependence checks fail, or to handle the tail/remainder
1636 /// iterations when the trip count is unknown or doesn't divide by the VF,
1637 /// or as a peel-loop to handle gaps in interleave-groups.
1638 /// Under optsize and when the trip count is very small we don't allow any
1639 /// iterations to execute in the scalar loop.
1640 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1641
1642 /// Controls the finally chosen tail folding style. The first element is used
1643 /// if the IV update may overflow; the second element is used if it does not.
1644 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1645 ChosenTailFoldingStyle;
1646
1647 /// true if scalable vectorization is supported and enabled.
1648 std::optional<bool> IsScalableVectorizationAllowed;
1649
1650 /// A map holding scalar costs for different vectorization factors. The
1651 /// presence of a cost for an instruction in the mapping indicates that the
1652 /// instruction will be scalarized when vectorizing with the associated
1653 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1655
1656 /// Holds the instructions known to be uniform after vectorization.
1657 /// The data is collected per VF.
1659
1660 /// Holds the instructions known to be scalar after vectorization.
1661 /// The data is collected per VF.
1663
1664 /// Holds the instructions (address computations) that are forced to be
1665 /// scalarized.
1667
1668 /// PHINodes of the reductions that should be expanded in-loop.
1669 SmallPtrSet<PHINode *, 4> InLoopReductions;
1670
1671 /// A Map of inloop reduction operations and their immediate chain operand.
1672 /// FIXME: This can be removed once reductions can be costed correctly in
1673 /// VPlan. This was added to allow quick lookup of the inloop operations.
1674 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1675
1676 /// Returns the expected difference in cost from scalarizing the expression
1677 /// feeding a predicated instruction \p PredInst. The instructions to
1678 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1679 /// non-negative return value implies the expression will be scalarized.
1680 /// Currently, only single-use chains are considered for scalarization.
1681 InstructionCost computePredInstDiscount(Instruction *PredInst,
1682 ScalarCostsTy &ScalarCosts,
1683 ElementCount VF);
1684
1685 /// Collect the instructions that are uniform after vectorization. An
1686 /// instruction is uniform if we represent it with a single scalar value in
1687 /// the vectorized loop corresponding to each vector iteration. Examples of
1688 /// uniform instructions include pointer operands of consecutive or
1689 /// interleaved memory accesses. Note that although uniformity implies an
1690 /// instruction will be scalar, the reverse is not true. In general, a
1691 /// scalarized instruction will be represented by VF scalar values in the
1692 /// vectorized loop, each corresponding to an iteration of the original
1693 /// scalar loop.
1694 void collectLoopUniforms(ElementCount VF);
1695
1696 /// Collect the instructions that are scalar after vectorization. An
1697 /// instruction is scalar if it is known to be uniform or will be scalarized
1698 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1699 /// to the list if they are used by a load/store instruction that is marked as
1700 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1701 /// VF values in the vectorized loop, each corresponding to an iteration of
1702 /// the original scalar loop.
1703 void collectLoopScalars(ElementCount VF);
1704
1705 /// Keeps cost model vectorization decision and cost for instructions.
1706 /// Right now it is used for memory instructions only.
1708 std::pair<InstWidening, InstructionCost>>;
1709
1710 DecisionList WideningDecisions;
1711
1712 using CallDecisionList =
1713 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1714
1715 CallDecisionList CallWideningDecisions;
1716
1717 /// Returns true if \p V is expected to be vectorized and it needs to be
1718 /// extracted.
1719 bool needsExtract(Value *V, ElementCount VF) const {
1720 Instruction *I = dyn_cast<Instruction>(V);
1721 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1723 return false;
1724
1725 // Assume we can vectorize V (and hence we need extraction) if the
1726 // scalars are not computed yet. This can happen, because it is called
1727 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1728 // the scalars are collected. That should be a safe assumption in most
1729 // cases, because we check if the operands have vectorizable types
1730 // beforehand in LoopVectorizationLegality.
1731 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1732 };
1733
1734 /// Returns a range containing only operands needing to be extracted.
1735 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1736 ElementCount VF) const {
1738 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1739 }
1740
1741public:
1742 /// The loop that we evaluate.
1744
1745 /// Predicated scalar evolution analysis.
1747
1748 /// Loop Info analysis.
1750
1751 /// Vectorization legality.
1753
1754 /// Vector target information.
1756
1757 /// Target Library Info.
1759
1760 /// Demanded bits analysis.
1762
1763 /// Assumption cache.
1765
1766 /// Interface to emit optimization remarks.
1768
1770
1771 /// Loop Vectorize Hint.
1773
1774 /// The interleaved access information contains groups of interleaved accesses
1775 /// with the same stride that are close to each other.
1777
1778 /// Values to ignore in the cost model.
1780
1781 /// Values to ignore in the cost model when VF > 1.
1783
1784 /// All element types found in the loop.
1786};
1787} // end namespace llvm
1788
1789namespace {
1790/// Helper struct to manage generating runtime checks for vectorization.
1791///
1792 /// The runtime checks are created up-front in temporary blocks to allow better
1793 /// estimation of their cost, and are un-linked from the existing IR. After
1794 /// deciding to vectorize, the checks are moved back. If deciding not to
1795 /// vectorize, the temporary blocks are completely removed.
1796class GeneratedRTChecks {
1797 /// Basic block which contains the generated SCEV checks, if any.
1798 BasicBlock *SCEVCheckBlock = nullptr;
1799
1800 /// The value representing the result of the generated SCEV checks. If it is
1801 /// nullptr, either no SCEV checks have been generated or they have been used.
1802 Value *SCEVCheckCond = nullptr;
1803
1804 /// Basic block which contains the generated memory runtime checks, if any.
1805 BasicBlock *MemCheckBlock = nullptr;
1806
1807 /// The value representing the result of the generated memory runtime checks.
1808 /// If it is nullptr, either no memory runtime checks have been generated or
1809 /// they have been used.
1810 Value *MemRuntimeCheckCond = nullptr;
1811
1812 DominatorTree *DT;
1813 LoopInfo *LI;
1815
1816 SCEVExpander SCEVExp;
1817 SCEVExpander MemCheckExp;
1818
1819 bool CostTooHigh = false;
1820 const bool AddBranchWeights;
1821
1822 Loop *OuterLoop = nullptr;
1823
1824public:
1825 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1827 bool AddBranchWeights)
1828 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1829 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1830
1831 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1832 /// accurately estimate the cost of the runtime checks. The blocks are
1833 /// un-linked from the IR and are added back during vector code generation. If
1834 /// there is no vector code generation, the check blocks are removed
1835 /// completely.
1836 void Create(Loop *L, const LoopAccessInfo &LAI,
1837 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1838
1839 // Hard cutoff to limit compile-time increase in case a very large number of
1840 // runtime checks needs to be generated.
1841 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1842 // profile info.
1843 CostTooHigh =
1845 if (CostTooHigh)
1846 return;
1847
1848 BasicBlock *LoopHeader = L->getHeader();
1849 BasicBlock *Preheader = L->getLoopPreheader();
1850
1851 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1852 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1853 // may be used by SCEVExpander. The blocks will be un-linked from their
1854 // predecessors and removed from LI & DT at the end of the function.
1855 if (!UnionPred.isAlwaysTrue()) {
1856 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1857 nullptr, "vector.scevcheck");
1858
1859 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1860 &UnionPred, SCEVCheckBlock->getTerminator());
1861 }
1862
1863 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1864 if (RtPtrChecking.Need) {
1865 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1866 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1867 "vector.memcheck");
1868
1869 auto DiffChecks = RtPtrChecking.getDiffChecks();
1870 if (DiffChecks) {
1871 Value *RuntimeVF = nullptr;
1872 MemRuntimeCheckCond = addDiffRuntimeChecks(
1873 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1874 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1875 if (!RuntimeVF)
1876 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1877 return RuntimeVF;
1878 },
1879 IC);
1880 } else {
1881 MemRuntimeCheckCond = addRuntimeChecks(
1882 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1884 }
1885 assert(MemRuntimeCheckCond &&
1886 "no RT checks generated although RtPtrChecking "
1887 "claimed checks are required");
1888 }
1889
1890 if (!MemCheckBlock && !SCEVCheckBlock)
1891 return;
1892
1893 // Unhook the temporary block with the checks, update various places
1894 // accordingly.
1895 if (SCEVCheckBlock)
1896 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1897 if (MemCheckBlock)
1898 MemCheckBlock->replaceAllUsesWith(Preheader);
1899
1900 if (SCEVCheckBlock) {
1901 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1902 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1903 Preheader->getTerminator()->eraseFromParent();
1904 }
1905 if (MemCheckBlock) {
1906 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1907 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1908 Preheader->getTerminator()->eraseFromParent();
1909 }
1910
1911 DT->changeImmediateDominator(LoopHeader, Preheader);
1912 if (MemCheckBlock) {
1913 DT->eraseNode(MemCheckBlock);
1914 LI->removeBlock(MemCheckBlock);
1915 }
1916 if (SCEVCheckBlock) {
1917 DT->eraseNode(SCEVCheckBlock);
1918 LI->removeBlock(SCEVCheckBlock);
1919 }
1920
1921 // Outer loop is used as part of the later cost calculations.
1922 OuterLoop = L->getParentLoop();
1923 }
1924
1925 InstructionCost getCost() {
1926 if (SCEVCheckBlock || MemCheckBlock)
1927 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1928
1929 if (CostTooHigh) {
1931 Cost.setInvalid();
1932 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1933 return Cost;
1934 }
1935
1936 InstructionCost RTCheckCost = 0;
1937 if (SCEVCheckBlock)
1938 for (Instruction &I : *SCEVCheckBlock) {
1939 if (SCEVCheckBlock->getTerminator() == &I)
1940 continue;
1943 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1944 RTCheckCost += C;
1945 }
1946 if (MemCheckBlock) {
1947 InstructionCost MemCheckCost = 0;
1948 for (Instruction &I : *MemCheckBlock) {
1949 if (MemCheckBlock->getTerminator() == &I)
1950 continue;
1953 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1954 MemCheckCost += C;
1955 }
1956
1957 // If the runtime memory checks are being created inside an outer loop
1958 // we should find out if these checks are outer loop invariant. If so,
1959 // the checks will likely be hoisted out and so the effective cost will be
1960 // reduced according to the outer loop trip count.
1961 if (OuterLoop) {
1962 ScalarEvolution *SE = MemCheckExp.getSE();
1963 // TODO: If profitable, we could refine this further by analysing every
1964 // individual memory check, since there could be a mixture of loop
1965 // variant and invariant checks that mean the final condition is
1966 // variant.
1967 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1968 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1969 // It seems reasonable to assume that we can reduce the effective
1970 // cost of the checks even when we know nothing about the trip
1971 // count. Assume that the outer loop executes at least twice.
1972 unsigned BestTripCount = 2;
1973
1974 // If the exact trip count is known, use that.
1975 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
1976 BestTripCount = SmallTC;
1978 // Else use profile data if available.
1979 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
1980 BestTripCount = *EstimatedTC;
1981 }
1982
1983 BestTripCount = std::max(BestTripCount, 1U);
1984 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1985
1986 // Let's ensure the cost is always at least 1.
1987 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1989
1990 if (BestTripCount > 1)
1992 << "We expect runtime memory checks to be hoisted "
1993 << "out of the outer loop. Cost reduced from "
1994 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1995
1996 MemCheckCost = NewMemCheckCost;
1997 }
1998 }
1999
2000 RTCheckCost += MemCheckCost;
2001 }
2002
2003 if (SCEVCheckBlock || MemCheckBlock)
2004 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2005 << "\n");
2006
2007 return RTCheckCost;
2008 }
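// Worked example for the outer-loop amortization in getCost() above
// (hypothetical numbers, for illustration only): if the memory-check block
// costs 20 and the enclosing outer loop has an estimated trip count of 10,
// the checks are assumed to be hoisted and amortized, so the effective cost
// becomes 20 / 10 = 2 (clamped to be at least 1).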
2009
2010 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2011 /// unused.
2012 ~GeneratedRTChecks() {
2013 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2014 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2015 if (!SCEVCheckCond)
2016 SCEVCleaner.markResultUsed();
2017
2018 if (!MemRuntimeCheckCond)
2019 MemCheckCleaner.markResultUsed();
2020
2021 if (MemRuntimeCheckCond) {
2022 auto &SE = *MemCheckExp.getSE();
2023 // Memory runtime check generation creates compares that use expanded
2024 // values. Remove them before running the SCEVExpanderCleaners.
2025 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2026 if (MemCheckExp.isInsertedInstruction(&I))
2027 continue;
2028 SE.forgetValue(&I);
2029 I.eraseFromParent();
2030 }
2031 }
2032 MemCheckCleaner.cleanup();
2033 SCEVCleaner.cleanup();
2034
2035 if (SCEVCheckCond)
2036 SCEVCheckBlock->eraseFromParent();
2037 if (MemRuntimeCheckCond)
2038 MemCheckBlock->eraseFromParent();
2039 }
2040
2041 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2042 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2043 /// depending on the generated condition.
2044 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2045 BasicBlock *LoopVectorPreHeader,
2046 BasicBlock *LoopExitBlock) {
2047 if (!SCEVCheckCond)
2048 return nullptr;
2049
2050 Value *Cond = SCEVCheckCond;
2051 // Mark the check as used, to prevent it from being removed during cleanup.
2052 SCEVCheckCond = nullptr;
2053 if (auto *C = dyn_cast<ConstantInt>(Cond))
2054 if (C->isZero())
2055 return nullptr;
2056
2057 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2058
2059 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2060 // Create new preheader for vector loop.
2061 if (OuterLoop)
2062 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2063
2064 SCEVCheckBlock->getTerminator()->eraseFromParent();
2065 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2066 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2067 SCEVCheckBlock);
2068
2069 DT->addNewBlock(SCEVCheckBlock, Pred);
2070 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2071
2072 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2073 if (AddBranchWeights)
2074 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2075 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2076 return SCEVCheckBlock;
2077 }
2078
2079 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2080 /// the branches to branch to the vector preheader or \p Bypass, depending on
2081 /// the generated condition.
2082 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2083 BasicBlock *LoopVectorPreHeader) {
2084 // Check if we generated code that checks at runtime whether arrays overlap.
2085 if (!MemRuntimeCheckCond)
2086 return nullptr;
2087
2088 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2090 MemCheckBlock);
2091
2092 DT->addNewBlock(MemCheckBlock, Pred);
2093 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2094 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2095
2096 if (OuterLoop)
2097 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2098
2099 BranchInst &BI =
2100 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2101 if (AddBranchWeights) {
2102 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2103 }
2104 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2105 MemCheckBlock->getTerminator()->setDebugLoc(
2106 Pred->getTerminator()->getDebugLoc());
2107
2108 // Mark the check as used, to prevent it from being removed during cleanup.
2109 MemRuntimeCheckCond = nullptr;
2110 return MemCheckBlock;
2111 }
2112};
2113} // namespace
2114
2116 return Style == TailFoldingStyle::Data ||
2117 Style == TailFoldingStyle::DataAndControlFlow ||
2118 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2119}
2120
2122 return Style == TailFoldingStyle::DataAndControlFlow ||
2123 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2124}
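// Illustrative sketch of tail folding by masking (hypothetical trip count and
// VF): for a loop of 10 iterations vectorized at VF = 4, the third vector
// iteration covers lanes 8..11 and executes under the mask <1,1,0,0>, so no
// scalar remainder loop is needed. The styles handled above additionally
// differ in whether such a mask (an active lane mask) also drives the loop's
// control flow.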
2125
2126// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2127// vectorization. The loop needs to be annotated with #pragma omp simd
2128// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2129// vector length information is not provided, vectorization is not considered
2130// explicit. Interleave hints are not allowed either. These limitations will be
2131// relaxed in the future.
2132 // Please note that we are currently forced to abuse the pragma 'clang
2133// vectorize' semantics. This pragma provides *auto-vectorization hints*
2134// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2135// provides *explicit vectorization hints* (LV can bypass legal checks and
2136// assume that vectorization is legal). However, both hints are implemented
2137// using the same metadata (llvm.loop.vectorize, processed by
2138// LoopVectorizeHints). This will be fixed in the future when the native IR
2139// representation for pragma 'omp simd' is introduced.
2140static bool isExplicitVecOuterLoop(Loop *OuterLp,
2142 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2143 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2144
2145 // Only outer loops with an explicit vectorization hint are supported.
2146 // Unannotated outer loops are ignored.
2148 return false;
2149
2150 Function *Fn = OuterLp->getHeader()->getParent();
2151 if (!Hints.allowVectorization(Fn, OuterLp,
2152 true /*VectorizeOnlyWhenForced*/)) {
2153 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2154 return false;
2155 }
2156
2157 if (Hints.getInterleave() > 1) {
2158 // TODO: Interleave support is future work.
2159 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2160 "outer loops.\n");
2161 Hints.emitRemarkWithHints();
2162 return false;
2163 }
2164
2165 return true;
2166}
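// Illustrative example of an outer loop that would satisfy the checks above
// (hypothetical source code):
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// Without an explicit vector width, or with an interleave hint greater than
// one, the loop would be rejected here.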
2167
2171 // Collect inner loops and outer loops without irreducible control flow. For
2172 // now, only collect outer loops that have explicit vectorization hints. If we
2173 // are stress testing the VPlan H-CFG construction, we collect the outermost
2174 // loop of every loop nest.
2175 if (L.isInnermost() || VPlanBuildStressTest ||
2177 LoopBlocksRPO RPOT(&L);
2178 RPOT.perform(LI);
2179 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2180 V.push_back(&L);
2181 // TODO: Collect inner loops inside marked outer loops in case
2182 // vectorization fails for the outer loop. Do not invoke
2183 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2184 // already known to be reducible. We can use an inherited attribute for
2185 // that.
2186 return;
2187 }
2188 }
2189 for (Loop *InnerL : L)
2190 collectSupportedLoops(*InnerL, LI, ORE, V);
2191}
2192
2193//===----------------------------------------------------------------------===//
2194// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2195// LoopVectorizationCostModel and LoopVectorizationPlanner.
2196//===----------------------------------------------------------------------===//
2197
2198/// Compute the transformed value of Index at offset StartValue using step
2199/// StepValue.
2200/// For integer induction, returns StartValue + Index * StepValue.
2201/// For pointer induction, returns StartValue[Index * StepValue].
2202/// FIXME: The newly created binary instructions should contain nsw/nuw
2203/// flags, which can be found from the original scalar operations.
2204static Value *
2206 Value *Step,
2208 const BinaryOperator *InductionBinOp) {
2209 Type *StepTy = Step->getType();
2210 Value *CastedIndex = StepTy->isIntegerTy()
2211 ? B.CreateSExtOrTrunc(Index, StepTy)
2212 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2213 if (CastedIndex != Index) {
2214 CastedIndex->setName(CastedIndex->getName() + ".cast");
2215 Index = CastedIndex;
2216 }
2217
2218 // Note: the IR at this point is broken. We cannot use SE to create any new
2219 // SCEV and then expand it, hoping that SCEV's simplification will give us
2220 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2221 // lead to various SCEV crashes. So all we can do is use the builder and rely
2222 // on InstCombine for future simplifications. Here we handle some trivial
2223 // cases only.
2224 auto CreateAdd = [&B](Value *X, Value *Y) {
2225 assert(X->getType() == Y->getType() && "Types don't match!");
2226 if (auto *CX = dyn_cast<ConstantInt>(X))
2227 if (CX->isZero())
2228 return Y;
2229 if (auto *CY = dyn_cast<ConstantInt>(Y))
2230 if (CY->isZero())
2231 return X;
2232 return B.CreateAdd(X, Y);
2233 };
2234
2235 // We allow X to be a vector type, in which case Y will potentially be
2236 // splatted into a vector with the same element count.
2237 auto CreateMul = [&B](Value *X, Value *Y) {
2238 assert(X->getType()->getScalarType() == Y->getType() &&
2239 "Types don't match!");
2240 if (auto *CX = dyn_cast<ConstantInt>(X))
2241 if (CX->isOne())
2242 return Y;
2243 if (auto *CY = dyn_cast<ConstantInt>(Y))
2244 if (CY->isOne())
2245 return X;
2246 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2247 if (XVTy && !isa<VectorType>(Y->getType()))
2248 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2249 return B.CreateMul(X, Y);
2250 };
2251
2252 switch (InductionKind) {
2254 assert(!isa<VectorType>(Index->getType()) &&
2255 "Vector indices not supported for integer inductions yet");
2256 assert(Index->getType() == StartValue->getType() &&
2257 "Index type does not match StartValue type");
2258 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2259 return B.CreateSub(StartValue, Index);
2260 auto *Offset = CreateMul(Index, Step);
2261 return CreateAdd(StartValue, Offset);
2262 }
2264 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2266 assert(!isa<VectorType>(Index->getType()) &&
2267 "Vector indices not supported for FP inductions yet");
2268 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2269 assert(InductionBinOp &&
2270 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2271 InductionBinOp->getOpcode() == Instruction::FSub) &&
2272 "Original bin op should be defined for FP induction");
2273
2274 Value *MulExp = B.CreateFMul(Step, Index);
2275 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2276 "induction");
2277 }
2279 return nullptr;
2280 }
2281 llvm_unreachable("invalid enum");
2282}
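// Worked example for emitTransformedIndex() above (hypothetical values): for
// an integer induction with StartValue = 100 and Step = 3, the transformed
// value at Index = 5 is 100 + 5 * 3 = 115. For a pointer induction the same
// product 5 * 3 is used as the offset of a ptradd from StartValue, and for an
// FP induction with an fadd binop the result is 100.0 + 5.0 * 3.0.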
2283
2284std::optional<unsigned> getMaxVScale(const Function &F,
2285 const TargetTransformInfo &TTI) {
2286 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2287 return MaxVScale;
2288
2289 if (F.hasFnAttribute(Attribute::VScaleRange))
2290 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2291
2292 return std::nullopt;
2293}
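// Illustrative example: if the target does not report a maximum vscale but
// the enclosing function carries the attribute vscale_range(1,16), the value
// 16 is returned; a scalable VF of "vscale x 4" then covers at most
// 16 * 4 = 64 elements.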
2294
2295/// For the given VF and UF and maximum trip count computed for the loop, return
2296/// whether the induction variable might overflow in the vectorized loop. If not,
2297/// then we know a runtime overflow check always evaluates to false and can be
2298/// removed.
2301 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2302 // Always be conservative if we don't know the exact unroll factor.
2303 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2304
2305 Type *IdxTy = Cost->Legal->getWidestInductionType();
2306 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2307
2308 // The runtime overflow check is known false iff the (max) trip-count
2309 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2310 // the vector loop induction variable.
2311 if (unsigned TC =
2312 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2313 uint64_t MaxVF = VF.getKnownMinValue();
2314 if (VF.isScalable()) {
2315 std::optional<unsigned> MaxVScale =
2316 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2317 if (!MaxVScale)
2318 return false;
2319 MaxVF *= *MaxVScale;
2320 }
2321
2322 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2323 }
2324
2325 return false;
2326}
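// Worked example for the overflow check above (hypothetical values): with an
// i16 widest induction type, the maximum representable trip count is
// 2^16 - 1 = 65535. If the maximum trip count is known to be 1000 and
// VF * UF = 8 * 4 = 32 (a scalable VF is additionally multiplied by the
// maximum vscale), then 65535 - 1000 > 32, so the overflow check is known
// false and can be removed.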
2327
2328// Return whether we allow using masked interleave-groups (for dealing with
2329// strided loads/stores that reside in predicated blocks, or for dealing
2330// with gaps).
2332 // If an override option has been passed in for interleaved accesses, use it.
2333 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2335
2337}
2338
2340 VPReplicateRecipe *RepRecipe,
2341 const VPIteration &Instance,
2342 VPTransformState &State) {
2343 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2344
2345 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2346 // the first lane and part.
2347 if (isa<NoAliasScopeDeclInst>(Instr))
2348 if (!Instance.isFirstIteration())
2349 return;
2350
2351 // Does this instruction return a value?
2352 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2353
2354 Instruction *Cloned = Instr->clone();
2355 if (!IsVoidRetTy) {
2356 Cloned->setName(Instr->getName() + ".cloned");
2357#if !defined(NDEBUG)
2358 // Verify that VPlan type inference results agree with the type of the
2359 // generated values.
2360 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2361 "inferred type and type from generated instructions do not match");
2362#endif
2363 }
2364
2365 RepRecipe->setFlags(Cloned);
2366
2367 if (auto DL = Instr->getDebugLoc())
2368 State.setDebugLocFrom(DL);
2369
2370 // Replace the operands of the cloned instructions with their scalar
2371 // equivalents in the new loop.
2372 for (const auto &I : enumerate(RepRecipe->operands())) {
2373 auto InputInstance = Instance;
2374 VPValue *Operand = I.value();
2376 InputInstance.Lane = VPLane::getFirstLane();
2377 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2378 }
2379 State.addNewMetadata(Cloned, Instr);
2380
2381 // Place the cloned scalar in the new loop.
2382 State.Builder.Insert(Cloned);
2383
2384 State.set(RepRecipe, Cloned, Instance);
2385
2386 // If we just cloned a new assumption, add it the assumption cache.
2387 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2389
2390 // End if-block.
2391 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2392 if (IfPredicateInstr)
2393 PredicatedInstructions.push_back(Cloned);
2394}
2395
2396Value *
2398 if (VectorTripCount)
2399 return VectorTripCount;
2400
2401 Value *TC = getTripCount();
2402 IRBuilder<> Builder(InsertBlock->getTerminator());
2403
2404 Type *Ty = TC->getType();
2405 // This is where we can make the step a runtime constant.
2406 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2407
2408 // If the tail is to be folded by masking, round the number of iterations N
2409 // up to a multiple of Step instead of rounding down. This is done by first
2410 // adding Step-1 and then rounding down. Note that it's ok if this addition
2411 // overflows: the vector induction variable will eventually wrap to zero given
2412 // that it starts at zero and its Step is a power of two; the loop will then
2413 // exit, with the last early-exit vector comparison also producing all-true.
2414 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2415 // is accounted for in emitIterationCountCheck that adds an overflow check.
2416 if (Cost->foldTailByMasking()) {
2418 "VF*UF must be a power of 2 when folding tail by masking");
2419 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2420 "n.rnd.up");
2421 }
2422
2423 // Now we need to generate the expression for the part of the loop that the
2424 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2425 // iterations are not required for correctness, or N - Step, otherwise. Step
2426 // is equal to the vectorization factor (number of SIMD elements) times the
2427 // unroll factor (number of SIMD instructions).
2428 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2429
2430 // There are cases where we *must* run at least one iteration in the remainder
2431 // loop. See the cost model for when this can happen. If the step evenly
2432 // divides the trip count, we set the remainder to be equal to the step. If
2433 // the step does not evenly divide the trip count, no adjustment is necessary
2434 // since there will already be scalar iterations. Note that the minimum
2435 // iterations check ensures that N >= Step.
2436 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2437 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2438 R = Builder.CreateSelect(IsZero, Step, R);
2439 }
2440
2441 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2442
2443 return VectorTripCount;
2444}
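// Worked example for the vector trip count computation above (hypothetical
// values): with VF = 4 and UF = 2 the step is 8. For a trip count of 19,
// n.mod.vf = 19 % 8 = 3 and n.vec = 16, leaving 3 iterations for the scalar
// remainder. When the tail is folded by masking, 19 is first rounded up to
// 24, so n.vec = 24 and the excess lanes are masked off. If a scalar epilogue
// is required and the trip count were exactly 16, the remainder would be
// forced to a full step, giving n.vec = 8.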
2445
2447 Value *Count = getTripCount();
2448 // Reuse existing vector loop preheader for TC checks.
2449 // Note that a new preheader block is generated for the vector loop.
2450 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2451 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2452
2453 // Generate code to check if the loop's trip count is less than VF * UF, or
2454 // equal to it in case a scalar epilogue is required; this implies that the
2455 // vector trip count is zero. This check also covers the case where adding one
2456 // to the backedge-taken count overflowed, leading to an incorrect trip count
2457 // of zero. In this case we will also jump to the scalar loop.
2458 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2460
2461 // If tail is to be folded, vector loop takes care of all iterations.
2462 Type *CountTy = Count->getType();
2463 Value *CheckMinIters = Builder.getFalse();
2464 auto CreateStep = [&]() -> Value * {
2465 // Create step with max(MinProTripCount, UF * VF).
2467 return createStepForVF(Builder, CountTy, VF, UF);
2468
2469 Value *MinProfTC =
2471 if (!VF.isScalable())
2472 return MinProfTC;
2474 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2475 };
2476
2477 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2478 if (Style == TailFoldingStyle::None)
2479 CheckMinIters =
2480 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2481 else if (VF.isScalable() &&
2484 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2485 // an overflow to zero when updating induction variables and so an
2486 // additional overflow check is required before entering the vector loop.
2487
2488 // Get the maximum unsigned value for the type.
2489 Value *MaxUIntTripCount =
2490 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2491 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2492
2493 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2494 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2495 }
2496
2497 // Create new preheader for vector loop.
2499 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2500 "vector.ph");
2501
2502 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2503 DT->getNode(Bypass)->getIDom()) &&
2504 "TC check is expected to dominate Bypass");
2505
2506 // Update dominator for Bypass & LoopExit (if needed).
2507 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2508 BranchInst &BI =
2509 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2511 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2512 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2513 LoopBypassBlocks.push_back(TCCheckBlock);
2514}
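// Illustrative example for emitIterationCountCheck() above (hypothetical
// values): with VF = 4, UF = 2 and no tail folding, the emitted guard is
// roughly "%min.iters.check = icmp ult i64 %count, 8" (ule when a scalar
// epilogue must run); a true result branches to the scalar loop and bypasses
// the vector preheader.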
2515
2517 BasicBlock *const SCEVCheckBlock =
2518 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2519 if (!SCEVCheckBlock)
2520 return nullptr;
2521
2522 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2524 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2525 "Cannot SCEV check stride or overflow when optimizing for size");
2526
2527
2528 // Update dominator only if this is first RT check.
2529 if (LoopBypassBlocks.empty()) {
2530 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2531 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2532 // If there is an epilogue which must run, there's no edge from the
2533 // middle block to exit blocks and thus no need to update the immediate
2534 // dominator of the exit blocks.
2535 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2536 }
2537
2538 LoopBypassBlocks.push_back(SCEVCheckBlock);
2539 AddedSafetyChecks = true;
2540 return SCEVCheckBlock;
2541}
2542
2544 // VPlan-native path does not do any analysis for runtime checks currently.
2546 return nullptr;
2547
2548 BasicBlock *const MemCheckBlock =
2549 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2550
2551 // Check if we generated code that checks at runtime whether arrays overlap. We put
2552 // the checks into a separate block to make the more common case of few
2553 // elements faster.
2554 if (!MemCheckBlock)
2555 return nullptr;
2556
2557 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2558 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2559 "Cannot emit memory checks when optimizing for size, unless forced "
2560 "to vectorize.");
2561 ORE->emit([&]() {
2562 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2565 << "Code-size may be reduced by not forcing "
2566 "vectorization, or by source-code modifications "
2567 "eliminating the need for runtime checks "
2568 "(e.g., adding 'restrict').";
2569 });
2570 }
2571
2572 LoopBypassBlocks.push_back(MemCheckBlock);
2573
2574 AddedSafetyChecks = true;
2575
2576 return MemCheckBlock;
2577}
2578
2582 assert(LoopVectorPreHeader && "Invalid loop structure");
2583 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2584 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2585 "multiple exit loop without required epilogue?");
2586
2589 LI, nullptr, Twine(Prefix) + "middle.block");
2592 nullptr, Twine(Prefix) + "scalar.ph");
2593}
2594
2596 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2597 ArrayRef<BasicBlock *> BypassBlocks,
2598 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2600 assert(VectorTripCount && "Expected valid arguments");
2601
2602 Instruction *OldInduction = Legal->getPrimaryInduction();
2603 Value *&EndValue = IVEndValues[OrigPhi];
2604 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2605 if (OrigPhi == OldInduction) {
2606 // We know what the end value is.
2607 EndValue = VectorTripCount;
2608 } else {
2610
2611 // Fast-math-flags propagate from the original induction instruction.
2612 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2613 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2614
2615 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2616 Step, II.getKind(), II.getInductionBinOp());
2617 EndValue->setName("ind.end");
2618
2619 // Compute the end value for the additional bypass (if applicable).
2620 if (AdditionalBypass.first) {
2621 B.SetInsertPoint(AdditionalBypass.first,
2622 AdditionalBypass.first->getFirstInsertionPt());
2623 EndValueFromAdditionalBypass =
2624 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2625 Step, II.getKind(), II.getInductionBinOp());
2626 EndValueFromAdditionalBypass->setName("ind.end");
2627 }
2628 }
2629
2630 // Create phi nodes to merge from the backedge-taken check block.
2631 PHINode *BCResumeVal =
2632 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2634 // Copy original phi DL over to the new one.
2635 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2636
2637 // The new PHI merges the original incoming value, in case of a bypass,
2638 // or the value at the end of the vectorized loop.
2639 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2640
2641 // Fix the scalar body counter (PHI node).
2642 // The old induction's phi node in the scalar body needs the truncated
2643 // value.
2644 for (BasicBlock *BB : BypassBlocks)
2645 BCResumeVal->addIncoming(II.getStartValue(), BB);
2646
2647 if (AdditionalBypass.first)
2648 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2649 EndValueFromAdditionalBypass);
2650 return BCResumeVal;
2651}
2652
2653/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2654/// expansion results.
2656 const SCEV2ValueTy &ExpandedSCEVs) {
2657 const SCEV *Step = ID.getStep();
2658 if (auto *C = dyn_cast<SCEVConstant>(Step))
2659 return C->getValue();
2660 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2661 return U->getValue();
2662 auto I = ExpandedSCEVs.find(Step);
2663 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2664 return I->second;
2665}
2666
2668 const SCEV2ValueTy &ExpandedSCEVs,
2669 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2670 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2671 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2672 "Inconsistent information about additional bypass.");
2673 // We are going to resume the execution of the scalar loop.
2674 // Go over all of the induction variables that we found and fix the
2675 // PHIs that are left in the scalar version of the loop.
2676 // The starting values of PHI nodes depend on the counter of the last
2677 // iteration in the vectorized loop.
2678 // If we come from a bypass edge then we need to start from the original
2679 // start value.
2680 for (const auto &InductionEntry : Legal->getInductionVars()) {
2681 PHINode *OrigPhi = InductionEntry.first;
2682 const InductionDescriptor &II = InductionEntry.second;
2683 PHINode *BCResumeVal = createInductionResumeValue(
2684 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2685 AdditionalBypass);
2686 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2687 }
2688}
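// Illustrative sketch of the induction resume values created above
// (hypothetical IR): for a primary induction starting at 0, the scalar
// preheader receives
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ 0, %vector.memcheck ], ...
// so the scalar remainder loop continues from where the vector loop stopped,
// or from the original start value when a bypass check failed.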
2689
2690std::pair<BasicBlock *, Value *>
2692 const SCEV2ValueTy &ExpandedSCEVs) {
2693 /*
2694 In this function we generate a new loop. The new loop will contain
2695 the vectorized instructions while the old loop will continue to run the
2696 scalar remainder.
2697
2698 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2699 / | preheader are expanded here. Eventually all required SCEV
2700 / | expansion should happen here.
2701 / v
2702 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2703 | / |
2704 | / v
2705 || [ ] <-- vector pre header.
2706 |/ |
2707 | v
2708 | [ ] \
2709 | [ ]_| <-- vector loop (created during VPlan execution).
2710 | |
2711 | v
2712 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2713 | | successors created during VPlan execution)
2714 \/ |
2715 /\ v
2716 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2717 | |
2718 (opt) v <-- edge from middle to exit iff epilogue is not required.
2719 | [ ] \
2720 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2721 \ |
2722 \ v
2723 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2724 ...
2725 */
2726
2727 // Create an empty vector loop, and prepare basic blocks for the runtime
2728 // checks.
2730
2731 // Now, compare the new count to zero. If it is zero, skip the vector loop and
2732 // jump to the scalar loop. This check also covers the case where the
2733 // backedge-taken count is uint##_max: adding one to it will overflow leading
2734 // to an incorrect trip count of zero. In this (rare) case we will also jump
2735 // to the scalar loop.
2737
2738 // Generate the code to check any assumptions that we've made for SCEV
2739 // expressions.
2741
2742 // Generate the code that checks at runtime whether arrays overlap. We put the
2743 // checks into a separate block to make the more common case of few elements
2744 // faster.
2746
2747 // Emit phis for the new starting index of the scalar loop.
2748 createInductionResumeValues(ExpandedSCEVs);
2749
2750 return {LoopVectorPreHeader, nullptr};
2751}
2752
2753// Fix up external users of the induction variable. At this point, we are
2754// in LCSSA form, with all external PHIs that use the IV having one input value,
2755// coming from the remainder loop. We need those PHIs to also have a correct
2756// value for the IV when arriving directly from the middle block.
2758 const InductionDescriptor &II,
2759 Value *VectorTripCount, Value *EndValue,
2760 BasicBlock *MiddleBlock,
2761 BasicBlock *VectorHeader, VPlan &Plan,
2762 VPTransformState &State) {
2763 // There are two kinds of external IV usages - those that use the value
2764 // computed in the last iteration (the PHI) and those that use the penultimate
2765 // value (the value that feeds into the phi from the loop latch).
2766 // We allow both, but they, obviously, have different values.
2767
2768 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2769
2770 DenseMap<Value *, Value *> MissingVals;
2771
2772 // An external user of the last iteration's value should see the value that
2773 // the remainder loop uses to initialize its own IV.
2775 for (User *U : PostInc->users()) {
2776 Instruction *UI = cast<Instruction>(U);
2777 if (!OrigLoop->contains(UI)) {
2778 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2779 MissingVals[UI] = EndValue;
2780 }
2781 }
2782
2783 // An external user of the penultimate value needs to see EndValue - Step.
2784 // The simplest way to get this is to recompute it from the constituent SCEVs,
2785 // that is Start + (Step * (CRD - 1)).
2786 for (User *U : OrigPhi->users()) {
2787 auto *UI = cast<Instruction>(U);
2788 if (!OrigLoop->contains(UI)) {
2789 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2790 IRBuilder<> B(MiddleBlock->getTerminator());
2791
2792 // Fast-math-flags propagate from the original induction instruction.
2793 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2794 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2795
2796 Value *CountMinusOne = B.CreateSub(
2797 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
2798 CountMinusOne->setName("cmo");
2799
2800 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2801 assert(StepVPV && "step must have been expanded during VPlan execution");
2802 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2803 : State.get(StepVPV, {0, 0});
2804 Value *Escape =
2805 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
2806 II.getKind(), II.getInductionBinOp());
2807 Escape->setName("ind.escape");
2808 MissingVals[UI] = Escape;
2809 }
2810 }
2811
2812 for (auto &I : MissingVals) {
2813 PHINode *PHI = cast<PHINode>(I.first);
2814 // One corner case we have to handle is two IVs "chasing" each other,
2815 // that is %IV2 = phi [...], [ %IV1, %latch ]
2816 // In this case, if IV1 has an external use, we need to avoid adding both
2817 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2818 // don't already have an incoming value for the middle block.
2819 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2820 PHI->addIncoming(I.second, MiddleBlock);
2821 }
2822}
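// Illustrative example of the two external IV uses fixed up above
// (hypothetical LCSSA IR):
//   exit:
//     %lcssa.post = phi i64 [ %iv.next, %latch ]  ; last value
//     %lcssa.pre  = phi i64 [ %iv, %latch ]       ; penultimate value
// When the exit is reached from the middle block, %lcssa.post receives
// EndValue and %lcssa.pre receives the recomputed "ind.escape" value.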
2823
2824namespace {
2825
2826struct CSEDenseMapInfo {
2827 static bool canHandle(const Instruction *I) {
2828 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2829 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2830 }
2831
2832 static inline Instruction *getEmptyKey() {
2834 }
2835
2836 static inline Instruction *getTombstoneKey() {
2838 }
2839
2840 static unsigned getHashValue(const Instruction *I) {
2841 assert(canHandle(I) && "Unknown instruction!");
2842 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2843 I->value_op_end()));
2844 }
2845
2846 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2847 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2848 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2849 return LHS == RHS;
2850 return LHS->isIdenticalTo(RHS);
2851 }
2852};
2853
2854} // end anonymous namespace
2855
2856 /// Perform CSE of induction variable instructions.
2857 static void cse(BasicBlock *BB) {
2858 // Perform simple CSE.
2860 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2861 if (!CSEDenseMapInfo::canHandle(&In))
2862 continue;
2863
2864 // Check if we can replace this instruction with any of the
2865 // visited instructions.
2866 if (Instruction *V = CSEMap.lookup(&In)) {
2867 In.replaceAllUsesWith(V);
2868 In.eraseFromParent();
2869 continue;
2870 }
2871
2872 CSEMap[&In] = &In;
2873 }
2874}
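// Illustrative example of the CSE above (hypothetical IR): two identical GEPs
// produced while widening inductions,
//   %g1 = getelementptr inbounds i32, ptr %base, i64 %offset
//   %g2 = getelementptr inbounds i32, ptr %base, i64 %offset
// hash to the same key, so all uses of %g2 are replaced with %g1 and %g2 is
// erased.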
2875
2878 ElementCount VF) const {
2879 // We only need to calculate a cost if the VF is scalar; for actual vectors
2880 // we should already have a pre-calculated cost at each VF.
2881 if (!VF.isScalar())
2882 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2883
2885 Type *RetTy = CI->getType();
2887 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2888 return *RedCost;
2889
2891 for (auto &ArgOp : CI->args())
2892 Tys.push_back(ArgOp->getType());
2893
2894 InstructionCost ScalarCallCost =
2896
2897 // If this is an intrinsic we may have a lower cost for it.
2899 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2900 return std::min(ScalarCallCost, IntrinsicCost);
2901 }
2902 return ScalarCallCost;
2903}
2904
2906 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2907 return Elt;
2908 return VectorType::get(Elt, VF);
2909}
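// Illustrative example: MaybeVectorizeType(i32, VF=4) yields <4 x i32> (or
// <vscale x 4 x i32> for a scalable VF), while a scalar VF, or a type that is
// neither integer, pointer nor floating point, is returned unchanged.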
2910
2913 ElementCount VF) const {
2915 assert(ID && "Expected intrinsic call!");
2916 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
2917 FastMathFlags FMF;
2918 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2919 FMF = FPMO->getFastMathFlags();
2920
2923 SmallVector<Type *> ParamTys;
2924 std::transform(FTy->param_begin(), FTy->param_end(),
2925 std::back_inserter(ParamTys),
2926 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
2927
2928 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2929 dyn_cast<IntrinsicInst>(CI));
2930 return TTI.getIntrinsicInstrCost(CostAttrs,
2932}
2933
2935 VPlan &Plan) {
2936 // Fix widened non-induction PHIs by setting up the PHI operands.
2938 fixNonInductionPHIs(Plan, State);
2939
2940 // Forget the original basic block.
2943
2944 // After vectorization, the exit blocks of the original loop will have
2945 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2946 // looked through single-entry phis.
2947 SmallVector<BasicBlock *> ExitBlocks;
2948 OrigLoop->getExitBlocks(ExitBlocks);
2949 for (BasicBlock *Exit : ExitBlocks)
2950 for (PHINode &PN : Exit->phis())
2952
2953 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2954 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2955 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2956 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2957 // No edge from the middle block to the unique exit block has been inserted
2958 // and there is nothing to fix from vector loop; phis should have incoming
2959 // from scalar loop only.
2960 } else {
2961 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2962 // the cost model.
2963
2964 // If we inserted an edge from the middle block to the unique exit block,
2965 // update uses outside the loop (phis) to account for the newly inserted
2966 // edge.
2967
2968 // Fix-up external users of the induction variables.
2969 for (const auto &Entry : Legal->getInductionVars())
2970 fixupIVUsers(Entry.first, Entry.second,
2972 IVEndValues[Entry.first], LoopMiddleBlock,
2973 VectorLoop->getHeader(), Plan, State);
2974 }
2975
2976 // Fix live-out phis not already fixed earlier.
2977 for (const auto &KV : Plan.getLiveOuts())
2978 KV.second->fixPhi(Plan, State);
2979
2981 sinkScalarOperands(&*PI);
2982
2983 // Remove redundant induction instructions.
2984 cse(VectorLoop->getHeader());
2985
2986 // Set/update profile weights for the vector and remainder loops as the original
2987 // loop iterations are now distributed among them. Note that the original loop,
2988 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
2989 //
2990 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2991 // end up getting a slightly roughened result, but that should be OK since the
2992 // profile is not inherently precise anyway. Note also that a possible bypass of
2993 // vector code caused by legality checks is ignored, assigning all the weight
2994 // to the vector loop, optimistically.
2995 //
2996 // For scalable vectorization we can't know at compile time how many iterations
2997 // of the loop are handled in one vector iteration, so instead assume a pessimistic
2998 // vscale of '1'.
3001 VF.getKnownMinValue() * UF);
3002}
3003
3005 // The basic block and loop containing the predicated instruction.
3006 auto *PredBB = PredInst->getParent();
3007 auto *VectorLoop = LI->getLoopFor(PredBB);
3008
3009 // Initialize a worklist with the operands of the predicated instruction.
3010 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3011
3012 // Holds instructions that we need to analyze again. An instruction may be
3013 // reanalyzed if we don't yet know if we can sink it or not.
3014 SmallVector<Instruction *, 8> InstsToReanalyze;
3015
3016 // Returns true if a given use occurs in the predicated block. Phi nodes use
3017 // their operands in their corresponding predecessor blocks.
3018 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3019 auto *I = cast<Instruction>(U.getUser());
3020 BasicBlock *BB = I->getParent();
3021 if (auto *Phi = dyn_cast<PHINode>(I))
3022 BB = Phi->getIncomingBlock(
3023 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3024 return BB == PredBB;
3025 };
3026
3027 // Iteratively sink the scalarized operands of the predicated instruction
3028 // into the block we created for it. When an instruction is sunk, its
3029 // operands are then added to the worklist. The algorithm ends once a pass
3030 // through the worklist fails to sink a single instruction.
3031 bool Changed;
3032 do {
3033 // Add the instructions that need to be reanalyzed to the worklist, and
3034 // reset the changed indicator.
3035 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3036 InstsToReanalyze.clear();
3037 Changed = false;
3038
3039 while (!Worklist.empty()) {
3040 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3041
3042 // We can't sink an instruction if it is a phi node, is not in the loop,
3043 // may have side effects or may read from memory.
3044 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3045 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3046 I->mayHaveSideEffects() || I->mayReadFromMemory())
3047 continue;
3048
3049 // If the instruction is already in PredBB, check if we can sink its
3050 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3051 // sinking the scalar instruction I, hence it appears in PredBB; but it
3052 // may have failed to sink I's operands (recursively), which we try
3053 // (again) here.
3054 if (I->getParent() == PredBB) {
3055 Worklist.insert(I->op_begin(), I->op_end());
3056 continue;
3057 }
3058
3059 // It's legal to sink the instruction if all its uses occur in the
3060 // predicated block. Otherwise, there's nothing to do yet, and we may
3061 // need to reanalyze the instruction.
3062 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3063 InstsToReanalyze.push_back(I);
3064 continue;
3065 }
3066
3067 // Move the instruction to the beginning of the predicated block, and add
3068 // its operands to the worklist.
3069 I->moveBefore(&*PredBB->getFirstInsertionPt());
3070 Worklist.insert(I->op_begin(), I->op_end());
3071
3072 // The sinking may have enabled other instructions to be sunk, so we will
3073 // need to iterate.
3074 Changed = true;
3075 }
3076 } while (Changed);
3077}
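// Illustrative example of the sinking above (hypothetical IR): if a
// predicated block contains a scalarized store whose address is computed by
//   %off = add i64 %i, 4
//   %gep = getelementptr i32, ptr %p, i64 %off
// and both values are used only by that store, %gep is sunk into the
// predicated block first, which then allows %off to be sunk on a later pass
// over the worklist.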
3078
3080 VPTransformState &State) {
3081 auto Iter = vp_depth_first_deep(Plan.getEntry());
3082 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3083 for (VPRecipeBase &P : VPBB->phis()) {
3084 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3085 if (!VPPhi)
3086 continue;
3087 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3088 // Make sure the builder has a valid insert point.
3089 Builder.SetInsertPoint(NewPhi);
3090 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3091 VPValue *Inc = VPPhi->getIncomingValue(i);
3092 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3093 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3094 }
3095 }
3096 }
3097}
3098
3099void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3100 // We should not collect Scalars more than once per VF. Right now, this
3101 // function is called from collectUniformsAndScalars(), which already does
3102 // this check. Collecting Scalars for VF=1 does not make any sense.
3103 assert(VF.isVector() && !Scalars.contains(VF) &&
3104 "This function should not be visited twice for the same VF");
3105
3106 // This avoids any chances of creating a REPLICATE recipe during planning
3107 // since that would result in generation of scalarized code during execution,
3108 // which is not supported for scalable vectors.
3109 if (VF.isScalable()) {
3110 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3111 return;
3112 }
3113
3115
3116 // These sets are used to seed the analysis with pointers used by memory
3117 // accesses that will remain scalar.
3119 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3120 auto *Latch = TheLoop->getLoopLatch();
3121
3122 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3123 // The pointer operands of loads and stores will be scalar as long as the
3124 // memory access is not a gather or scatter operation. The value operand of a
3125 // store will remain scalar if the store is scalarized.
3126 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3127 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3128 assert(WideningDecision != CM_Unknown &&
3129 "Widening decision should be ready at this moment");
3130 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3131 if (Ptr == Store->getValueOperand())
3132 return WideningDecision == CM_Scalarize;
3133 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3134 "Ptr is neither a value or pointer operand");
3135 return WideningDecision != CM_GatherScatter;
3136 };
3137
3138 // A helper that returns true if the given value is a getelementptr
3139 // instruction contained in the loop.
3140 auto isLoopVaryingGEP = [&](Value *V) {
3141 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3142 };
3143
3144 // A helper that evaluates a memory access's use of a pointer. If the use will
3145 // be a scalar use and the pointer is only used by memory accesses, we place
3146 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3147 // PossibleNonScalarPtrs.
3148 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3149 // We only care about getelementptr instructions contained in
3150 // the loop, as checked by isLoopVaryingGEP above.
3151 if (!isLoopVaryingGEP(Ptr))
3152 return;
3153
3154 // If the pointer has already been identified as scalar (e.g., if it was
3155 // also identified as uniform), there's nothing to do.
3156 auto *I = cast<Instruction>(Ptr);
3157 if (Worklist.count(I))
3158 return;
3159
3160 // If the use of the pointer will be a scalar use, and all users of the
3161 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3162 // place the pointer in PossibleNonScalarPtrs.
3163 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3164 return isa<LoadInst>(U) || isa<StoreInst>(U);
3165 }))
3166 ScalarPtrs.insert(I);
3167 else
3168 PossibleNonScalarPtrs.insert(I);
3169 };
3170
3171 // We seed the scalars analysis with two classes of instructions: (1)
3172 // instructions marked uniform-after-vectorization and (2) bitcast,
3173 // getelementptr and (pointer) phi instructions used by memory accesses
3174 // requiring a scalar use.
3175 //
3176 // (1) Add to the worklist all instructions that have been identified as
3177 // uniform-after-vectorization.
3178 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3179
3180 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3181 // memory accesses requiring a scalar use. The pointer operands of loads and
3182 // stores will be scalar as long as the memory access is not a gather or
3183 // scatter operation. The value operand of a store will remain scalar if the
3184 // store is scalarized.
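 // For example (an illustrative sketch, not from the source): a loop-varying
 // getelementptr whose only users are consecutive (non-gather/scatter) loads
 // never needs a vector of pointers, so it is seeded into the worklist here.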
3185 for (auto *BB : TheLoop->blocks())
3186 for (auto &I : *BB) {
3187 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3188 evaluatePtrUse(Load, Load->getPointerOperand());
3189 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3190 evaluatePtrUse(Store, Store->getPointerOperand());
3191 evaluatePtrUse(Store, Store->getValueOperand());
3192 }
3193 }
3194 for (auto *I : ScalarPtrs)
3195 if (!PossibleNonScalarPtrs.count(I)) {
3196 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3197 Worklist.insert(I);
3198 }
3199
3200 // Insert the forced scalars.
3201 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3202 // induction variable when the PHI user is scalarized.
3203 auto ForcedScalar = ForcedScalars.find(VF);
3204 if (ForcedScalar != ForcedScalars.end())
3205 for (auto *I : ForcedScalar->second) {
3206 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3207 Worklist.insert(I);
3208 }
3209
3210 // Expand the worklist by looking through any bitcasts and getelementptr
3211 // instructions we've already identified as scalar. This is similar to the
3212 // expansion step in collectLoopUniforms(); however, here we're only
3213 // expanding to include additional bitcasts and getelementptr instructions.
3214 unsigned Idx = 0;
3215 while (Idx != Worklist.size()) {
3216 Instruction *Dst = Worklist[Idx++];
3217 if (!isLoopVaryingGEP(Dst->getOperand(0)))
3218 continue;
3219 auto *Src = cast<Instruction>(Dst->getOperand(0));
3220 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3221 auto *J = cast<Instruction>(U);
3222 return !TheLoop->contains(J) || Worklist.count(J) ||
3223 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3224 isScalarUse(J, Src));
3225 })) {
3226 Worklist.insert(Src);
3227 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3228 }
3229 }
3230
3231 // An induction variable will remain scalar if all users of the induction
3232 // variable and induction variable update remain scalar.
3233 for (const auto &Induction : Legal->getInductionVars()) {
3234 auto *Ind = Induction.first;
3235 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3236
3237 // If tail-folding is applied, the primary induction variable will be used
3238 // to feed a vector compare.
3239 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3240 continue;
3241
3242 // Returns true if \p Indvar is a pointer induction that is used directly by
3243 // load/store instruction \p I.
3244 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3245 Instruction *I) {
3246 return Induction.second.getKind() ==
3247 InductionDescriptor::IK_PtrInduction &&
3248 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3249 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3250 };
3251
3252 // Determine if all users of the induction variable are scalar after
3253 // vectorization.
3254 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3255 auto *I = cast<Instruction>(U);
3256 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3257 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3258 });
3259 if (!ScalarInd)
3260 continue;
3261
3262 // If the induction variable update is a fixed-order recurrence, neither the
3263 // induction variable nor its update should be marked scalar after
3264 // vectorization.
3265 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3266 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3267 continue;
3268
3269 // Determine if all users of the induction variable update instruction are
3270 // scalar after vectorization.
3271 auto ScalarIndUpdate =
3272 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3273 auto *I = cast<Instruction>(U);
3274 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3275 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3276 });
3277 if (!ScalarIndUpdate)
3278 continue;
3279
3280 // The induction variable and its update instruction will remain scalar.
3281 Worklist.insert(Ind);
3282 Worklist.insert(IndUpdate);
3283 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3284 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3285 << "\n");
3286 }
3287
3288 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3289}
3290
3291bool LoopVectorizationCostModel::isScalarWithPredication(
3292 Instruction *I, ElementCount VF) const {
3293 if (!isPredicatedInst(I))
3294 return false;
3295
3296 // Do we have a non-scalar lowering for this predicated
3297 // instruction? No - it is scalar with predication.
3298 switch (I->getOpcode()) {
3299 default:
3300 return true;
3301 case Instruction::Call:
3302 if (VF.isScalar())
3303 return true;
3304 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3305 .Kind == CM_Scalarize;
3306 case Instruction::Load:
3307 case Instruction::Store: {
3308 auto *Ptr = getLoadStorePointerOperand(I);
3309 auto *Ty = getLoadStoreType(I);
3310 Type *VTy = Ty;
3311 if (VF.isVector())
3312 VTy = VectorType::get(Ty, VF);
3313 const Align Alignment = getLoadStoreAlignment(I);
3314 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3315 TTI.isLegalMaskedGather(VTy, Alignment))
3316 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3317 TTI.isLegalMaskedScatter(VTy, Alignment));
3318 }
3319 case Instruction::UDiv:
3320 case Instruction::SDiv:
3321 case Instruction::SRem:
3322 case Instruction::URem: {
3323 // We have the option to use the safe-divisor idiom to avoid predication.
3324 // The cost based decision here will always select safe-divisor for
3325 // scalable vectors as scalarization isn't legal.
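 // A rough sketch of the idiom (assumed shape, not quoted from the transform):
 // the predicated division
 //   %q = udiv %x, %y                       ; only under the mask
 // becomes
 //   %d = select %mask, %y, <1, 1, ...>     ; force inactive lanes to 1
 //   %q = udiv %x, %d
 // which is why its cost below is a vector select plus the vector divide.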
3326 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3327 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3328 }
3329 }
3330}
3331
3332// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3333bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3334 // If predication is not needed, avoid it.
3335 // TODO: We can use the loop-preheader as context point here and get
3336 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3337 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3338 isSafeToSpeculativelyExecute(I) ||
3339 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3340 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3341 return false;
3342
3343 // If the instruction was executed conditionally in the original scalar loop,
3344 // predication is needed with a mask whose lanes are all possibly inactive.
3345 if (Legal->blockNeedsPredication(I->getParent()))
3346 return true;
3347
3348 // All that remain are instructions with side-effects originally executed in
3349 // the loop unconditionally, but now execute under a tail-fold mask (only)
3350 // having at least one active lane (the first). If the side-effects of the
3351 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3352 // - it will cause the same side-effects as when masked.
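 // As an illustrative example (not from the source): a store of a
 // loop-invariant value to a loop-invariant address has the same effect
 // whether one lane or all lanes execute it, so the switch below lets it
 // run without the tail-folding mask.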
3353 switch (I->getOpcode()) {
3354 default:
3355 llvm_unreachable(
3356 "instruction should have been considered by earlier checks");
3357 case Instruction::Call:
3358 // Side-effects of a Call are assumed to be non-invariant, needing a
3359 // (fold-tail) mask.
3360 assert(Legal->isMaskRequired(I) &&
3361 "should have returned earlier for calls not needing a mask");
3362 return true;
3363 case Instruction::Load:
3364 // If the address is loop invariant no predication is needed.
3365 return !Legal->isInvariant(getLoadStorePointerOperand(I));
3366 case Instruction::Store: {
3367 // For stores, we need to prove both speculation safety (which follows from
3368 // the same argument as loads), but also must prove the value being stored
3369 // is correct. The easiest form of the latter is to require that all values
3370 // stored are the same.
3371 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3372 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3373 }
3374 case Instruction::UDiv:
3375 case Instruction::SDiv:
3376 case Instruction::SRem:
3377 case Instruction::URem:
3378 // If the divisor is loop-invariant no predication is needed.
3379 return !TheLoop->isLoopInvariant(I->getOperand(1));
3380 }
3381}
3382
3383std::pair<InstructionCost, InstructionCost>
3384LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3385 ElementCount VF) const {
3386 assert(I->getOpcode() == Instruction::UDiv ||
3387 I->getOpcode() == Instruction::SDiv ||
3388 I->getOpcode() == Instruction::SRem ||
3389 I->getOpcode() == Instruction::URem);
3391
3393
3394 // Scalarization isn't legal for scalable vector types
3395 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3396 if (!VF.isScalable()) {
3397 // Get the scalarization cost and scale this amount by the probability of
3398 // executing the predicated block. If the instruction is not predicated,
3399 // we fall through to the next case.
3400 ScalarizationCost = 0;
3401
3402 // These instructions have a non-void type, so account for the phi nodes
3403 // that we will create. This cost is likely to be zero. The phi node
3404 // cost, if any, should be scaled by the block probability because it
3405 // models a copy at the end of each predicated block.
3406 ScalarizationCost += VF.getKnownMinValue() *
3407 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3408
3409 // The cost of the non-predicated instruction.
3410 ScalarizationCost += VF.getKnownMinValue() *
3411 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3412
3413 // The cost of insertelement and extractelement instructions needed for
3414 // scalarization.
3415 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3416
3417 // Scale the cost by the probability of executing the predicated blocks.
3418 // This assumes the predicated block for each vector lane is equally
3419 // likely.
3420 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3421 }
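 // Illustrative numbers only: with VF = 4, a PHI cost of 0, a scalar divide
 // cost of 5, a scalarization overhead of 8, and a reciprocal block
 // probability of 2, the estimate above is (4*0 + 4*5 + 8) / 2 = 14.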
3422 InstructionCost SafeDivisorCost = 0;
3423
3424 auto *VecTy = ToVectorTy(I->getType(), VF);
3425
3426 // The cost of the select guard to ensure all lanes are well defined
3427 // after we speculate above any internal control flow.
3428 SafeDivisorCost += TTI.getCmpSelInstrCost(
3429 Instruction::Select, VecTy,
3430 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3431 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3432
3433 // Certain instructions can be cheaper to vectorize if they have a constant
3434 // second vector operand. One example of this are shifts on x86.
3435 Value *Op2 = I->getOperand(1);
3436 auto Op2Info = TTI.getOperandInfo(Op2);
3437 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3438 Legal->isInvariant(Op2))
3439 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3440
3441 SmallVector<const Value *, 4> Operands(I->operand_values());
3442 SafeDivisorCost += TTI.getArithmeticInstrCost(
3443 I->getOpcode(), VecTy, CostKind,
3444 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3445 Op2Info, Operands, I);
3446 return {ScalarizationCost, SafeDivisorCost};
3447}
3448
3449bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3450 Instruction *I, ElementCount VF) const {
3451 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3452 assert(getWideningDecision(I, VF) == CM_Unknown &&
3453 "Decision should not be set yet.");
3454 auto *Group = getInterleavedAccessGroup(I);
3455 assert(Group && "Must have a group.");
3456
3457 // If the instruction's allocated size doesn't equal its type size, it
3458 // requires padding and will be scalarized.
3459 auto &DL = I->getDataLayout();
3460 auto *ScalarTy = getLoadStoreType(I);
3461 if (hasIrregularType(ScalarTy, DL))
3462 return false;
3463
3464 // If the group involves a non-integral pointer, we may not be able to
3465 // losslessly cast all values to a common type.
3466 unsigned InterleaveFactor = Group->getFactor();
3467 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3468 for (unsigned i = 0; i < InterleaveFactor; i++) {
3469 Instruction *Member = Group->getMember(i);
3470 if (!Member)
3471 continue;
3472 auto *MemberTy = getLoadStoreType(Member);
3473 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3474 // Don't coerce non-integral pointers to integers or vice versa.
3475 if (MemberNI != ScalarNI) {
3476 // TODO: Consider adding special nullptr value case here
3477 return false;
3478 } else if (MemberNI && ScalarNI &&
3479 ScalarTy->getPointerAddressSpace() !=
3480 MemberTy->getPointerAddressSpace()) {
3481 return false;
3482 }
3483 }
3484
3485 // Check if masking is required.
3486 // A Group may need masking for one of two reasons: it resides in a block that
3487 // needs predication, or it was decided to use masking to deal with gaps
3488 // (either a gap at the end of a load-access that may result in a speculative
3489 // load, or any gaps in a store-access).
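 // For instance (illustrative only): a factor-2 store group with a single
 // member leaves a gap, so emitting it as one wide store would clobber the
 // missing member's lane unless the store is masked.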
3490 bool PredicatedAccessRequiresMasking =
3491 blockNeedsPredicationForAnyReason(I->getParent()) &&
3492 Legal->isMaskRequired(I);
3493 bool LoadAccessWithGapsRequiresEpilogMasking =
3494 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3495 !isScalarEpilogueAllowed();
3496 bool StoreAccessWithGapsRequiresMasking =
3497 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3498 if (!PredicatedAccessRequiresMasking &&
3499 !LoadAccessWithGapsRequiresEpilogMasking &&
3500 !StoreAccessWithGapsRequiresMasking)
3501 return true;
3502
3503 // If masked interleaving is required, we expect that the user/target had
3504 // enabled it, because otherwise it either wouldn't have been created or
3505 // it should have been invalidated by the CostModel.
3506 assert(useMaskedInterleavedAccesses(TTI) &&
3507 "Masked interleave-groups for predicated accesses are not enabled.");
3508
3509 if (Group->isReverse())
3510 return false;
3511
3512 auto *Ty = getLoadStoreType(I);
3513 const Align Alignment = getLoadStoreAlignment(I);
3514 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3515 : TTI.isLegalMaskedStore(Ty, Alignment);
3516}
3517
3518bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3519 Instruction *I, ElementCount VF) {
3520 // Get and ensure we have a valid memory instruction.
3521 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3522
3523 auto *Ptr = getLoadStorePointerOperand(I);
3524 auto *ScalarTy = getLoadStoreType(I);
3525
3526 // In order to be widened, the pointer should be consecutive, first of all.
3527 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3528 return false;
3529
3530 // If the instruction is a store located in a predicated block, it will be
3531 // scalarized.
3532 if (isScalarWithPredication(I, VF))
3533 return false;
3534
3535 // If the instruction's allocated size doesn't equal its type size, it
3536 // requires padding and will be scalarized.
3537 auto &DL = I->getDataLayout();
3538 if (hasIrregularType(ScalarTy, DL))
3539 return false;
3540
3541 return true;
3542}
3543
3544void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3545 // We should not collect Uniforms more than once per VF. Right now,
3546 // this function is called from collectUniformsAndScalars(), which
3547 // already does this check. Collecting Uniforms for VF=1 does not make any
3548 // sense.
3549
3550 assert(VF.isVector() && !Uniforms.contains(VF) &&
3551 "This function should not be visited twice for the same VF");
3552
3553 // Initialize the entry for this VF. Even if no uniform values are found,
3554 // Uniforms.count(VF) will then return 1, so we will not analyze it again.
3555 Uniforms[VF].clear();
3556
3557 // We now know that the loop is vectorizable!
3558 // Collect instructions inside the loop that will remain uniform after
3559 // vectorization.
3560
3561 // Global values, params and instructions outside of current loop are out of
3562 // scope.
3563 auto isOutOfScope = [&](Value *V) -> bool {
3564 Instruction *I = dyn_cast<Instruction>(V);
3565 return (!I || !TheLoop->contains(I));
3566 };
3567
3568 // Worklist containing uniform instructions demanding lane 0.
3569 SetVector<Instruction *> Worklist;
3570
3571 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3572 // that require predication must not be considered uniform after
3573 // vectorization, because that would create an erroneous replicating region
3574 // where only a single instance out of VF should be formed.
3575 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3576 if (isOutOfScope(I)) {
3577 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3578 << *I << "\n");
3579 return;
3580 }
3581 if (isPredicatedInst(I)) {
3582 LLVM_DEBUG(
3583 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3584 << "\n");
3585 return;
3586 }
3587 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3588 Worklist.insert(I);
3589 };
3590
3591 // Start with the conditional branches exiting the loop. If the branch
3592 // condition is an instruction contained in the loop that is only used by the
3593 // branch, it is uniform.
3594 SmallVector<BasicBlock *, 4> Exiting;
3595 TheLoop->getExitingBlocks(Exiting);
3596 for (BasicBlock *E : Exiting) {
3597 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3598 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3599 addToWorklistIfAllowed(Cmp);
3600 }
3601
3602 auto PrevVF = VF.divideCoefficientBy(2);
3603 // Return true if all lanes perform the same memory operation, and we can
3604 // thus choose to execute only one.
3605 auto isUniformMemOpUse = [&](Instruction *I) {
3606 // If the value was already known to not be uniform for the previous
3607 // (smaller VF), it cannot be uniform for the larger VF.
3608 if (PrevVF.isVector()) {
3609 auto Iter = Uniforms.find(PrevVF);
3610 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3611 return false;
3612 }
3613 if (!Legal->isUniformMemOp(*I, VF))
3614 return false;
3615 if (isa<LoadInst>(I))
3616 // Loading the same address always produces the same result - at least
3617 // assuming aliasing and ordering which have already been checked.
3618 return true;
3619 // Storing the same value on every iteration.
3620 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3621 };
3622
3623 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3624 InstWidening WideningDecision = getWideningDecision(I, VF);
3625 assert(WideningDecision != CM_Unknown &&
3626 "Widening decision should be ready at this moment");
3627
3628 if (isUniformMemOpUse(I))
3629 return true;
3630
3631 return (WideningDecision == CM_Widen ||
3632 WideningDecision == CM_Widen_Reverse ||
3633 WideningDecision == CM_Interleave);
3634 };
3635
3636 // Returns true if Ptr is the pointer operand of a memory access instruction
3637 // I, I is known to not require scalarization, and the pointer is not also
3638 // stored.
3639 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3640 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3641 return false;
3642 return getLoadStorePointerOperand(I) == Ptr &&
3643 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3644 };
3645
3646 // Holds a list of values which are known to have at least one uniform use.
3647 // Note that there may be other uses which aren't uniform. A "uniform use"
3648 // here is something which only demands lane 0 of the unrolled iterations;
3649 // it does not imply that all lanes produce the same value (i.e. this is not
3650 // the usual meaning of uniform).
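 // For instance (illustrative only): the address of a store that writes the
 // same loop-invariant value on every iteration has a uniform use here, even
 // if the same address computation also feeds non-uniform users elsewhere.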
3651 SetVector<Value *> HasUniformUse;
3652
3653 // Scan the loop for instructions which are either a) known to have only
3654 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3655 for (auto *BB : TheLoop->blocks())
3656 for (auto &I : *BB) {
3657 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3658 switch (II->getIntrinsicID()) {
3659 case Intrinsic::sideeffect:
3660 case Intrinsic::experimental_noalias_scope_decl:
3661 case Intrinsic::assume:
3662 case Intrinsic::lifetime_start:
3663 case Intrinsic::lifetime_end:
3664 if (TheLoop->hasLoopInvariantOperands(&I))
3665 addToWorklistIfAllowed(&I);
3666 break;
3667 default:
3668 break;
3669 }
3670 }
3671
3672 // ExtractValue instructions must be uniform, because the operands are
3673 // known to be loop-invariant.
3674 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3675 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3676 "Expected aggregate value to be loop invariant");
3677 addToWorklistIfAllowed(EVI);
3678 continue;
3679 }
3680
3681 // If there's no pointer operand, there's nothing to do.
3682 Value *Ptr = getLoadStorePointerOperand(&I);
3683 if (!Ptr)
3684 continue;
3685
3686 if (isUniformMemOpUse(&I))
3687 addToWorklistIfAllowed(&I);
3688
3689 if (isVectorizedMemAccessUse(&I, Ptr))
3690 HasUniformUse.insert(Ptr);
3691 }
3692
3693 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3694 // demanding) users. Since loops are assumed to be in LCSSA form, this
3695 // disallows uses outside the loop as well.
3696 for (auto *V : HasUniformUse) {
3697 if (isOutOfScope(V))
3698 continue;
3699 auto *I = cast<Instruction>(V);
3700 auto UsersAreMemAccesses =
3701 llvm::all_of(I->users(), [&](User *U) -> bool {
3702 auto *UI = cast<Instruction>(U);
3703 return TheLoop->contains(UI) && isVectorizedMemAccessUse(UI, V);
3704 });
3705 if (UsersAreMemAccesses)
3706 addToWorklistIfAllowed(I);
3707 }
3708
3709 // Expand Worklist in topological order: whenever a new instruction
3710 // is added, its users should already be inside Worklist. This ensures
3711 // a uniform instruction will only be used by uniform instructions.
3712 unsigned idx = 0;
3713 while (idx != Worklist.size()) {
3714 Instruction *I = Worklist[idx++];
3715
3716 for (auto *OV : I->operand_values()) {
3717 // isOutOfScope operands cannot be uniform instructions.
3718 if (isOutOfScope(OV))
3719 continue;
3720 // Fixed-order recurrence phis should typically be considered
3721 // non-uniform.
3722 auto *OP = dyn_cast<PHINode>(OV);
3723 if (OP && Legal->isFixedOrderRecurrence(OP))
3724 continue;
3725 // If all the users of the operand are uniform, then add the
3726 // operand into the uniform worklist.
3727 auto *OI = cast<Instruction>(OV);
3728 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3729 auto *J = cast<Instruction>(U);
3730 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
3731 }))
3732 addToWorklistIfAllowed(OI);
3733 }
3734 }
3735
3736 // For an instruction to be added into Worklist above, all its users inside
3737 // the loop should also be in Worklist. However, this condition cannot be
3738 // true for phi nodes that form a cyclic dependence. We must process phi
3739 // nodes separately. An induction variable will remain uniform if all users
3740 // of the induction variable and induction variable update remain uniform.
3741 // The code below handles both pointer and non-pointer induction variables.
3742 BasicBlock *Latch = TheLoop->getLoopLatch();
3743 for (const auto &Induction : Legal->getInductionVars()) {
3744 auto *Ind = Induction.first;
3745 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3746
3747 // Determine if all users of the induction variable are uniform after
3748 // vectorization.
3749 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3750 auto *I = cast<Instruction>(U);
3751 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3752 isVectorizedMemAccessUse(I, Ind);
3753 });
3754 if (!UniformInd)
3755 continue;
3756
3757 // Determine if all users of the induction variable update instruction are
3758 // uniform after vectorization.
3759 auto UniformIndUpdate =
3760 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3761 auto *I = cast<Instruction>(U);
3762 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3763 isVectorizedMemAccessUse(I, IndUpdate);
3764 });
3765 if (!UniformIndUpdate)
3766 continue;
3767
3768 // The induction variable and its update instruction will remain uniform.
3769 addToWorklistIfAllowed(Ind);
3770 addToWorklistIfAllowed(IndUpdate);
3771 }
3772
3773 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3774}
3775
3776bool LoopVectorizationCostModel::runtimeChecksRequired() {
3777 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3778
3779 if (Legal->getRuntimePointerChecking()->Need) {
3780 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3781 "runtime pointer checks needed. Enable vectorization of this "
3782 "loop with '#pragma clang loop vectorize(enable)' when "
3783 "compiling with -Os/-Oz",
3784 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3785 return true;
3786 }
3787
3788 if (!PSE.getPredicate().isAlwaysTrue()) {
3789 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3790 "runtime SCEV checks needed. Enable vectorization of this "
3791 "loop with '#pragma clang loop vectorize(enable)' when "
3792 "compiling with -Os/-Oz",
3793 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3794 return true;
3795 }
3796
3797 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3798 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3799 reportVectorizationFailure("Runtime stride check for small trip count",
3800 "runtime stride == 1 checks needed. Enable vectorization of "
3801 "this loop without such check by compiling with -Os/-Oz",
3802 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3803 return true;
3804 }
3805
3806 return false;
3807}
3808
3809bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3810 if (IsScalableVectorizationAllowed)
3811 return *IsScalableVectorizationAllowed;
3812
3813 IsScalableVectorizationAllowed = false;
3814 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3815 return false;
3816
3817 if (Hints->isScalableVectorizationDisabled()) {
3818 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3819 "ScalableVectorizationDisabled", ORE, TheLoop);
3820 return false;
3821 }
3822
3823 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3824
3825 auto MaxScalableVF = ElementCount::getScalable(
3826 std::numeric_limits<ElementCount::ScalarTy>::max());
3827
3828 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3829 // FIXME: While for scalable vectors this is currently sufficient, this should
3830 // be replaced by a more detailed mechanism that filters out specific VFs,
3831 // instead of invalidating vectorization for a whole set of VFs based on the
3832 // MaxVF.
3833
3834 // Disable scalable vectorization if the loop contains unsupported reductions.
3835 if (!canVectorizeReductions(MaxScalableVF)) {
3836 reportVectorizationInfo(
3837 "Scalable vectorization not supported for the reduction "
3838 "operations found in this loop.",
3839 "ScalableVFUnfeasible", ORE, TheLoop);
3840 return false;
3841 }
3842
3843 // Disable scalable vectorization if the loop contains any instructions
3844 // with element types not supported for scalable vectors.
3845 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3846 return !Ty->isVoidTy() &&
3847 !TTI.isElementTypeLegalForScalableVector(Ty);
3848 })) {
3849 reportVectorizationInfo("Scalable vectorization is not supported "
3850 "for all element types found in this loop.",
3851 "ScalableVFUnfeasible", ORE, TheLoop);
3852 return false;
3853 }
3854
3855 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3856 reportVectorizationInfo("The target does not provide maximum vscale value "
3857 "for safe distance analysis.",
3858 "ScalableVFUnfeasible", ORE, TheLoop);
3859 return false;
3860 }
3861
3862 IsScalableVectorizationAllowed = true;
3863 return true;
3864}
3865
3866ElementCount
3867LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3868 if (!isScalableVectorizationAllowed())
3869 return ElementCount::getScalable(0);
3870
3871 auto MaxScalableVF = ElementCount::getScalable(
3872 std::numeric_limits<ElementCount::ScalarTy>::max());
3873 if (Legal->isSafeForAnyVectorWidth())
3874 return MaxScalableVF;
3875
3876 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3877 // Limit MaxScalableVF by the maximum safe dependence distance.
3878 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3879
3880 if (!MaxScalableVF)
3881 reportVectorizationInfo(
3882 "Max legal vector width too small, scalable vectorization "
3883 "unfeasible.",
3884 "ScalableVFUnfeasible", ORE, TheLoop);
3885
3886 return MaxScalableVF;
3887}
3888
3889FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3890 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3892 unsigned SmallestType, WidestType;
3893 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3894
3895 // Get the maximum safe dependence distance in bits computed by LAA.
3896 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3897 // the memory accesses that is most restrictive (involved in the smallest
3898 // dependence distance).
3899 unsigned MaxSafeElements =
3901
3902 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3903 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3904
3905 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3906 << ".\n");
3907 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3908 << ".\n");
3909
3910 // First analyze the UserVF, fall back if the UserVF should be ignored.
3911 if (UserVF) {
3912 auto MaxSafeUserVF =
3913 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3914
3915 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3916 // If `VF=vscale x N` is safe, then so is `VF=N`
3917 if (UserVF.isScalable())
3918 return FixedScalableVFPair(
3919 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3920 else
3921 return UserVF;
3922 }
3923
3924 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3925
3926 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3927 // is better to ignore the hint and let the compiler choose a suitable VF.
3928 if (!UserVF.isScalable()) {
3929 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3930 << " is unsafe, clamping to max safe VF="
3931 << MaxSafeFixedVF << ".\n");
3932 ORE->emit([&]() {
3933 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3934 TheLoop->getStartLoc(),
3935 TheLoop->getHeader())
3936 << "User-specified vectorization factor "
3937 << ore::NV("UserVectorizationFactor", UserVF)
3938 << " is unsafe, clamping to maximum safe vectorization factor "
3939 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3940 });
3941 return MaxSafeFixedVF;
3942 }
3943
3944 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3945 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3946 << " is ignored because scalable vectors are not "
3947 "available.\n");
3948 ORE->emit([&]() {
3949 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3950 TheLoop->getStartLoc(),
3951 TheLoop->getHeader())
3952 << "User-specified vectorization factor "
3953 << ore::NV("UserVectorizationFactor", UserVF)
3954 << " is ignored because the target does not support scalable "
3955 "vectors. The compiler will pick a more suitable value.";
3956 });
3957 } else {
3958 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3959 << " is unsafe. Ignoring scalable UserVF.\n");
3960 ORE->emit([&]() {
3961 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3962 TheLoop->getStartLoc(),
3963 TheLoop->getHeader())
3964 << "User-specified vectorization factor "
3965 << ore::NV("UserVectorizationFactor", UserVF)
3966 << " is unsafe. Ignoring the hint to let the compiler pick a "
3967 "more suitable value.";
3968 });
3969 }
3970 }
3971
3972 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3973 << " / " << WidestType << " bits.\n");
3974
3975 FixedScalableVFPair Result(ElementCount::getFixed(1),
3976 ElementCount::getScalable(0));
3977 if (auto MaxVF =
3978 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3979 MaxSafeFixedVF, FoldTailByMasking))
3980 Result.FixedVF = MaxVF;
3981
3982 if (auto MaxVF =
3983 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3984 MaxSafeScalableVF, FoldTailByMasking))
3985 if (MaxVF.isScalable()) {
3986 Result.ScalableVF = MaxVF;
3987 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3988 << "\n");
3989 }
3990
3991 return Result;
3992}
3993
3994FixedScalableVFPair
3995LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3997 // TODO: It may be useful to do since it's still likely to be dynamically
3998 // uniform if the target can skip.
4000 "Not inserting runtime ptr check for divergent target",
4001 "runtime pointer checks needed. Not enabled for divergent target",
4002 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4003 return FixedScalableVFPair::getNone();
4004 }
4005
4006 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4007 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4008 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4009 if (TC == 1) {
4010 reportVectorizationFailure("Single iteration (non) loop",
4011 "loop trip count is one, irrelevant for vectorization",
4012 "SingleIterationLoop", ORE, TheLoop);
4013 return FixedScalableVFPair::getNone();
4014 }
4015
4016 switch (ScalarEpilogueStatus) {
4017 case CM_ScalarEpilogueAllowed:
4018 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4019 case CM_ScalarEpilogueNotAllowedUsePredicate:
4020 [[fallthrough]];
4021 case CM_ScalarEpilogueNotNeededUsePredicate:
4022 LLVM_DEBUG(
4023 dbgs() << "LV: vector predicate hint/switch found.\n"
4024 << "LV: Not allowing scalar epilogue, creating predicated "
4025 << "vector loop.\n");
4026 break;
4027 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4028 // fallthrough as a special case of OptForSize
4029 case CM_ScalarEpilogueNotAllowedOptSize:
4030 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4031 LLVM_DEBUG(
4032 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4033 else
4034 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4035 << "count.\n");
4036
4037 // Bail if runtime checks are required, which are not good when optimising
4038 // for size.
4039 if (runtimeChecksRequired())
4040 return FixedScalableVFPair::getNone();
4041
4042 break;
4043 }
4044
4045 // The only loops we can vectorize without a scalar epilogue, are loops with
4046 // a bottom-test and a single exiting block. We'd have to handle the fact
4047 // that not every instruction executes on the last iteration. This will
4048 // require a lane mask which varies through the vector loop body. (TODO)
4049 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4050 // If there was a tail-folding hint/switch, but we can't fold the tail by
4051 // masking, fallback to a vectorization with a scalar epilogue.
4052 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4053 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4054 "scalar epilogue instead.\n");
4055 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4056 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4057 }
4058 return FixedScalableVFPair::getNone();
4059 }
4060
4061 // Now try the tail folding
4062
4063 // Invalidate interleave groups that require an epilogue if we can't mask
4064 // the interleave-group.
4065 if (!useMaskedInterleavedAccesses(TTI)) {
4066 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4067 "No decisions should have been taken at this point");
4068 // Note: There is no need to invalidate any cost modeling decisions here, as
4069 // none were taken so far.
4070 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4071 }
4072
4073 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4074
4075 // Avoid tail folding if the trip count is known to be a multiple of any VF
4076 // we choose.
4077 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4078 MaxFactors.FixedVF.getFixedValue();
4079 if (MaxFactors.ScalableVF) {
4080 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4081 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4082 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4083 *MaxPowerOf2RuntimeVF,
4084 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4085 } else
4086 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4087 }
4088
4089 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4090 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4091 "MaxFixedVF must be a power of 2");
4092 unsigned MaxVFtimesIC =
4093 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4094 ScalarEvolution *SE = PSE.getSE();
4095 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4096 const SCEV *ExitCount = SE->getAddExpr(
4097 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4098 const SCEV *Rem = SE->getURemExpr(
4099 SE->applyLoopGuards(ExitCount, TheLoop),
4100 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
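 // E.g. (illustrative numbers): a loop whose exit count is provably 64 with
 // MaxVFtimesIC = 8 gives 64 % 8 == 0, so no tail remains for any chosen VF
 // and tail folding can be skipped.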
4101 if (Rem->isZero()) {
4102 // Accept MaxFixedVF if we do not have a tail.
4103 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4104 return MaxFactors;
4105 }
4106 }
4107
4108 // If we don't know the precise trip count, or if the trip count that we
4109 // found modulo the vectorization factor is not zero, try to fold the tail
4110 // by masking.
4111 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4112 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4113 if (foldTailByMasking()) {
4115 LLVM_DEBUG(
4116 dbgs()
4117 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4118 "try to generate VP Intrinsics with scalable vector "
4119 "factors only.\n");
4120 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4121 // for now.
4122 // TODO: extend it for fixed vectors, if required.
4123 assert(MaxFactors.ScalableVF.isScalable() &&
4124 "Expected scalable vector factor.");
4125
4126 MaxFactors.FixedVF = ElementCount::getFixed(1);
4127 }
4128 return MaxFactors;
4129 }
4130
4131 // If there was a tail-folding hint/switch, but we can't fold the tail by
4132 // masking, fallback to a vectorization with a scalar epilogue.
4133 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4134 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4135 "scalar epilogue instead.\n");
4136 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4137 return MaxFactors;
4138 }
4139
4140 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4141 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4142 return FixedScalableVFPair::getNone();
4143 }
4144
4145 if (TC == 0) {
4146 reportVectorizationFailure(
4147 "Unable to calculate the loop count due to complex control flow",
4148 "unable to calculate the loop count due to complex control flow",
4149 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4150 return FixedScalableVFPair::getNone();
4151 }
4152
4153 reportVectorizationFailure(
4154 "Cannot optimize for size and vectorize at the same time.",
4155 "cannot optimize for size and vectorize at the same time. "
4156 "Enable vectorization of this loop with '#pragma clang loop "
4157 "vectorize(enable)' when compiling with -Os/-Oz",
4158 "NoTailLoopWithOptForSize", ORE, TheLoop);
4159 return FixedScalableVFPair::getNone();
4160}
4161
4162ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4163 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4164 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4165 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4166 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4167 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4168 : TargetTransformInfo::RGK_FixedWidthVector);
4169
4170 // Convenience function to return the minimum of two ElementCounts.
4171 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4172 assert((LHS.isScalable() == RHS.isScalable()) &&
4173 "Scalable flags must match");
4174 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4175 };
4176
4177 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4178 // Note that both WidestRegister and WidestType may not be powers of 2.
4179 auto MaxVectorElementCount = ElementCount::get(
4180 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4181 ComputeScalableMaxVF);
4182 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
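 // E.g. (illustrative only): a 128-bit widest register and a 32-bit widest
 // type give bit_floor(128 / 32) = 4 lanes, i.e. a starting MaxVF of 4 (or
 // vscale x 4 for scalable vectors) before the MaxSafeVF clamp above.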
4183 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4184 << (MaxVectorElementCount * WidestType) << " bits.\n");
4185
4186 if (!MaxVectorElementCount) {
4187 LLVM_DEBUG(dbgs() << "LV: The target has no "
4188 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4189 << " vector registers.\n");
4190 return ElementCount::getFixed(1);
4191 }
4192
4193 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4194 if (MaxVectorElementCount.isScalable() &&
4195 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4196 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4197 auto Min = Attr.getVScaleRangeMin();
4198 WidestRegisterMinEC *= Min;
4199 }
4200
4201 // When a scalar epilogue is required, at least one iteration of the scalar
4202 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4203 // max VF that results in a dead vector loop.
4204 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4205 MaxTripCount -= 1;
4206
4207 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4208 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4209 // If upper bound loop trip count (TC) is known at compile time there is no
4210 // point in choosing VF greater than TC (as done in the loop below). Select
4211 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4212 // scalable, we only fall back on a fixed VF when the TC is less than or
4213 // equal to the known number of lanes.
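 // For example (illustrative only): a known upper bound of 7 iterations is
 // clamped to bit_floor(7) = 4 lanes here, rather than keeping a wider VF
 // whose extra lanes could never be used.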
4214 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4215 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4216 "exceeding the constant trip count: "
4217 << ClampedUpperTripCount << "\n");
4218 return ElementCount::get(
4219 ClampedUpperTripCount,
4220 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4221 }
4222
4224 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4226 ElementCount MaxVF = MaxVectorElementCount;
4227 if (MaximizeBandwidth ||
4228 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4231 auto MaxVectorElementCountMaxBW = ElementCount::get(
4232 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4233 ComputeScalableMaxVF);
4234 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4235
4236 // Collect all viable vectorization factors larger than the default MaxVF
4237 // (i.e. MaxVectorElementCount).
4238 SmallVector<ElementCount, 8> VFs;
4239 for (ElementCount VS = MaxVectorElementCount * 2;
4240 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4241 VFs.push_back(VS);
4242
4243 // For each VF calculate its register usage.
4244 auto RUs = calculateRegisterUsage(VFs);
4245
4246 // Select the largest VF which doesn't require more registers than existing
4247 // ones.
4248 for (int I = RUs.size() - 1; I >= 0; --I) {
4249 const auto &MLU = RUs[I].MaxLocalUsers;
4250 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4251 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4252 })) {
4253 MaxVF = VFs[I];
4254 break;
4255 }
4256 }
4257 if (ElementCount MinVF =
4258 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4259 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4260 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4261 << ") with target's minimum: " << MinVF << '\n');
4262 MaxVF = MinVF;
4263 }
4264 }
4265
4266 // Invalidate any widening decisions we might have made, in case the loop
4267 // requires predication (decided later), but we have already made some
4268 // load/store widening decisions.
4269 invalidateCostModelingDecisions();
4270 }
4271 return MaxVF;
4272}
4273
4274/// Convenience function that returns the value of vscale_range iff
4275/// vscale_range.min == vscale_range.max or otherwise returns the value
4276/// returned by the corresponding TTI method.
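/// For instance, a function attribute of vscale_range(2,2) yields 2 here,
/// while vscale_range(1,16) (min != max) falls back to
/// TTI.getVScaleForTuning().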
4277static std::optional<unsigned>
4279 const Function *Fn = L->getHeader()->getParent();
4280 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4281 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4282 auto Min = Attr.getVScaleRangeMin();
4283 auto Max = Attr.getVScaleRangeMax();
4284 if (Max && Min == Max)
4285 return Max;
4286 }
4287
4288 return TTI.getVScaleForTuning();
4289}
4290
4291bool LoopVectorizationPlanner::isMoreProfitable(
4292 const VectorizationFactor &A, const VectorizationFactor &B) const {
4293 InstructionCost CostA = A.Cost;
4294 InstructionCost CostB = B.Cost;
4295
4296 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4297
4298 // Improve estimate for the vector width if it is scalable.
4299 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4300 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4301 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4302 if (A.Width.isScalable())
4303 EstimatedWidthA *= *VScale;
4304 if (B.Width.isScalable())
4305 EstimatedWidthB *= *VScale;
4306 }
4307
4308 // Assume vscale may be larger than 1 (or the value being tuned for),
4309 // so that scalable vectorization is slightly favorable over fixed-width
4310 // vectorization.
4311 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4312 A.Width.isScalable() && !B.Width.isScalable();
4313
4314 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4315 const InstructionCost &RHS) {
4316 return PreferScalable ? LHS <= RHS : LHS < RHS;
4317 };
4318
4319 // To avoid the need for FP division:
4320 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4321 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4322 if (!MaxTripCount)
4323 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
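 // E.g. (illustrative numbers): CostA = 20 at estimated width 4 versus
 // CostB = 6 at width 1 compares as 20 * 1 < 6 * 4, i.e. 5 per lane beats
 // 6 per lane, without any floating-point division.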
4324
4325 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4326 InstructionCost VectorCost,
4327 InstructionCost ScalarCost) {
4328 // If the trip count is a known (possibly small) constant, the trip count
4329 // will be rounded up to an integer number of iterations under
4330 // FoldTailByMasking. The total cost in that case will be
4331 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4332 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4333 // some extra overheads, but for the purpose of comparing the costs of
4334 // different VFs we can use this to compare the total loop-body cost
4335 // expected after vectorization.
4336 if (CM.foldTailByMasking())
4337 return VectorCost * divideCeil(MaxTripCount, VF);
4338 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4339 };
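 // Illustrative numbers only: with MaxTripCount = 10, VF = 4, VectorCost = 8
 // and ScalarCost = 3, tail folding yields 8 * ceil(10/4) = 24, while a
 // scalar remainder yields 8 * (10/4) + 3 * (10 % 4) = 16 + 6 = 22.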
4340
4341 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4342 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4343 return CmpFn(RTCostA, RTCostB);
4344}
4345
4348 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4349 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
4350 SmallVector<RecipeVFPair> InvalidCosts;
4351 for (const auto &Plan : VPlans) {
4352 for (ElementCount VF : Plan->vectorFactors()) {
4353 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4354 LLVMCtx, CM);
4355 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4356 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4357 for (auto &R : *VPBB) {
4358 if (!R.cost(VF, CostCtx).isValid())
4359 InvalidCosts.emplace_back(&R, VF);
4360 }
4361 }
4362 }
4363 }
4364 if (InvalidCosts.empty())
4365 return;
4366
4367 // Emit a report of VFs with invalid costs in the loop.
4368
4369 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4371 unsigned I = 0;
4372 for (auto &Pair : InvalidCosts)
4373 if (!Numbering.count(Pair.first))
4374 Numbering[Pair.first] = I++;
4375
4376 // Sort the list, first on recipe(number) then on VF.
4377 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4378 if (Numbering[A.first] != Numbering[B.first])
4379 return Numbering[A.first] < Numbering[B.first];
4380 const auto &LHS = A.second;
4381 const auto &RHS = B.second;
4382 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4383 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4384 });
4385
4386 // For a list of ordered recipe-VF pairs:
4387 // [(load, VF1), (load, VF2), (store, VF1)]
4388 // group the recipes together to emit separate remarks for:
4389 // load (VF1, VF2)
4390 // store (VF1)
4391 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4392 auto Subset = ArrayRef<RecipeVFPair>();
4393 do {
4394 if (Subset.empty())
4395 Subset = Tail.take_front(1);
4396
4397 VPRecipeBase *R = Subset.front().first;
4398
4399 unsigned Opcode =
4402 [](const auto *R) { return Instruction::PHI; })
4403 .Case<VPWidenSelectRecipe>(
4404 [](const auto *R) { return Instruction::Select; })
4405 .Case<VPWidenStoreRecipe>(
4406 [](const auto *R) { return Instruction::Store; })
4407 .Case<VPWidenLoadRecipe>(
4408 [](const auto *R) { return Instruction::Load; })
4409 .Case<VPWidenCallRecipe>(
4410 [](const auto *R) { return Instruction::Call; })
4413 [](const auto *R) { return R->getOpcode(); })
4414 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4415 return R->getStoredValues().empty() ? Instruction::Load
4416 : Instruction::Store;
4417 });
4418
4419 // If the next recipe is different, or if there are no other pairs,
4420 // emit a remark for the collated subset. e.g.
4421 // [(load, VF1), (load, VF2))]
4422 // to emit:
4423 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4424 if (Subset == Tail || Tail[Subset.size()].first != R) {
4425 std::string OutString;
4426 raw_string_ostream OS(OutString);
4427 assert(!Subset.empty() && "Unexpected empty range");
4428 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4429 for (const auto &Pair : Subset)
4430 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4431 OS << "):";
4432 if (Opcode == Instruction::Call) {
4433 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4434 Function *CalledFn =
4435 WidenCall ? WidenCall->getCalledScalarFunction()
4436 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4437 ->getLiveInIRValue());
4438 OS << " call to " << CalledFn->getName();
4439 } else
4440 OS << " " << Instruction::getOpcodeName(Opcode);
4441 OS.flush();
4442 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4443 R->getDebugLoc());
4444 Tail = Tail.drop_front(Subset.size());
4445 Subset = {};
4446 } else
4447 // Grow the subset by one element
4448 Subset = Tail.take_front(Subset.size() + 1);
4449 } while (!Tail.empty());
4450}
4451
4452/// Check if any recipe of \p Plan will generate a vector value, which will be
4453/// assigned a vector register.
4454static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4455 const TargetTransformInfo &TTI) {
4456 assert(VF.isVector() && "Checking a scalar VF?");
4457 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4459 DenseSet<VPRecipeBase *> EphemeralRecipes;
4460 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4461 // Set of already visited types.
4462 DenseSet<Type *> Visited;
4463 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4465 for (VPRecipeBase &R : *VPBB) {
4466 if (EphemeralRecipes.contains(&R))
4467 continue;
4468 // Continue early if the recipe is considered to not produce a vector
4469 // result. Note that this includes VPInstruction where some opcodes may
4470 // produce a vector, to preserve existing behavior as VPInstructions model
4471 // aspects not directly mapped to existing IR instructions.
4472 switch (R.getVPDefID()) {
4473 case VPDef::VPDerivedIVSC:
4474 case VPDef::VPScalarIVStepsSC:
4475 case VPDef::VPScalarCastSC:
4476 case VPDef::VPReplicateSC:
4477 case VPDef::VPInstructionSC:
4478 case VPDef::VPCanonicalIVPHISC:
4479 case VPDef::VPVectorPointerSC:
4480 case VPDef::VPExpandSCEVSC:
4481 case VPDef::VPEVLBasedIVPHISC:
4482 case VPDef::VPPredInstPHISC:
4483 case VPDef::VPBranchOnMaskSC:
4484 continue;
4485 case VPDef::VPReductionSC:
4486 case VPDef::VPActiveLaneMaskPHISC:
4487 case VPDef::VPWidenCallSC:
4488 case VPDef::VPWidenCanonicalIVSC:
4489 case VPDef::VPWidenCastSC:
4490 case VPDef::VPWidenGEPSC:
4491 case VPDef::VPWidenSC:
4492 case VPDef::VPWidenSelectSC:
4493 case VPDef::VPBlendSC:
4494 case VPDef::VPFirstOrderRecurrencePHISC:
4495 case VPDef::VPWidenPHISC:
4496 case VPDef::VPWidenIntOrFpInductionSC:
4497 case VPDef::VPWidenPointerInductionSC:
4498 case VPDef::VPReductionPHISC:
4499 case VPDef::VPInterleaveSC:
4500 case VPDef::VPWidenLoadEVLSC:
4501 case VPDef::VPWidenLoadSC:
4502 case VPDef::VPWidenStoreEVLSC:
4503 case VPDef::VPWidenStoreSC:
4504 break;
4505 default:
4506 llvm_unreachable("unhandled recipe");
4507 }
4508
4509 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4510 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4511 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4512 if (!NumLegalParts)
4513 return false;
4514 if (VF.isScalable()) {
4515 // <vscale x 1 x iN> is assumed to be profitable over iN because
4516 // scalable registers are a distinct register class from scalar
4517 // ones. If we ever find a target which wants to lower scalable
4518 // vectors back to scalars, we'll need to update this code to
4519 // explicitly ask TTI about the register class uses for each part.
4520 return NumLegalParts <= VF.getKnownMinValue();
4521 }
4522 // Two or more elements that share a register are vectorized.
4523 return NumLegalParts < VF.getKnownMinValue();
4524 };
4525
4526 // If the recipe defines no values and is not a store (e.g., a branch), continue - there is no value to check.
4527 if (R.getNumDefinedValues() == 0 &&
4528 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4529 &R))
4530 continue;
4531 // For multi-def recipes (currently only interleaved loads), it suffices to
4532 // check the first def only.
4533 // For stores, check their stored value; for interleaved stores it suffices
4534 // to check the first stored value only. In all cases this is the second
4535 // operand.
4536 VPValue *ToCheck =
4537 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4538 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4539 if (!Visited.insert({ScalarTy}).second)
4540 continue;
4541 if (WillWiden(ScalarTy))
4542 return true;
4543 }
4544 }
4545
4546 return false;
4547}
4548
4549VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4551 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4552 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4553 assert(any_of(VPlans,
4554 [](std::unique_ptr<VPlan> &P) {
4555 return P->hasVF(ElementCount::getFixed(1));
4556 }) &&
4557 "Expected Scalar VF to be a candidate");
4558
4559 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4560 ExpectedCost);
4561 VectorizationFactor ChosenFactor = ScalarCost;
4562
4563 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4564 if (ForceVectorization &&
4565 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4566 // Ignore scalar width, because the user explicitly wants vectorization.
4567 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4568 // evaluation.
4569 ChosenFactor.Cost = InstructionCost::getMax();
4570 }
4571
4572 for (auto &P : VPlans) {
4573 for (ElementCount VF : P->vectorFactors()) {
4574 // The cost for scalar VF=1 is already calculated, so ignore it.
4575 if (VF.isScalar())
4576 continue;
4577
4579 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4580
4581#ifndef NDEBUG
4582 unsigned AssumedMinimumVscale =
4583 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4584 unsigned Width =
4585 Candidate.Width.isScalable()
4586 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4587 : Candidate.Width.getFixedValue();
4588 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4589 << " costs: " << (Candidate.Cost / Width));
4590 if (VF.isScalable())
4591 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4592 << AssumedMinimumVscale << ")");
4593 LLVM_DEBUG(dbgs() << ".\n");
4594#endif
4595
4596 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4597 LLVM_DEBUG(
4598 dbgs()
4599 << "LV: Not considering vector loop of width " << VF
4600 << " because it will not generate any vector instructions.\n");
4601 continue;
4602 }
4603
4604 if (isMoreProfitable(Candidate, ChosenFactor))
4605 ChosenFactor = Candidate;
4606 }
4607 }
4608
4611 "There are conditional stores.",
4612 "store that is conditionally executed prevents vectorization",
4613 "ConditionalStore", ORE, OrigLoop);
4614 ChosenFactor = ScalarCost;
4615 }
4616
4617 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4618 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4619 << "LV: Vectorization seems to be not beneficial, "
4620 << "but was forced by a user.\n");
4621 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4622 return ChosenFactor;
4623}
4624
4625bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4626 ElementCount VF) const {
4627 // Cross iteration phis such as reductions need special handling and are
4628 // currently unsupported.
4629 if (any_of(OrigLoop->getHeader()->phis(),
4630 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4631 return false;
4632
4633 // Phis with uses outside of the loop require special handling and are
4634 // currently unsupported.
4635 for (const auto &Entry : Legal->getInductionVars()) {
4636 // Look for uses of the value of the induction at the last iteration.
4637 Value *PostInc =
4638 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4639 for (User *U : PostInc->users())
4640 if (!OrigLoop->contains(cast<Instruction>(U)))
4641 return false;
4642 // Look for uses of penultimate value of the induction.
4643 for (User *U : Entry.first->users())
4644 if (!OrigLoop->contains(cast<Instruction>(U)))
4645 return false;
4646 }
4647
4648 // Epilogue vectorization code has not been audited to ensure it handles
4649 // non-latch exits properly. It may be fine, but it needs to be audited and
4650 // tested.
4651 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4652 return false;
4653
4654 return true;
4655}
4656
4658 const ElementCount VF) const {
4659 // FIXME: We need a much better cost-model to take different parameters such
4660 // as register pressure, code size increase and cost of extra branches into
4661 // account. For now we apply a very crude heuristic and only consider loops
4662 // with vectorization factors larger than a certain value.
4663
4664 // Allow the target to opt out entirely.
4666 return false;
4667
4668 // We also consider epilogue vectorization unprofitable for targets that don't
4669 // consider interleaving beneficial (e.g., MVE).
4670 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4671 return false;
4672
4673 unsigned Multiplier = 1;
4674 if (VF.isScalable())
4675 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
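// Rough illustration (hypothetical numbers, not taken from the source): for
// VF = vscale x 4 on a target whose tuning vscale is 2, the estimated element
// count is 2 * 4 = 8, which is then compared against
// EpilogueVectorizationMinVF below to decide whether an epilogue loop is
// worth vectorizing.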
4676 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4677 return true;
4678 return false;
4679}
4680
4682 const ElementCount MainLoopVF, unsigned IC) {
4685 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4686 return Result;
4687 }
4688
4689 if (!CM.isScalarEpilogueAllowed()) {
4690 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4691 "epilogue is allowed.\n");
4692 return Result;
4693 }
4694
4695 // Not really a cost consideration, but check for unsupported cases here to
4696 // simplify the logic.
4697 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4698 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4699 "is not a supported candidate.\n");
4700 return Result;
4701 }
4702
4704 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4706 if (hasPlanWithVF(ForcedEC))
4707 return {ForcedEC, 0, 0};
4708 else {
4709 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4710 "viable.\n");
4711 return Result;
4712 }
4713 }
4714
4715 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4716 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4717 LLVM_DEBUG(
4718 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4719 return Result;
4720 }
4721
4722 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4723 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4724 "this loop\n");
4725 return Result;
4726 }
4727
4728 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4729 // the main loop handles 8 lanes per iteration. We could still benefit from
4730 // vectorizing the epilogue loop with VF=4.
4731 ElementCount EstimatedRuntimeVF = MainLoopVF;
4732 if (MainLoopVF.isScalable()) {
4733 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4734 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4735 EstimatedRuntimeVF *= *VScale;
4736 }
4737
4738 ScalarEvolution &SE = *PSE.getSE();
4739 Type *TCType = Legal->getWidestInductionType();
4740 const SCEV *RemainingIterations = nullptr;
4741 for (auto &NextVF : ProfitableVFs) {
4742 // Skip candidate VFs without a corresponding VPlan.
4743 if (!hasPlanWithVF(NextVF.Width))
4744 continue;
4745
4746 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4747 // vectors) or the VF of the main loop (fixed vectors).
4748 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4749 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4750 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4751 continue;
4752
4753 // If NextVF is greater than the number of remaining iterations, the
4754 // epilogue loop would be dead. Skip such factors.
4755 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4756 // TODO: extend to support scalable VFs.
4757 if (!RemainingIterations) {
4758 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
4759 RemainingIterations = SE.getURemExpr(
4760 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4761 }
4762 if (SE.isKnownPredicate(
4764 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4765 RemainingIterations))
4766 continue;
4767 }
4768
4769 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4770 Result = NextVF;
4771 }
4772
4773 if (Result != VectorizationFactor::Disabled())
4774 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4775 << Result.Width << "\n");
4776 return Result;
4777}
4778
4779std::pair<unsigned, unsigned>
4781 unsigned MinWidth = -1U;
4782 unsigned MaxWidth = 8;
4784 // For in-loop reductions, no element types are added to ElementTypesInLoop
4785 // if there are no loads/stores in the loop. In this case, check through the
4786 // reduction variables to determine the maximum width.
4787 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4788 // Reset MaxWidth so that we can find the smallest type used by recurrences
4789 // in the loop.
4790 MaxWidth = -1U;
4791 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4792 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4793 // When finding the min width used by the recurrence we need to account
4794 // for casts on the input operands of the recurrence.
4795 MaxWidth = std::min<unsigned>(
4796 MaxWidth, std::min<unsigned>(
4799 }
4800 } else {
4801 for (Type *T : ElementTypesInLoop) {
4802 MinWidth = std::min<unsigned>(
4803 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4804 MaxWidth = std::max<unsigned>(
4805 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4806 }
4807 }
4808 return {MinWidth, MaxWidth};
4809}
4810
4812 ElementTypesInLoop.clear();
4813 // For each block.
4814 for (BasicBlock *BB : TheLoop->blocks()) {
4815 // For each instruction in the loop.
4816 for (Instruction &I : BB->instructionsWithoutDebug()) {
4817 Type *T = I.getType();
4818
4819 // Skip ignored values.
4820 if (ValuesToIgnore.count(&I))
4821 continue;
4822
4823 // Only examine Loads, Stores and PHINodes.
4824 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4825 continue;
4826
4827 // Examine PHI nodes that are reduction variables. Update the type to
4828 // account for the recurrence type.
4829 if (auto *PN = dyn_cast<PHINode>(&I)) {
4830 if (!Legal->isReductionVariable(PN))
4831 continue;
4832 const RecurrenceDescriptor &RdxDesc =
4833 Legal->getReductionVars().find(PN)->second;
4836 RdxDesc.getRecurrenceType(),
4838 continue;
4839 T = RdxDesc.getRecurrenceType();
4840 }
4841
4842 // Examine the stored values.
4843 if (auto *ST = dyn_cast<StoreInst>(&I))
4844 T = ST->getValueOperand()->getType();
4845
4846 assert(T->isSized() &&
4847 "Expected the load/store/recurrence type to be sized");
4848
4849 ElementTypesInLoop.insert(T);
4850 }
4851 }
4852}
4853
4854unsigned
4856 InstructionCost LoopCost) {
4857 // -- The interleave heuristics --
4858 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4859 // There are many micro-architectural considerations that we can't predict
4860 // at this level. For example, frontend pressure (on decode or fetch) due to
4861 // code size, or the number and capabilities of the execution ports.
4862 //
4863 // We use the following heuristics to select the interleave count:
4864 // 1. If the code has reductions, then we interleave to break the cross
4865 // iteration dependency.
4866 // 2. If the loop is really small, then we interleave to reduce the loop
4867 // overhead.
4868 // 3. We don't interleave if we think that we will spill registers to memory
4869 // due to the increased register pressure.
4870
4872 return 1;
4873
4874 // Do not interleave if EVL is preferred and no User IC is specified.
4875 if (foldTailWithEVL()) {
4876 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4877 "Unroll factor forced to be 1.\n");
4878 return 1;
4879 }
4880
4881 // We used the distance for the interleave count.
4883 return 1;
4884
4885 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
4886 const bool HasReductions = !Legal->getReductionVars().empty();
4887
4888 // If we did not calculate the cost for VF (because the user selected the VF)
4889 // then we calculate the cost of VF here.
4890 if (LoopCost == 0) {
4891 LoopCost = expectedCost(VF);
4892 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4893
4894 // Loop body is free and there is no need for interleaving.
4895 if (LoopCost == 0)
4896 return 1;
4897 }
4898
4900 // We divide by these values below, so assume that we have at least one
4901 // instruction that uses at least one register.
4902 for (auto& pair : R.MaxLocalUsers) {
4903 pair.second = std::max(pair.second, 1U);
4904 }
4905
4906 // We calculate the interleave count using the following formula.
4907 // Subtract the number of loop invariants from the number of available
4908 // registers. These registers are used by all of the interleaved instances.
4909 // Next, divide the remaining registers by the number of registers that is
4910 // required by the loop, in order to estimate how many parallel instances
4911 // fit without causing spills. All of this is rounded down if necessary to be
4912 // a power of two. We want power of two interleave count to simplify any
4913 // addressing operations or alignment considerations.
4914 // We also want power of two interleave counts to ensure that the induction
4915 // variable of the vector loop wraps to zero, when tail is folded by masking;
4916 // this currently happens when OptForSize, in which case IC is set to 1 above.
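// Worked example with hypothetical numbers: a register class with 32
// registers, 2 of them tied up by loop-invariant values and a peak of 6
// values live at once, yields bit_floor((32 - 2) / 6) = bit_floor(5) = 4
// interleaved copies before spills are expected.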
4917 unsigned IC = UINT_MAX;
4918
4919 for (auto& pair : R.MaxLocalUsers) {
4920 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4921 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4922 << " registers of "
4923 << TTI.getRegisterClassName(pair.first) << " register class\n");
4924 if (VF.isScalar()) {
4925 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4926 TargetNumRegisters = ForceTargetNumScalarRegs;
4927 } else {
4928 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4929 TargetNumRegisters = ForceTargetNumVectorRegs;
4930 }
4931 unsigned MaxLocalUsers = pair.second;
4932 unsigned LoopInvariantRegs = 0;
4933 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
4934 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4935
4936 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4937 MaxLocalUsers);
4938 // Don't count the induction variable as interleaved.
4940 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4941 std::max(1U, (MaxLocalUsers - 1)));
4942 }
4943
4944 IC = std::min(IC, TmpIC);
4945 }
4946
4947 // Clamp the interleave ranges to reasonable counts.
4948 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4949
4950 // Check if the user has overridden the max.
4951 if (VF.isScalar()) {
4952 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4953 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4954 } else {
4955 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4956 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4957 }
4958
4959 unsigned EstimatedVF = VF.getKnownMinValue();
4960 if (VF.isScalable()) {
4961 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4962 EstimatedVF *= *VScale;
4963 }
4964 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4965
4966 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4967 if (KnownTC > 0) {
4968 // At least one iteration must be scalar when this constraint holds. So the
4969 // maximum number of iterations available for interleaving is one less.
4970 unsigned AvailableTC =
4971 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4972
4973 // If trip count is known we select between two prospective ICs, where
4974 // 1) the aggressive IC is capped by the trip count divided by VF
4975 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4976 // The final IC is selected in a way that the epilogue loop trip count is
4977 // minimized while maximizing the IC itself, so that we either run the
4978 // vector loop at least once if it generates a small epilogue loop, or else
4979 // we run the vector loop at least twice.
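// Hypothetical illustration: with AvailableTC = 20, EstimatedVF = 4 and a
// target maximum of 8, InterleaveCountUB = bit_floor(min(20 / 4, 8)) = 4 and
// InterleaveCountLB = bit_floor(min(20 / 8, 8)) = 2. Both leave a scalar tail
// of 4 iterations (20 % 16 and 20 % 8), so the larger count (4) is kept.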
4980
4981 unsigned InterleaveCountUB = bit_floor(
4982 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4983 unsigned InterleaveCountLB = bit_floor(std::max(
4984 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4985 MaxInterleaveCount = InterleaveCountLB;
4986
4987 if (InterleaveCountUB != InterleaveCountLB) {
4988 unsigned TailTripCountUB =
4989 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4990 unsigned TailTripCountLB =
4991 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4992 // If both produce the same scalar tail, maximize the IC to do the same work
4993 // in fewer vector loop iterations.
4994 if (TailTripCountUB == TailTripCountLB)
4995 MaxInterleaveCount = InterleaveCountUB;
4996 }
4997 } else if (BestKnownTC && *BestKnownTC > 0) {
4998 // At least one iteration must be scalar when this constraint holds. So the
4999 // maximum number of iterations available for interleaving is one less.
5000 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5001 ? (*BestKnownTC) - 1
5002 : *BestKnownTC;
5003
5004 // If the trip count is only an estimated compile-time constant, cap the
5005 // IC at the trip count divided by (VF * 2), so that the vector
5006 // loop runs at least twice to make interleaving seem profitable when there
5007 // is an epilogue loop present. Since the exact trip count is not known, we
5008 // choose to be conservative in our IC estimate.
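// For instance (hypothetical numbers), an estimated trip count of 100 with
// EstimatedVF = 8 caps the IC at bit_floor(100 / 16) = bit_floor(6) = 4,
// assuming the target maximum is not smaller.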
5009 MaxInterleaveCount = bit_floor(std::max(
5010 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5011 }
5012
5013 assert(MaxInterleaveCount > 0 &&
5014 "Maximum interleave count must be greater than 0");
5015
5016 // Clamp the calculated IC to be between 1 and the max interleave count
5017 // that the target and trip count allow.
5018 if (IC > MaxInterleaveCount)
5019 IC = MaxInterleaveCount;
5020 else
5021 // Make sure IC is greater than 0.
5022 IC = std::max(1u, IC);
5023
5024 assert(IC > 0 && "Interleave count must be greater than 0.");
5025
5026 // Interleave if we vectorized this loop and there is a reduction that could
5027 // benefit from interleaving.
5028 if (VF.isVector() && HasReductions) {
5029 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5030 return IC;
5031 }
5032
5033 // For any scalar loop that either requires runtime checks or predication we
5034 // are better off leaving this to the unroller. Note that if we've already
5035 // vectorized the loop we will have done the runtime check and so interleaving
5036 // won't require further checks.
5037 bool ScalarInterleavingRequiresPredication =
5038 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5039 return Legal->blockNeedsPredication(BB);
5040 }));
5041 bool ScalarInterleavingRequiresRuntimePointerCheck =
5043
5044 // We want to interleave small loops in order to reduce the loop overhead and
5045 // potentially expose ILP opportunities.
5046 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5047 << "LV: IC is " << IC << '\n'
5048 << "LV: VF is " << VF << '\n');
5049 const bool AggressivelyInterleaveReductions =
5050 TTI.enableAggressiveInterleaving(HasReductions);
5051 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5052 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5053 // We assume that the cost overhead is 1 and we use the cost model
5054 // to estimate the cost of the loop and interleave until the cost of the
5055 // loop overhead is about 5% of the cost of the loop.
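// Sketch with hypothetical numbers: if the loop body costs 4 and
// SmallLoopCost is 20 (assumed here to be the option's default), then
// SmallIC = min(IC, bit_floor(20 / 4)) = min(IC, 4).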
5056 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5057 SmallLoopCost / *LoopCost.getValue()));
5058
5059 // Interleave until store/load ports (estimated by max interleave count) are
5060 // saturated.
5061 unsigned NumStores = Legal->getNumStores();
5062 unsigned NumLoads = Legal->getNumLoads();
5063 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5064 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5065
5066 // There is little point in interleaving for reductions containing selects
5067 // and compares when VF=1 since it may just create more overhead than it's
5068 // worth for loops with small trip counts. This is because we still have to
5069 // do the final reduction after the loop.
5070 bool HasSelectCmpReductions =
5071 HasReductions &&
5072 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5073 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5074 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5075 RdxDesc.getRecurrenceKind());
5076 });
5077 if (HasSelectCmpReductions) {
5078 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5079 return 1;
5080 }
5081
5082 // If we have a scalar reduction (vector reductions are already dealt with
5083 // by this point), we can increase the critical path length if the loop
5084 // we're interleaving is inside another loop. For tree-wise reductions
5085 // set the limit to 2, and for ordered reductions it's best to disable
5086 // interleaving entirely.
5087 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5088 bool HasOrderedReductions =
5089 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5090 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5091 return RdxDesc.isOrdered();
5092 });
5093 if (HasOrderedReductions) {
5094 LLVM_DEBUG(
5095 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5096 return 1;
5097 }
5098
5099 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5100 SmallIC = std::min(SmallIC, F);
5101 StoresIC = std::min(StoresIC, F);
5102 LoadsIC = std::min(LoadsIC, F);
5103 }
5104
5106 std::max(StoresIC, LoadsIC) > SmallIC) {
5107 LLVM_DEBUG(
5108 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5109 return std::max(StoresIC, LoadsIC);
5110 }
5111
5112 // If there are scalar reductions and TTI has enabled aggressive
5113 // interleaving for reductions, we will interleave to expose ILP.
5114 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5115 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5116 // Interleave no less than SmallIC but not as aggressively as the normal IC,
5117 // to handle the rare situation when resources are too limited.
5118 return std::max(IC / 2, SmallIC);
5119 } else {
5120 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5121 return SmallIC;
5122 }
5123 }
5124
5125 // Interleave if this is a large loop (small loops are already dealt with by
5126 // this point) that could benefit from interleaving.
5127 if (AggressivelyInterleaveReductions) {
5128 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5129 return IC;
5130 }
5131
5132 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5133 return 1;
5134}
5135
5138 // This function calculates the register usage by measuring the highest number
5139 // of values that are alive at a single location. Obviously, this is a very
5140 // rough estimation. We scan the loop in topological order and
5141 // assign a number to each instruction. We use RPO to ensure that defs are
5142 // met before their users. We assume that each instruction that has in-loop
5143 // users starts an interval. We record every time that an in-loop value is
5144 // used, so we have a list of the first and last occurrences of each
5145 // instruction. Next, we transpose this data structure into a multi map that
5146 // holds the list of intervals that *end* at a specific location. This multi
5147 // map allows us to perform a linear search. We scan the instructions linearly
5148 // and record each time that a new interval starts, by placing it in a set.
5149 // If we find this value in the multi-map then we remove it from the set.
5150 // The max register usage is the maximum size of the set.
5151 // We also search for instructions that are defined outside the loop, but are
5152 // used inside the loop. We need this number separately from the max-interval
5153 // usage number because when we unroll, loop-invariant values do not take
5154 // more registers.
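// A small sketch (hypothetical): for instructions numbered 0..3 with live
// intervals [0,2], [1,3] and [2,3], the set of open intervals peaks at two
// entries, so at most two values of that register class are estimated to be
// live at once.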
5156 DFS.perform(LI);
5157
5158 RegisterUsage RU;
5159
5160 // Each 'key' in the map opens a new interval. The values
5161 // of the map are the index of the 'last seen' usage of the
5162 // instruction that is the key.
5164
5165 // Maps instruction to its index.
5167 // Marks the end of each interval.
5168 IntervalMap EndPoint;
5169 // Saves the list of instruction indices that are used in the loop.
5171 // Saves the list of values that are used in the loop but are defined outside
5172 // the loop (not including non-instruction values such as arguments and
5173 // constants).
5174 SmallSetVector<Instruction *, 8> LoopInvariants;
5175
5176 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5177 for (Instruction &I : BB->instructionsWithoutDebug()) {
5178 IdxToInstr.push_back(&I);
5179
5180 // Save the end location of each USE.
5181 for (Value *U : I.operands()) {
5182 auto *Instr = dyn_cast<Instruction>(U);
5183
5184 // Ignore non-instruction values such as arguments, constants, etc.
5185 // FIXME: Might need some motivation why these values are ignored. If
5186 // for example an argument is used inside the loop it will increase the
5187 // register pressure (so shouldn't we add it to LoopInvariants).
5188 if (!Instr)
5189 continue;
5190
5191 // If this instruction is outside the loop then record it and continue.
5192 if (!TheLoop->contains(Instr)) {
5193 LoopInvariants.insert(Instr);
5194 continue;
5195 }
5196
5197 // Overwrite previous end points.
5198 EndPoint[Instr] = IdxToInstr.size();
5199 Ends.insert(Instr);
5200 }
5201 }
5202 }
5203
5204 // Saves the list of intervals that end with the index in 'key'.
5205 using InstrList = SmallVector<Instruction *, 2>;
5206 DenseMap<unsigned, InstrList> TransposeEnds;
5207
5208 // Transpose the EndPoints to a list of values that end at each index.
5209 for (auto &Interval : EndPoint)
5210 TransposeEnds[Interval.second].push_back(Interval.first);
5211
5212 SmallPtrSet<Instruction *, 8> OpenIntervals;
5215
5216 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5217
5218 const auto &TTICapture = TTI;
5219 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5220 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5221 return 0;
5222 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5223 };
5224
5225 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5226 Instruction *I = IdxToInstr[i];
5227
5228 // Remove all of the instructions that end at this location.
5229 InstrList &List = TransposeEnds[i];
5230 for (Instruction *ToRemove : List)
5231 OpenIntervals.erase(ToRemove);
5232
5233 // Ignore instructions that are never used within the loop.
5234 if (!Ends.count(I))
5235 continue;
5236
5237 // Skip ignored values.
5238 if (ValuesToIgnore.count(I))
5239 continue;
5240
5242
5243 // For each VF find the maximum usage of registers.
5244 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5245 // Count the number of registers used, per register class, given all open
5246 // intervals.
5247 // Note that elements in this SmallMapVector will be default constructed
5248 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5249 // there is no previous entry for ClassID.
5251
5252 if (VFs[j].isScalar()) {
5253 for (auto *Inst : OpenIntervals) {
5254 unsigned ClassID =
5255 TTI.getRegisterClassForType(false, Inst->getType());
5256 // FIXME: The target might use more than one register for the type
5257 // even in the scalar case.
5258 RegUsage[ClassID] += 1;
5259 }
5260 } else {
5262 for (auto *Inst : OpenIntervals) {
5263 // Skip ignored values for VF > 1.
5264 if (VecValuesToIgnore.count(Inst))
5265 continue;
5266 if (isScalarAfterVectorization(Inst, VFs[j])) {
5267 unsigned ClassID =
5268 TTI.getRegisterClassForType(false, Inst->getType());
5269 // FIXME: The target might use more than one register for the type
5270 // even in the scalar case.
5271 RegUsage[ClassID] += 1;
5272 } else {
5273 unsigned ClassID =
5274 TTI.getRegisterClassForType(true, Inst->getType());
5275 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5276 }
5277 }
5278 }
5279
5280 for (auto& pair : RegUsage) {
5281 auto &Entry = MaxUsages[j][pair.first];
5282 Entry = std::max(Entry, pair.second);
5283 }
5284 }
5285
5286 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5287 << OpenIntervals.size() << '\n');
5288
5289 // Add the current instruction to the list of open intervals.
5290 OpenIntervals.insert(I);
5291 }
5292
5293 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5294 // Note that elements in this SmallMapVector will be default constructed
5295 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5296 // there is no previous entry for ClassID.
5298
5299 for (auto *Inst : LoopInvariants) {
5300 // FIXME: The target might use more than one register for the type
5301 // even in the scalar case.
5302 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5303 auto *I = cast<Instruction>(U);
5304 return TheLoop != LI->getLoopFor(I->getParent()) ||
5305 isScalarAfterVectorization(I, VFs[i]);
5306 });
5307
5308 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5309 unsigned ClassID =
5310 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5311 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5312 }
5313
5314 LLVM_DEBUG({
5315 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5316 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5317 << " item\n";
5318 for (const auto &pair : MaxUsages[i]) {
5319 dbgs() << "LV(REG): RegisterClass: "
5320 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5321 << " registers\n";
5322 }
5323 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5324 << " item\n";
5325 for (const auto &pair : Invariant) {
5326 dbgs() << "LV(REG): RegisterClass: "
5327 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5328 << " registers\n";
5329 }
5330 });
5331
5332 RU.LoopInvariantRegs = Invariant;
5333 RU.MaxLocalUsers = MaxUsages[i];
5334 RUs[i] = RU;
5335 }
5336
5337 return RUs;
5338}
5339
5340bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5341 ElementCount VF) {
5342 // TODO: Cost model for emulated masked load/store is completely
5343 // broken. This hack guides the cost model to use an artificially
5344 // high enough value to practically disable vectorization with such
5345 // operations, except where the previously deployed legality hack allowed
5346 // using very low cost values. This is to avoid regressions coming simply
5347 // from moving the "masked load/store" check from legality to the cost model.
5348 // Masked Load/Gather emulation was previously never allowed.
5349 // Emulation of a limited number of masked stores/scatters was allowed.
5351 "Expecting a scalar emulated instruction");
5352 return isa<LoadInst>(I) ||
5353 (isa<StoreInst>(I) &&
5354 NumPredStores > NumberOfStoresToPredicate);
5355}
5356
5358 // If we aren't vectorizing the loop, or if we've already collected the
5359 // instructions to scalarize, there's nothing to do. Collection may already
5360 // have occurred if we have a user-selected VF and are now computing the
5361 // expected cost for interleaving.
5362 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5363 return;
5364
5365 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5366 // not profitable to scalarize any instructions, the presence of VF in the
5367 // map will indicate that we've analyzed it already.
5368 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5369
5370 PredicatedBBsAfterVectorization[VF].clear();
5371
5372 // Find all the instructions that are scalar with predication in the loop and
5373 // determine if it would be better to not if-convert the blocks they are in.
5374 // If so, we also record the instructions to scalarize.
5375 for (BasicBlock *BB : TheLoop->blocks()) {
5377 continue;
5378 for (Instruction &I : *BB)
5379 if (isScalarWithPredication(&I, VF)) {
5380 ScalarCostsTy ScalarCosts;
5381 // Do not apply discount logic for:
5382 // 1. Scalars after vectorization, as there will only be a single copy
5383 // of the instruction.
5384 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5385 // 3. Emulated masked memrefs, if a hacked cost is needed.
5386 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5387 !useEmulatedMaskMemRefHack(&I, VF) &&
5388 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5389 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5390 // Remember that BB will remain after vectorization.
5391 PredicatedBBsAfterVectorization[VF].insert(BB);
5392 for (auto *Pred : predecessors(BB)) {
5393 if (Pred->getSingleSuccessor() == BB)
5394 PredicatedBBsAfterVectorization[VF].insert(Pred);
5395 }
5396 }
5397 }
5398}
5399
5400InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5401 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5402 assert(!isUniformAfterVectorization(PredInst, VF) &&
5403 "Instruction marked uniform-after-vectorization will be predicated");
5404
5405 // Initialize the discount to zero, meaning that the scalar version and the
5406 // vector version cost the same.
5407 InstructionCost Discount = 0;
5408
5409 // Holds instructions to analyze. The instructions we visit are mapped in
5410 // ScalarCosts. Those instructions are the ones that would be scalarized if
5411 // we find that the scalar version costs less.
5413
5414 // Returns true if the given instruction can be scalarized.
5415 auto canBeScalarized = [&](Instruction *I) -> bool {
5416 // We only attempt to scalarize instructions forming a single-use chain
5417 // from the original predicated block that would otherwise be vectorized.
5418 // Although not strictly necessary, we give up on instructions we know will
5419 // already be scalar to avoid traversing chains that are unlikely to be
5420 // beneficial.
5421 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5423 return false;
5424
5425 // If the instruction is scalar with predication, it will be analyzed
5426 // separately. We ignore it within the context of PredInst.
5427 if (isScalarWithPredication(I, VF))
5428 return false;
5429
5430 // If any of the instruction's operands are uniform after vectorization,
5431 // the instruction cannot be scalarized. This prevents, for example, a
5432 // masked load from being scalarized.
5433 //
5434 // We assume we will only emit a value for lane zero of an instruction
5435 // marked uniform after vectorization, rather than VF identical values.
5436 // Thus, if we scalarize an instruction that uses a uniform, we would
5437 // create uses of values corresponding to the lanes we aren't emitting code
5438 // for. This behavior can be changed by allowing getScalarValue to clone
5439 // the lane zero values for uniforms rather than asserting.
5440 for (Use &U : I->operands())
5441 if (auto *J = dyn_cast<Instruction>(U.get()))
5442 if (isUniformAfterVectorization(J, VF))
5443 return false;
5444
5445 // Otherwise, we can scalarize the instruction.
5446 return true;
5447 };
5448
5449 // Compute the expected cost discount from scalarizing the entire expression
5450 // feeding the predicated instruction. We currently only consider expressions
5451 // that are single-use instruction chains.
5452 Worklist.push_back(PredInst);
5453 while (!Worklist.empty()) {
5454 Instruction *I = Worklist.pop_back_val();
5455
5456 // If we've already analyzed the instruction, there's nothing to do.
5457 if (ScalarCosts.contains(I))
5458 continue;
5459
5460 // Compute the cost of the vector instruction. Note that this cost already
5461 // includes the scalarization overhead of the predicated instruction.
5462 InstructionCost VectorCost = getInstructionCost(I, VF);
5463
5464 // Compute the cost of the scalarized instruction. This cost is the cost of
5465 // the instruction as if it wasn't if-converted and instead remained in the
5466 // predicated block. We will scale this cost by block probability after
5467 // computing the scalarization overhead.
5468 InstructionCost ScalarCost =
5470
5471 // Compute the scalarization overhead of needed insertelement instructions
5472 // and phi nodes.
5474 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5475 ScalarCost += TTI.getScalarizationOverhead(
5476 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5477 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5478 /*Extract*/ false, CostKind);
5479 ScalarCost +=
5480 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5481 }
5482
5483 // Compute the scalarization overhead of needed extractelement
5484 // instructions. For each of the instruction's operands, if the operand can
5485 // be scalarized, add it to the worklist; otherwise, account for the
5486 // overhead.
5487 for (Use &U : I->operands())
5488 if (auto *J = dyn_cast<Instruction>(U.get())) {
5489 assert(VectorType::isValidElementType(J->getType()) &&
5490 "Instruction has non-scalar type");
5491 if (canBeScalarized(J))
5492 Worklist.push_back(J);
5493 else if (needsExtract(J, VF)) {
5494 ScalarCost += TTI.getScalarizationOverhead(
5495 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5496 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5497 /*Extract*/ true, CostKind);
5498 }
5499 }
5500
5501 // Scale the total scalar cost by block probability.
5502 ScalarCost /= getReciprocalPredBlockProb();
5503
5504 // Compute the discount. A non-negative discount means the vector version
5505 // of the instruction costs more, and scalarizing would be beneficial.
5506 Discount += VectorCost - ScalarCost;
5507 ScalarCosts[I] = ScalarCost;
5508 }
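// Illustrative (hypothetical costs): VectorCost = 10 against a
// probability-scaled ScalarCost = 4 contributes +6 to the discount, meaning
// scalarizing looks better, whereas VectorCost = 6 against ScalarCost = 8
// contributes -2, meaning the vector form is kept.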
5509
5510 return Discount;
5511}
5512
5515
5516 // For each block.
5517 for (BasicBlock *BB : TheLoop->blocks()) {
5518 InstructionCost BlockCost;
5519
5520 // For each instruction in the old loop.
5521 for (Instruction &I : BB->instructionsWithoutDebug()) {
5522 // Skip ignored values.
5523 if (ValuesToIgnore.count(&I) ||
5524 (VF.isVector() && VecValuesToIgnore.count(&I)))
5525 continue;
5526
5528
5529 // Check if we should override the cost.
5530 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5532
5533 BlockCost += C;
5534 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5535 << VF << " For instruction: " << I << '\n');
5536 }
5537
5538 // If we are vectorizing a predicated block, it will have been
5539 // if-converted. This means that the block's instructions (aside from
5540 // stores and instructions that may divide by zero) will now be
5541 // unconditionally executed. For the scalar case, we may not always execute
5542 // the predicated block, if it is an if-else block. Thus, scale the block's
5543 // cost by the probability of executing it. blockNeedsPredication from
5544 // Legal is used so as to not include all blocks in tail folded loops.
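// For instance, a predicated block whose instructions sum to a cost of 12 is
// assumed to execute roughly half the time, so it contributes 12 / 2 = 6
// (getReciprocalPredBlockProb models this 50% assumption; the numbers here
// are hypothetical).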
5545 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5546 BlockCost /= getReciprocalPredBlockProb();
5547
5548 Cost += BlockCost;
5549 }
5550
5551 return Cost;
5552}
5553
5554/// Gets Address Access SCEV after verifying that the access pattern
5555/// is loop invariant except the induction variable dependence.
5556///
5557/// This SCEV can be sent to the Target in order to estimate the address
5558/// calculation cost.
5560 Value *Ptr,
5563 const Loop *TheLoop) {
5564
5565 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5566 if (!Gep)
5567 return nullptr;
5568
5569 // We are looking for a gep with all loop invariant indices except for one
5570 // which should be an induction variable.
5571 auto SE = PSE.getSE();
5572 unsigned NumOperands = Gep->getNumOperands();
5573 for (unsigned i = 1; i < NumOperands; ++i) {
5574 Value *Opd = Gep->getOperand(i);
5575 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5576 !Legal->isInductionVariable(Opd))
5577 return nullptr;
5578 }
5579
5580 // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5581 return PSE.getSCEV(Ptr);
5582}
5583
5585LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5586 ElementCount VF) {
5587 assert(VF.isVector() &&
5588 "Scalarization cost of instruction implies vectorization.");
5589 if (VF.isScalable())
5591
5592 Type *ValTy = getLoadStoreType(I);
5593 auto SE = PSE.getSE();
5594
5595 unsigned AS = getLoadStoreAddressSpace(I);
5597 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5598 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5599 // that it is being called from this specific place.
5600
5601 // Figure out whether the access is strided and get the stride value
5602 // if it's known at compile time.
5603 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5604
5605 // Get the cost of the scalar memory instruction and address computation.
5607 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5608
5609 // Don't pass *I here, since it is scalar but will actually be part of a
5610 // vectorized loop where the user of it is a vectorized instruction.
5612 const Align Alignment = getLoadStoreAlignment(I);
5613 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5614 ValTy->getScalarType(),
5615 Alignment, AS, CostKind);
5616
5617 // Get the overhead of the extractelement and insertelement instructions
5618 // we might create due to scalarization.
5619 Cost += getScalarizationOverhead(I, VF, CostKind);
5620
5621 // If we have a predicated load/store, it will need extra i1 extracts and
5622 // conditional branches, but may not be executed for each vector lane. Scale
5623 // the cost by the probability of executing the predicated block.
5624 if (isPredicatedInst(I)) {
5626
5627 // Add the cost of an i1 extract and a branch
5628 auto *Vec_i1Ty =
5631 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5632 /*Insert=*/false, /*Extract=*/true, CostKind);
5633 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5634
5635 if (useEmulatedMaskMemRefHack(I, VF))
5636 // Artificially setting to a high enough value to practically disable
5637 // vectorization with such operations.
5638 Cost = 3000000;
5639 }
5640
5641 return Cost;
5642}
5643
5645LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5646 ElementCount VF) {
5647 Type *ValTy = getLoadStoreType(I);
5648 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5650 unsigned AS = getLoadStoreAddressSpace(I);
5651 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5653
5654 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5655 "Stride should be 1 or -1 for consecutive memory access");
5656 const Align Alignment = getLoadStoreAlignment(I);
5658 if (Legal->isMaskRequired(I)) {
5659 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5660 CostKind);
5661 } else {
5662 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5663 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5664 CostKind, OpInfo, I);
5665 }
5666
5667 bool Reverse = ConsecutiveStride < 0;
5668 if (Reverse)
5670 std::nullopt, CostKind, 0);
5671 return Cost;
5672}
5673
5675LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5676 ElementCount VF) {
5677 assert(Legal->isUniformMemOp(*I, VF));
5678
5679 Type *ValTy = getLoadStoreType(I);
5680 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5681 const Align Alignment = getLoadStoreAlignment(I);
5682 unsigned AS = getLoadStoreAddressSpace(I);
5684 if (isa<LoadInst>(I)) {
5685 return TTI.getAddressComputationCost(ValTy) +
5686 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5687 CostKind) +
5689 }
5690 StoreInst *SI = cast<StoreInst>(I);
5691
5692 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5693 return TTI.getAddressComputationCost(ValTy) +
5694 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5695 CostKind) +
5696 (isLoopInvariantStoreValue
5697 ? 0
5698 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5699 CostKind, VF.getKnownMinValue() - 1));
5700}
5701
5703LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5704 ElementCount VF) {
5705 Type *ValTy = getLoadStoreType(I);
5706 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5707 const Align Alignment = getLoadStoreAlignment(I);
5709
5710 return TTI.getAddressComputationCost(VectorTy) +
5712 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5714}
5715
5717LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5718 ElementCount VF) {
5719 Type *ValTy = getLoadStoreType(I);
5720 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5721 unsigned AS = getLoadStoreAddressSpace(I);
5723
5724 auto Group = getInterleavedAccessGroup(I);
5725 assert(Group && "Fail to get an interleaved access group.");
5726
5727 unsigned InterleaveFactor = Group->getFactor();
5728 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5729
5730 // Holds the indices of existing members in the interleaved group.
5732 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5733 if (Group->getMember(IF))
5734 Indices.push_back(IF);
5735
5736 // Calculate the cost of the whole interleaved group.
5737 bool UseMaskForGaps =
5738 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5739 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5741 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
5742 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
5743
5744 if (Group->isReverse()) {
5745 // TODO: Add support for reversed masked interleaved access.
5747 "Reverse masked interleaved access not supported.");
5748 Cost += Group->getNumMembers() *
5750 std::nullopt, CostKind, 0);
5751 }
5752 return Cost;
5753}
5754
5755std::optional<InstructionCost>
5757 Instruction *I, ElementCount VF, Type *Ty,
5759 using namespace llvm::PatternMatch;
5760 // Early exit for no inloop reductions
5761 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5762 return std::nullopt;
5763 auto *VectorTy = cast<VectorType>(Ty);
5764
5765 // We are looking for a pattern of, and finding the minimal acceptable cost:
5766 // reduce(mul(ext(A), ext(B))) or
5767 // reduce(mul(A, B)) or
5768 // reduce(ext(A)) or
5769 // reduce(A).
5770 // The basic idea is that we walk down the tree to do that, finding the root
5771 // reduction instruction in InLoopReductionImmediateChains. From there we find
5772 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5773 // of the components. If the reduction cost is lower then we return it for the
5774 // reduction instruction and 0 for the other instructions in the pattern. If
5775 // it is not, we return an invalid cost specifying that the original cost method
5776 // should be used.
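// Rough illustration (all costs hypothetical): for
// reduce.add(ext(mul(ext(A), ext(B)))) with ExtCost = 1 per extend,
// MulCost = 1, Ext2Cost = 1 and BaseCost = 4, a fused extended-reduction cost
// of 3 beats 2 * 1 + 1 + 1 + 4 = 8, so the root gets cost 3 and the matched
// mul/ext instructions report 0.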
5777 Instruction *RetI = I;
5778 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5779 if (!RetI->hasOneUser())
5780 return std::nullopt;
5781 RetI = RetI->user_back();
5782 }
5783
5784 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5785 RetI->user_back()->getOpcode() == Instruction::Add) {
5786 RetI = RetI->user_back();
5787 }
5788
5789 // Test if the found instruction is a reduction, and if not return an invalid
5790 // cost specifying the parent to use the original cost modelling.
5791 if (!InLoopReductionImmediateChains.count(RetI))
5792 return std::nullopt;
5793
5794 // Find the reduction this chain is a part of and calculate the basic cost of
5795 // the reduction on its own.
5796 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5797 Instruction *ReductionPhi = LastChain;
5798 while (!isa<PHINode>(ReductionPhi))
5799 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5800
5801 const RecurrenceDescriptor &RdxDesc =
5802 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5803
5804 InstructionCost BaseCost;
5805 RecurKind RK = RdxDesc.getRecurrenceKind();
5808 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5809 RdxDesc.getFastMathFlags(), CostKind);
5810 } else {
5812 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5813 }
5814
5815 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5816 // normal fmul instruction to the cost of the fadd reduction.
5817 if (RK == RecurKind::FMulAdd)
5818 BaseCost +=
5819 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5820
5821 // If we're using ordered reductions then we can just return the base cost
5822 // here, since getArithmeticReductionCost calculates the full ordered
5823 // reduction cost when FP reassociation is not allowed.
5824 if (useOrderedReductions(RdxDesc))
5825 return BaseCost;
5826
5827 // Get the operand that was not the reduction chain and match it to one of the
5828 // patterns, returning the better cost if it is found.
5829 Instruction *RedOp = RetI->getOperand(1) == LastChain
5830 ? dyn_cast<Instruction>(RetI->getOperand(0))
5831 : dyn_cast<Instruction>(RetI->getOperand(1));
5832
5833 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5834
5835 Instruction *Op0, *Op1;
5836 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5837 match(RedOp,
5839 match(Op0, m_ZExtOrSExt(m_Value())) &&
5840 Op0->getOpcode() == Op1->getOpcode() &&
5841 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5843 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5844
5845 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5846 // Note that the extend opcodes need to all match, or if A==B they will have
5847 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5848 // which is equally fine.
5849 bool IsUnsigned = isa<ZExtInst>(Op0);
5850 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5851 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5852
5853 InstructionCost ExtCost =
5854 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5856 InstructionCost MulCost =
5857 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5858 InstructionCost Ext2Cost =
5859 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5861
5863 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5864
5865 if (RedCost.isValid() &&
5866 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5867 return I == RetI ? RedCost : 0;
5868 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5869 !TheLoop->isLoopInvariant(RedOp)) {
5870 // Matched reduce(ext(A))
5871 bool IsUnsigned = isa<ZExtInst>(RedOp);
5872 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5874 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5875 RdxDesc.getFastMathFlags(), CostKind);
5876
5877 InstructionCost ExtCost =
5878 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5880 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5881 return I == RetI ? RedCost : 0;
5882 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5883 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5884 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5885 Op0->getOpcode() == Op1->getOpcode() &&
5887 bool IsUnsigned = isa<ZExtInst>(Op0);
5888 Type *Op0Ty = Op0->getOperand(0)->getType();
5889 Type *Op1Ty = Op1->getOperand(0)->getType();
5890 Type *LargestOpTy =
5891 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5892 : Op0Ty;
5893 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5894
5895 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5896 // different sizes. We take the largest type as the ext to reduce, and add
5897 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5899 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5902 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5904 InstructionCost MulCost =
5905 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5906
5908 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5909 InstructionCost ExtraExtCost = 0;
5910 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5911 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5912 ExtraExtCost = TTI.getCastInstrCost(
5913 ExtraExtOp->getOpcode(), ExtType,
5914 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5916 }
5917
5918 if (RedCost.isValid() &&
5919 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5920 return I == RetI ? RedCost : 0;
5921 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5922 // Matched reduce.add(mul())
5923 InstructionCost MulCost =
5924 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5925
5927 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5928
5929 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5930 return I == RetI ? RedCost : 0;
5931 }
5932 }
5933
5934 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5935}
5936
5938LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5939 ElementCount VF) {
5940 // Calculate scalar cost only. Vectorization cost should be ready at this
5941 // moment.
5942 if (VF.isScalar()) {
5943 Type *ValTy = getLoadStoreType(I);
5944 const Align Alignment = getLoadStoreAlignment(I);
5945 unsigned AS = getLoadStoreAddressSpace(I);
5946
5947 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5948 return TTI.getAddressComputationCost(ValTy) +
5949 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5950 TTI::TCK_RecipThroughput, OpInfo, I);
5951 }
5952 return getWideningCost(I, VF);
5953}
5954
5955InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5957
5958 // There is no mechanism yet to create a scalable scalarization loop,
5959 // so this is currently Invalid.
5960 if (VF.isScalable())
5962
5963 if (VF.isScalar())
5964 return 0;
5965
5967 Type *RetTy = ToVectorTy(I->getType(), VF);
5968 if (!RetTy->isVoidTy() &&
5969 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5971 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5972 /*Insert*/ true,
5973 /*Extract*/ false, CostKind);
5974
5975 // Some targets keep addresses scalar.
5976 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5977 return Cost;
5978
5979 // Some targets support efficient element stores.
5980 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5981 return Cost;
5982
5983 // Collect operands to consider.
5984 CallInst *CI = dyn_cast<CallInst>(I);
5985 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5986
5987 // Skip operands that do not require extraction/scalarization and do not incur
5988 // any overhead.
5990 for (auto *V : filterExtractingOperands(Ops, VF))
5991 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
5993 filterExtractingOperands(Ops, VF), Tys, CostKind);
5994}
5995
5997 if (VF.isScalar())
5998 return;
5999 NumPredStores = 0;
6000 for (BasicBlock *BB : TheLoop->blocks()) {
6001 // For each instruction in the old loop.
6002 for (Instruction &I : *BB) {
6004 if (!Ptr)
6005 continue;
6006
6007 // TODO: We should generate better code and update the cost model for
6008 // predicated uniform stores. Today they are treated as any other
6009 // predicated store (see added test cases in
6010 // invariant-store-vectorization.ll).
6011 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6012 NumPredStores++;
6013
6014 if (Legal->isUniformMemOp(I, VF)) {
6015 auto isLegalToScalarize = [&]() {
6016 if (!VF.isScalable())
6017 // Scalarization of fixed length vectors "just works".
6018 return true;
6019
6020 // We have dedicated lowering for unpredicated uniform loads and
6021 // stores. Note that even with tail folding we know that at least
6022 // one lane is active (i.e. generalized predication is not possible
6023 // here), and the logic below depends on this fact.
6024 if (!foldTailByMasking())
6025 return true;
6026
6027 // For scalable vectors, a uniform memop load is always
6028 // uniform-by-parts and we know how to scalarize that.
6029 if (isa<LoadInst>(I))
6030 return true;
6031
6032 // A uniform store isn't necessarily uniform-by-parts
6033 // and we can't assume scalarization.
6034 auto &SI = cast<StoreInst>(I);
6035 return TheLoop->isLoopInvariant(SI.getValueOperand());
6036 };
6037
6038 const InstructionCost GatherScatterCost =
6040 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6041
6042 // Load: Scalar load + broadcast
6043 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6044 // FIXME: This cost is a significant under-estimate for tail folded
6045 // memory ops.
6046 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6047 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6048
6049 // Choose the better solution for the current VF. Note that invalid
6050 // costs compare as maximally large. If both are invalid, the result
6051 // remains invalid, which signals a failure and a vectorization abort.
6052 if (GatherScatterCost < ScalarizationCost)
6053 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6054 else
6055 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6056 continue;
6057 }
6058
6059 // We assume that widening is the best solution when possible.
6060 if (memoryInstructionCanBeWidened(&I, VF)) {
6061 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6062 int ConsecutiveStride = Legal->isConsecutivePtr(
6064 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6065 "Expected consecutive stride.");
6066 InstWidening Decision =
6067 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6068 setWideningDecision(&I, VF, Decision, Cost);
6069 continue;
6070 }
6071
6072 // Choose between Interleaving, Gather/Scatter or Scalarization.
6074 unsigned NumAccesses = 1;
6075 if (isAccessInterleaved(&I)) {
6076 auto Group = getInterleavedAccessGroup(&I);
6077 assert(Group && "Fail to get an interleaved access group.");
6078
6079 // Make one decision for the whole group.
6080 if (getWideningDecision(&I, VF) != CM_Unknown)
6081 continue;
6082
6083 NumAccesses = Group->getNumMembers();
6085 InterleaveCost = getInterleaveGroupCost(&I, VF);
6086 }
6087
6088 InstructionCost GatherScatterCost =
6090 ? getGatherScatterCost(&I, VF) * NumAccesses
6092
6093 InstructionCost ScalarizationCost =
6094 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6095
6096 // Choose the better solution for the current VF,
6097 // record this decision, and use it during vectorization.
6099 InstWidening Decision;
6100 if (InterleaveCost <= GatherScatterCost &&
6101 InterleaveCost < ScalarizationCost) {
6102 Decision = CM_Interleave;
6103 Cost = InterleaveCost;
6104 } else if (GatherScatterCost < ScalarizationCost) {
6105 Decision = CM_GatherScatter;
6106 Cost = GatherScatterCost;
6107 } else {
6108 Decision = CM_Scalarize;
6109 Cost = ScalarizationCost;
6110 }
6111 // If the instruction belongs to an interleave group, the whole group
6112 // receives the same decision. The whole group receives the cost, but
6113 // the cost will actually be assigned to one instruction.
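      // For illustration (hypothetical access pattern): loads of A[2*i] and
      // A[2*i+1] forming a factor-2 interleave group get one decision here;
      // if CM_Interleave wins, a single member carries the whole group cost
      // and the remaining members are costed as zero when queried later.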
6114 if (auto Group = getInterleavedAccessGroup(&I))
6115 setWideningDecision(Group, VF, Decision, Cost);
6116 else
6117 setWideningDecision(&I, VF, Decision, Cost);
6118 }
6119 }
6120
6121 // Make sure that any load of address and any other address computation
6122 // remains scalar unless there is gather/scatter support. This avoids
6123 // inevitable extracts into address registers, and also has the benefit of
6124 // activating LSR more, since that pass can't optimize vectorized
6125 // addresses.
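  // Illustrative example (hypothetical IR): in a pointer-chasing loop such as
  //   %addr = load ptr, ptr %slot
  //   %val  = load i32, ptr %addr
  // the first load feeds an address, so without a preference for vectorized
  // addressing it is forced scalar below instead of being widened and then
  // extracted lane by lane.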
6126   if (TTI.prefersVectorizedAddressing())
6127     return;
6128
6129 // Start with all scalar pointer uses.
6130   SmallPtrSet<Instruction *, 8> AddrDefs;
6131   for (BasicBlock *BB : TheLoop->blocks())
6132 for (Instruction &I : *BB) {
6133 Instruction *PtrDef =
6134 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6135       if (PtrDef && TheLoop->contains(PtrDef) &&
6136           getWideningDecision(&I, VF) != CM_GatherScatter)
6137         AddrDefs.insert(PtrDef);
6138 }
6139
6140   // Add all instructions used to generate the addresses.
6141   SmallVector<Instruction *, 8> Worklist;
6142   append_range(Worklist, AddrDefs);
6143 while (!Worklist.empty()) {
6144 Instruction *I = Worklist.pop_back_val();
6145 for (auto &Op : I->operands())
6146 if (auto *InstOp = dyn_cast<Instruction>(Op))
6147 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6148 AddrDefs.insert(InstOp).second)
6149 Worklist.push_back(InstOp);
6150 }
6151
6152 for (auto *I : AddrDefs) {
6153 if (isa<LoadInst>(I)) {
6154       // Setting the desired widening decision should ideally be handled by
6155       // the cost functions, but since this involves the task of finding out
6156 // if the loaded register is involved in an address computation, it is
6157 // instead changed here when we know this is the case.
6158 InstWidening Decision = getWideningDecision(I, VF);
6159 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6160         // Scalarize a widened load of address.
6161         setWideningDecision(
6162             I, VF, CM_Scalarize,
6163 (VF.getKnownMinValue() *
6164 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6165 else if (auto Group = getInterleavedAccessGroup(I)) {
6166 // Scalarize an interleave group of address loads.
6167 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6168           if (Instruction *Member = Group->getMember(I))
6169             setWideningDecision(
6170                 Member, VF, CM_Scalarize,
6171 (VF.getKnownMinValue() *
6172 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6173 }
6174 }
6175 } else
6176       // Make sure I gets scalarized and receives a cost estimate without
6177 // scalarization overhead.
6178 ForcedScalars[VF].insert(I);
6179 }
6180}
6181
6183 assert(!VF.isScalar() &&
6184 "Trying to set a vectorization decision for a scalar VF");
6185
6186 for (BasicBlock *BB : TheLoop->blocks()) {
6187 // For each instruction in the old loop.
6188 for (Instruction &I : *BB) {
6189 CallInst *CI = dyn_cast<CallInst>(&I);
6190
6191 if (!CI)
6192 continue;
6193
6198
6199 Function *ScalarFunc = CI->getCalledFunction();
6200 Type *ScalarRetTy = CI->getType();
6201 SmallVector<Type *, 4> Tys, ScalarTys;
6202 bool MaskRequired = Legal->isMaskRequired(CI);
6203 for (auto &ArgOp : CI->args())
6204 ScalarTys.push_back(ArgOp->getType());
6205
6206 // Compute corresponding vector type for return value and arguments.
6207 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6208 for (Type *ScalarTy : ScalarTys)
6209 Tys.push_back(ToVectorTy(ScalarTy, VF));
6210
6211 // An in-loop reduction using an fmuladd intrinsic is a special case;
6212 // we don't want the normal cost for that intrinsic.
6214 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6217 std::nullopt, *RedCost);
6218 continue;
6219 }
6220
6221 // Estimate cost of scalarized vector call. The source operands are
6222 // assumed to be vectors, so we need to extract individual elements from
6223 // there, execute VF scalar calls, and then gather the result into the
6224 // vector return value.
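      // Rough illustration with hypothetical numbers: at VF = 4 this models
      //   4 * cost(scalar call) + cost(extracting 4 argument lanes)
      //                         + cost(inserting 4 result lanes),
      // i.e. ScalarCallCost scaled by VF plus ScalarizationCost below.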
6225 InstructionCost ScalarCallCost =
6226 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6227
6228 // Compute costs of unpacking argument values for the scalar calls and
6229 // packing the return values to a vector.
6230 InstructionCost ScalarizationCost =
6231 getScalarizationOverhead(CI, VF, CostKind);
6232
6233 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6234
6235 // Find the cost of vectorizing the call, if we can find a suitable
6236 // vector variant of the function.
6237 bool UsesMask = false;
6238 VFInfo FuncInfo;
6239 Function *VecFunc = nullptr;
6240 // Search through any available variants for one we can use at this VF.
6241 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6242 // Must match requested VF.
6243 if (Info.Shape.VF != VF)
6244 continue;
6245
6246 // Must take a mask argument if one is required
6247 if (MaskRequired && !Info.isMasked())
6248 continue;
6249
6250 // Check that all parameter kinds are supported
6251 bool ParamsOk = true;
6252 for (VFParameter Param : Info.Shape.Parameters) {
6253 switch (Param.ParamKind) {
6255 break;
6257 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6258 // Make sure the scalar parameter in the loop is invariant.
6259 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6260 TheLoop))
6261 ParamsOk = false;
6262 break;
6263 }
6265 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6266 // Find the stride for the scalar parameter in this loop and see if
6267 // it matches the stride for the variant.
6268 // TODO: do we need to figure out the cost of an extract to get the
6269 // first lane? Or do we hope that it will be folded away?
6270 ScalarEvolution *SE = PSE.getSE();
6271 const auto *SAR =
6272 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6273
6274 if (!SAR || SAR->getLoop() != TheLoop) {
6275 ParamsOk = false;
6276 break;
6277 }
6278
6279 const SCEVConstant *Step =
6280 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6281
6282 if (!Step ||
6283 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6284 ParamsOk = false;
6285
6286 break;
6287 }
6289 UsesMask = true;
6290 break;
6291 default:
6292 ParamsOk = false;
6293 break;
6294 }
6295 }
6296
6297 if (!ParamsOk)
6298 continue;
6299
6300 // Found a suitable candidate, stop here.
6301 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6302 FuncInfo = Info;
6303 break;
6304 }
6305
6306 // Add in the cost of synthesizing a mask if one wasn't required.
6307 InstructionCost MaskCost = 0;
6308 if (VecFunc && UsesMask && !MaskRequired)
6309 MaskCost = TTI.getShuffleCost(
6312 VecFunc->getFunctionType()->getContext()),
6313 VF));
6314
6315 if (TLI && VecFunc && !CI->isNoBuiltin())
6316 VectorCost =
6317 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6318
6319 // Find the cost of an intrinsic; some targets may have instructions that
6320 // perform the operation without needing an actual call.
6322 if (IID != Intrinsic::not_intrinsic)
6323 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6324
6325 InstructionCost Cost = ScalarCost;
6326 InstWidening Decision = CM_Scalarize;
6327
6328 if (VectorCost <= Cost) {
6329 Cost = VectorCost;
6330 Decision = CM_VectorCall;
6331 }
6332
6333 if (IntrinsicCost <= Cost) {
6334 Cost = IntrinsicCost;
6335 Decision = CM_IntrinsicCall;
6336 }
6337
6338 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6340 }
6341 }
6342}
6343
6346 ElementCount VF) {
6347 // If we know that this instruction will remain uniform, check the cost of
6348 // the scalar version.
6350 VF = ElementCount::getFixed(1);
6351
6352 if (VF.isVector() && isProfitableToScalarize(I, VF))
6353 return InstsToScalarize[VF][I];
6354
6355 // Forced scalars do not have any scalarization overhead.
6356 auto ForcedScalar = ForcedScalars.find(VF);
6357 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6358 auto InstSet = ForcedScalar->second;
6359 if (InstSet.count(I))
6360       return getInstructionCost(I, ElementCount::getFixed(1)) *
6361              VF.getKnownMinValue();
6362 }
6363
6364 Type *RetTy = I->getType();
6366 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6367 auto SE = PSE.getSE();
6369
6370 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6371 ElementCount VF) -> bool {
6372 if (VF.isScalar())
6373 return true;
6374
6375 auto Scalarized = InstsToScalarize.find(VF);
6376 assert(Scalarized != InstsToScalarize.end() &&
6377 "VF not yet analyzed for scalarization profitability");
6378 return !Scalarized->second.count(I) &&
6379 llvm::all_of(I->users(), [&](User *U) {
6380 auto *UI = cast<Instruction>(U);
6381 return !Scalarized->second.count(UI);
6382 });
6383 };
6384 (void) hasSingleCopyAfterVectorization;
6385
6386 Type *VectorTy;
6387 if (isScalarAfterVectorization(I, VF)) {
6388 // With the exception of GEPs and PHIs, after scalarization there should
6389 // only be one copy of the instruction generated in the loop. This is
6390 // because the VF is either 1, or any instructions that need scalarizing
6391     // have already been dealt with by the time we get here. As a result, we
6392     // don't have to multiply the instruction cost by VF.
6393 assert(I->getOpcode() == Instruction::GetElementPtr ||
6394 I->getOpcode() == Instruction::PHI ||
6395 (I->getOpcode() == Instruction::BitCast &&
6396 I->getType()->isPointerTy()) ||
6397 hasSingleCopyAfterVectorization(I, VF));
6398 VectorTy = RetTy;
6399 } else
6400 VectorTy = ToVectorTy(RetTy, VF);
6401
6402 if (VF.isVector() && VectorTy->isVectorTy() &&
6403 !TTI.getNumberOfParts(VectorTy))
6404     return InstructionCost::getInvalid();
6405 
6406 // TODO: We need to estimate the cost of intrinsic calls.
6407 switch (I->getOpcode()) {
6408 case Instruction::GetElementPtr:
6409 // We mark this instruction as zero-cost because the cost of GEPs in
6410 // vectorized code depends on whether the corresponding memory instruction
6411 // is scalarized or not. Therefore, we handle GEPs with the memory
6412 // instruction cost.
6413 return 0;
6414 case Instruction::Br: {
6415 // In cases of scalarized and predicated instructions, there will be VF
6416 // predicated blocks in the vectorized loop. Each branch around these
6417     // blocks also requires an extract of its vector compare i1 element.
6418 // Note that the conditional branch from the loop latch will be replaced by
6419 // a single branch controlling the loop, so there is no extra overhead from
6420 // scalarization.
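    // For illustration (hypothetical shape): a scalarized, predicated store at
    // VF = 4 becomes four single-lane blocks, each guarded by a branch on an
    // i1 extracted from the vector mask; the cost below charges one extract
    // plus one branch per lane.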
6421 bool ScalarPredicatedBB = false;
6422 BranchInst *BI = cast<BranchInst>(I);
6423 if (VF.isVector() && BI->isConditional() &&
6424 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6425 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6426 BI->getParent() != TheLoop->getLoopLatch())
6427 ScalarPredicatedBB = true;
6428
6429 if (ScalarPredicatedBB) {
6430 // Not possible to scalarize scalable vector with predicated instructions.
6431 if (VF.isScalable())
6432         return InstructionCost::getInvalid();
6433       // Return cost for branches around scalarized and predicated blocks.
6434 auto *Vec_i1Ty =
6435 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6436 return (
6438 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6439 /*Insert*/ false, /*Extract*/ true, CostKind) +
6440 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6441 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6442 // The back-edge branch will remain, as will all scalar branches.
6443 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6444 else
6445 // This branch will be eliminated by if-conversion.
6446 return 0;
6447 // Note: We currently assume zero cost for an unconditional branch inside
6448 // a predicated block since it will become a fall-through, although we
6449 // may decide in the future to call TTI for all branches.
6450 }
6451 case Instruction::Switch: {
6452 if (VF.isScalar())
6453 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6454 auto *Switch = cast<SwitchInst>(I);
6455 return Switch->getNumCases() *
6457 Instruction::ICmp,
6458 ToVectorTy(Switch->getCondition()->getType(), VF),
6459 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
6461 }
6462 case Instruction::PHI: {
6463 auto *Phi = cast<PHINode>(I);
6464
6465 // First-order recurrences are replaced by vector shuffles inside the loop.
6466 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6467 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6468 // penultimate value of the recurrence.
6469 // TODO: Consider vscale_range info.
6470 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6473 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6475 cast<VectorType>(VectorTy), Mask, CostKind,
6476 VF.getKnownMinValue() - 1);
6477 }
6478
6479 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6480 // converted into select instructions. We require N - 1 selects per phi
6481 // node, where N is the number of incoming values.
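    // E.g. (illustrative): a phi in an if-converted block with three incoming
    // values is lowered to two vector selects, i.e. 2 * cost(select).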
6482 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6483 return (Phi->getNumIncomingValues() - 1) *
6485 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6486 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6488
6489 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6490 }
6491 case Instruction::UDiv:
6492 case Instruction::SDiv:
6493 case Instruction::URem:
6494 case Instruction::SRem:
6495 if (VF.isVector() && isPredicatedInst(I)) {
6496 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6497 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6498 ScalarCost : SafeDivisorCost;
6499 }
6500 // We've proven all lanes safe to speculate, fall through.
6501 [[fallthrough]];
6502 case Instruction::Add:
6503 case Instruction::FAdd:
6504 case Instruction::Sub:
6505 case Instruction::FSub:
6506 case Instruction::Mul:
6507 case Instruction::FMul:
6508 case Instruction::FDiv:
6509 case Instruction::FRem:
6510 case Instruction::Shl:
6511 case Instruction::LShr:
6512 case Instruction::AShr:
6513 case Instruction::And:
6514 case Instruction::Or:
6515 case Instruction::Xor: {
6516 // If we're speculating on the stride being 1, the multiplication may
6517 // fold away. We can generalize this for all operations using the notion
6518 // of neutral elements. (TODO)
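    // E.g. (illustrative): for %off = mul i64 %i, %stride, if SCEV predicates
    // let us assume %stride == 1, the multiply folds away and costs nothing.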
6519 if (I->getOpcode() == Instruction::Mul &&
6520 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6521 PSE.getSCEV(I->getOperand(1))->isOne()))
6522 return 0;
6523
6524 // Detect reduction patterns
6525 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6526 return *RedCost;
6527
6528 // Certain instructions can be cheaper to vectorize if they have a constant
6529     // second vector operand. One example of this is shifts on x86.
6530 Value *Op2 = I->getOperand(1);
6531 auto Op2Info = TTI.getOperandInfo(Op2);
6532 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6533 Legal->isInvariant(Op2))
6535
6536 SmallVector<const Value *, 4> Operands(I->operand_values());
6538 I->getOpcode(), VectorTy, CostKind,
6539 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6540 Op2Info, Operands, I, TLI);
6541 }
6542 case Instruction::FNeg: {
6544 I->getOpcode(), VectorTy, CostKind,
6545 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6546 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6547 I->getOperand(0), I);
6548 }
6549 case Instruction::Select: {
6550 SelectInst *SI = cast<SelectInst>(I);
6551 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6552 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6553
6554 const Value *Op0, *Op1;
6555 using namespace llvm::PatternMatch;
6556 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6557 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6558 // select x, y, false --> x & y
6559 // select x, true, y --> x | y
6560 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6561 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6562 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6563 Op1->getType()->getScalarSizeInBits() == 1);
6564
6567 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6568 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6569 }
6570
6571 Type *CondTy = SI->getCondition()->getType();
6572 if (!ScalarCond)
6573 CondTy = VectorType::get(CondTy, VF);
6574
6576 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6577 Pred = Cmp->getPredicate();
6578 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6579 CostKind, I);
6580 }
6581 case Instruction::ICmp:
6582 case Instruction::FCmp: {
6583 Type *ValTy = I->getOperand(0)->getType();
6584 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6585 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6586 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6587 VectorTy = ToVectorTy(ValTy, VF);
6588 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6589 cast<CmpInst>(I)->getPredicate(), CostKind,
6590 I);
6591 }
6592 case Instruction::Store:
6593 case Instruction::Load: {
6594 ElementCount Width = VF;
6595 if (Width.isVector()) {
6596 InstWidening Decision = getWideningDecision(I, Width);
6597 assert(Decision != CM_Unknown &&
6598 "CM decision should be taken at this point");
6601 if (Decision == CM_Scalarize)
6602 Width = ElementCount::getFixed(1);
6603 }
6604 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6605 return getMemoryInstructionCost(I, VF);
6606 }
6607 case Instruction::BitCast:
6608 if (I->getType()->isPointerTy())
6609 return 0;
6610 [[fallthrough]];
6611 case Instruction::ZExt:
6612 case Instruction::SExt:
6613 case Instruction::FPToUI:
6614 case Instruction::FPToSI:
6615 case Instruction::FPExt:
6616 case Instruction::PtrToInt:
6617 case Instruction::IntToPtr:
6618 case Instruction::SIToFP:
6619 case Instruction::UIToFP:
6620 case Instruction::Trunc:
6621 case Instruction::FPTrunc: {
6622 // Computes the CastContextHint from a Load/Store instruction.
6623 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6624 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6625 "Expected a load or a store!");
6626
6627 if (VF.isScalar() || !TheLoop->contains(I))
6629
6630 switch (getWideningDecision(I, VF)) {
6642 llvm_unreachable("Instr did not go through cost modelling?");
6645 llvm_unreachable_internal("Instr has invalid widening decision");
6646 }
6647
6648 llvm_unreachable("Unhandled case!");
6649 };
6650
6651 unsigned Opcode = I->getOpcode();
6653 // For Trunc, the context is the only user, which must be a StoreInst.
6654 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6655 if (I->hasOneUse())
6656 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6657 CCH = ComputeCCH(Store);
6658 }
6659 // For Z/Sext, the context is the operand, which must be a LoadInst.
6660 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6661 Opcode == Instruction::FPExt) {
6662 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6663 CCH = ComputeCCH(Load);
6664 }
6665
6666 // We optimize the truncation of induction variables having constant
6667 // integer steps. The cost of these truncations is the same as the scalar
6668 // operation.
6669 if (isOptimizableIVTruncate(I, VF)) {
6670 auto *Trunc = cast<TruncInst>(I);
6671 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6672 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6673 }
6674
6675 // Detect reduction patterns
6676 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6677 return *RedCost;
6678
6679 Type *SrcScalarTy = I->getOperand(0)->getType();
6680 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6681 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6682 SrcScalarTy =
6683 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6684 Type *SrcVecTy =
6685 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6686
6688 // If the result type is <= the source type, there will be no extend
6689 // after truncating the users to the minimal required bitwidth.
6690 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6691 (I->getOpcode() == Instruction::ZExt ||
6692 I->getOpcode() == Instruction::SExt))
6693 return 0;
6694 }
6695
6696 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6697 }
6698 case Instruction::Call:
6699 return getVectorCallCost(cast<CallInst>(I), VF);
6700 case Instruction::ExtractValue:
6702 case Instruction::Alloca:
6703 // We cannot easily widen alloca to a scalable alloca, as
6704 // the result would need to be a vector of pointers.
6705 if (VF.isScalable())
6706       return InstructionCost::getInvalid();
6707     [[fallthrough]];
6708 default:
6709 // This opcode is unknown. Assume that it is the same as 'mul'.
6710 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6711 } // end of switch.
6712}
6713
6715 // Ignore ephemeral values.
6717
6718 SmallVector<Value *, 4> DeadInterleavePointerOps;
6720
6721 // If a scalar epilogue is required, users outside the loop won't use
6722 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6723 // that is the case.
6724 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6725 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6726 return RequiresScalarEpilogue &&
6727 !TheLoop->contains(cast<Instruction>(U)->getParent());
6728 };
6729
6731 DFS.perform(LI);
6732 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6733 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6734 for (Instruction &I : reverse(*BB)) {
6735 // Find all stores to invariant variables. Since they are going to sink
6736       // outside the loop, we do not need to calculate the cost for them.
6737 StoreInst *SI;
6738 if ((SI = dyn_cast<StoreInst>(&I)) &&
6739 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6740 ValuesToIgnore.insert(&I);
6741 auto I = DeadInvariantStoreOps.insert({SI->getPointerOperand(), {}});
6742 I.first->second.push_back(SI->getValueOperand());
6743 }
6744
6745 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6746 continue;
6747
6748 // Add instructions that would be trivially dead and are only used by
6749       // values already ignored to DeadOps, to seed the worklist.
6751 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6752 return VecValuesToIgnore.contains(U) ||
6753 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6754 }))
6755 DeadOps.push_back(&I);
6756
6757 // For interleave groups, we only create a pointer for the start of the
6758 // interleave group. Queue up addresses of group members except the insert
6759 // position for further processing.
6760 if (isAccessInterleaved(&I)) {
6761 auto *Group = getInterleavedAccessGroup(&I);
6762 if (Group->getInsertPos() == &I)
6763 continue;
6764 Value *PointerOp = getLoadStorePointerOperand(&I);
6765 DeadInterleavePointerOps.push_back(PointerOp);
6766 }
6767
6768 // Queue branches for analysis. They are dead, if their successors only
6769 // contain dead instructions.
6770 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6771 if (Br->isConditional())
6772 DeadOps.push_back(&I);
6773 }
6774 }
6775
6776 // Mark ops feeding interleave group members as free, if they are only used
6777 // by other dead computations.
6778 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6779 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6780 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6781 Instruction *UI = cast<Instruction>(U);
6782 return !VecValuesToIgnore.contains(U) &&
6783 (!isAccessInterleaved(UI) ||
6784 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6785 }))
6786 continue;
6787 VecValuesToIgnore.insert(Op);
6788 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6789 }
6790
6791 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6792 for (Value *Op : ArrayRef(Ops).drop_back())
6793 DeadOps.push_back(Op);
6794 }
6795 // Mark ops that would be trivially dead and are only used by ignored
6796 // instructions as free.
6797 BasicBlock *Header = TheLoop->getHeader();
6798
6799 // Returns true if the block contains only dead instructions. Such blocks will
6800 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6801 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6802 auto IsEmptyBlock = [this](BasicBlock *BB) {
6803 return all_of(*BB, [this](Instruction &I) {
6804 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6805 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6806 });
6807 };
6808 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6809 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6810
6811 // Check if the branch should be considered dead.
6812 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6813 BasicBlock *ThenBB = Br->getSuccessor(0);
6814 BasicBlock *ElseBB = Br->getSuccessor(1);
6815 bool ThenEmpty = IsEmptyBlock(ThenBB);
6816 bool ElseEmpty = IsEmptyBlock(ElseBB);
6817 if ((ThenEmpty && ElseEmpty) ||
6818 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6819 ElseBB->phis().empty()) ||
6820 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6821 ThenBB->phis().empty())) {
6822 VecValuesToIgnore.insert(Br);
6823 DeadOps.push_back(Br->getCondition());
6824 }
6825 continue;
6826 }
6827
6828 // Skip any op that shouldn't be considered dead.
6829 if (!Op || !TheLoop->contains(Op) ||
6830 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6832 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6833 return !VecValuesToIgnore.contains(U) &&
6834 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6835 }))
6836 continue;
6837
6838 if (!TheLoop->contains(Op->getParent()))
6839 continue;
6840
6841     // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore,
6842     // which applies to both scalar and vector versions. Otherwise it is only
6843 // dead in vector versions, so only add it to VecValuesToIgnore.
6844 if (all_of(Op->users(),
6845 [this](User *U) { return ValuesToIgnore.contains(U); }))
6846 ValuesToIgnore.insert(Op);
6847
6848 VecValuesToIgnore.insert(Op);
6849 DeadOps.append(Op->op_begin(), Op->op_end());
6850 }
6851
6852 // Ignore type-promoting instructions we identified during reduction
6853 // detection.
6854 for (const auto &Reduction : Legal->getReductionVars()) {
6855 const RecurrenceDescriptor &RedDes = Reduction.second;
6856 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6857 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6858 }
6859 // Ignore type-casting instructions we identified during induction
6860 // detection.
6861 for (const auto &Induction : Legal->getInductionVars()) {
6862 const InductionDescriptor &IndDes = Induction.second;
6863 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6864 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6865 }
6866}
6867
6869 for (const auto &Reduction : Legal->getReductionVars()) {
6870 PHINode *Phi = Reduction.first;
6871 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6872
6873 // We don't collect reductions that are type promoted (yet).
6874 if (RdxDesc.getRecurrenceType() != Phi->getType())
6875 continue;
6876
6877 // If the target would prefer this reduction to happen "in-loop", then we
6878 // want to record it as such.
6879 unsigned Opcode = RdxDesc.getOpcode();
6880 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6881 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6883 continue;
6884
6885 // Check that we can correctly put the reductions into the loop, by
6886 // finding the chain of operations that leads from the phi to the loop
6887 // exit value.
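    // For illustration (hypothetical IR): for a sum reduction
    //   %phi  = phi i32 [ 0, %ph ], [ %add2, %latch ]
    //   %add1 = add i32 %phi, %a
    //   %add2 = add i32 %add1, %b
    // the chain is {%add1, %add2}; each link is recorded below so those adds
    // can be costed as part of an in-loop reduction.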
6888 SmallVector<Instruction *, 4> ReductionOperations =
6889 RdxDesc.getReductionOpChain(Phi, TheLoop);
6890 bool InLoop = !ReductionOperations.empty();
6891
6892 if (InLoop) {
6893 InLoopReductions.insert(Phi);
6894 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6895 Instruction *LastChain = Phi;
6896 for (auto *I : ReductionOperations) {
6897 InLoopReductionImmediateChains[I] = LastChain;
6898 LastChain = I;
6899 }
6900 }
6901 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6902 << " reduction for phi: " << *Phi << "\n");
6903 }
6904}
6905
6907 DebugLoc DL, const Twine &Name) {
6908   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
6909          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6910 return tryInsertInstruction(
6911 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6912}
6913
6914// This function will select a scalable VF if the target supports scalable
6915// vectors and a fixed one otherwise.
6916// TODO: we could return a pair of values that specify the max VF and
6917// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6918 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6919// doesn't have a cost model that can choose which plan to execute if
6920// more than one is generated.
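// Illustrative sizing (hypothetical numbers): with 128-bit vector registers
// and a widest loop type of 32 bits, this returns VF = 128 / 32 = 4, marked
// scalable (i.e. vscale x 4) when the register size itself is scalable.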
6923 unsigned WidestType;
6924 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6925
6930
6932 unsigned N = RegSize.getKnownMinValue() / WidestType;
6933 return ElementCount::get(N, RegSize.isScalable());
6934}
6935
6938 ElementCount VF = UserVF;
6939 // Outer loop handling: They may require CFG and instruction level
6940 // transformations before even evaluating whether vectorization is profitable.
6941 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6942 // the vectorization pipeline.
6943 if (!OrigLoop->isInnermost()) {
6944 // If the user doesn't provide a vectorization factor, determine a
6945 // reasonable one.
6946 if (UserVF.isZero()) {
6947 VF = determineVPlanVF(TTI, CM);
6948 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6949
6950 // Make sure we have a VF > 1 for stress testing.
6951 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6952 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6953 << "overriding computed VF.\n");
6954 VF = ElementCount::getFixed(4);
6955 }
6956 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6958 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6959 << "not supported by the target.\n");
6961 "Scalable vectorization requested but not supported by the target",
6962 "the scalable user-specified vectorization width for outer-loop "
6963 "vectorization cannot be used because the target does not support "
6964 "scalable vectors.",
6965 "ScalableVFUnfeasible", ORE, OrigLoop);
6967 }
6968 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6970 "VF needs to be a power of two");
6971 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6972 << "VF " << VF << " to build VPlans.\n");
6973 buildVPlans(VF, VF);
6974
6975 // For VPlan build stress testing, we bail out after VPlan construction.
6978
6979 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6980 }
6981
6982 LLVM_DEBUG(
6983 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6984 "VPlan-native path.\n");
6986}
6987
6988std::optional<VectorizationFactor>
6990 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6993
6994 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6995   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
6996 return std::nullopt;
6997
6998   // Invalidate interleave groups if all blocks of the loop will be predicated.
6999 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7001 LLVM_DEBUG(
7002 dbgs()
7003 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7004 "which requires masked-interleaved support.\n");
7006 // Invalidating interleave groups also requires invalidating all decisions
7007 // based on them, which includes widening decisions and uniform and scalar
7008 // values.
7010 }
7011
7012 if (CM.foldTailByMasking())
7014
7015 ElementCount MaxUserVF =
7016 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7017 if (UserVF) {
7018 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7020 "UserVF ignored because it may be larger than the maximal safe VF",
7021 "InvalidUserVF", ORE, OrigLoop);
7022 } else {
7024 "VF needs to be a power of two");
7025 // Collect the instructions (and their associated costs) that will be more
7026 // profitable to scalarize.
7028 if (CM.selectUserVectorizationFactor(UserVF)) {
7029 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7030 buildVPlansWithVPRecipes(UserVF, UserVF);
7031 if (!hasPlanWithVF(UserVF)) {
7033 << "LV: No VPlan could be built for " << UserVF << ".\n");
7034 return std::nullopt;
7035 }
7036
7038 return {{UserVF, 0, 0}};
7039 } else
7040 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7041 "InvalidCost", ORE, OrigLoop);
7042 }
7043 }
7044
7045 // Collect the Vectorization Factor Candidates.
7046 SmallVector<ElementCount> VFCandidates;
7047 for (auto VF = ElementCount::getFixed(1);
7048 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7049 VFCandidates.push_back(VF);
7050 for (auto VF = ElementCount::getScalable(1);
7051 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7052 VFCandidates.push_back(VF);
7053
7055 for (const auto &VF : VFCandidates) {
7056 // Collect Uniform and Scalar instructions after vectorization with VF.
7058
7059 // Collect the instructions (and their associated costs) that will be more
7060 // profitable to scalarize.
7061 if (VF.isVector())
7063 }
7064
7065 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7066 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7067
7069 if (VPlans.empty())
7070 return std::nullopt;
7071 if (all_of(VPlans,
7072 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
7074
7075 // Select the optimal vectorization factor according to the legacy cost-model.
7076 // This is now only used to verify the decisions by the new VPlan-based
7077 // cost-model and will be retired once the VPlan-based cost-model is
7078 // stabilized.
7079 VectorizationFactor VF = selectVectorizationFactor();
7080 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7081 if (!hasPlanWithVF(VF.Width)) {
7082 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7083 << ".\n");
7084 return std::nullopt;
7085 }
7086 return VF;
7087}
7088
7090 ElementCount VF) const {
7091 return CM.getInstructionCost(UI, VF);
7092}
7093
7094bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7095 return CM.ValuesToIgnore.contains(UI) ||
7096 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7097 SkipCostComputation.contains(UI);
7098}
7099
7100InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7101 ElementCount VF) const {
7103 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7104 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
7105 LLVMCtx, CM);
7106
7107 // Cost modeling for inductions is inaccurate in the legacy cost model
7108   // compared to the recipes that are generated. To match here initially during
7109   // VPlan cost-model bring-up, directly use the induction costs from the legacy
7110 // cost model. Note that we do this as pre-processing; the VPlan may not have
7111 // any recipes associated with the original induction increment instruction
7112 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7113 // the cost of induction phis and increments (both that are represented by
7114 // recipes and those that are not), to avoid distinguishing between them here,
7115 // and skip all recipes that represent induction phis and increments (the
7116 // former case) later on, if they exist, to avoid counting them twice.
7117 // Similarly we pre-compute the cost of any optimized truncates.
7118 // TODO: Switch to more accurate costing based on VPlan.
7119 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7120 Instruction *IVInc = cast<Instruction>(
7121 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7122 SmallVector<Instruction *> IVInsts = {IVInc};
7123 for (unsigned I = 0; I != IVInsts.size(); I++) {
7124 for (Value *Op : IVInsts[I]->operands()) {
7125 auto *OpI = dyn_cast<Instruction>(Op);
7126 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7127 continue;
7128 IVInsts.push_back(OpI);
7129 }
7130 }
7131 IVInsts.push_back(IV);
7132 for (User *U : IV->users()) {
7133 auto *CI = cast<Instruction>(U);
7134 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7135 continue;
7136 IVInsts.push_back(CI);
7137 }
7138 for (Instruction *IVInst : IVInsts) {
7139 if (!CostCtx.SkipCostComputation.insert(IVInst).second)
7140 continue;
7141 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7142 LLVM_DEBUG({
7143 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7144 << ": induction instruction " << *IVInst << "\n";
7145 });
7146 Cost += InductionCost;
7147 }
7148 }
7149
7150 /// Compute the cost of all exiting conditions of the loop using the legacy
7151 /// cost model. This is to match the legacy behavior, which adds the cost of
7152 /// all exit conditions. Note that this over-estimates the cost, as there will
7153 /// be a single condition to control the vector loop.
7155 CM.TheLoop->getExitingBlocks(Exiting);
7156 SetVector<Instruction *> ExitInstrs;
7157 // Collect all exit conditions.
7158 for (BasicBlock *EB : Exiting) {
7159 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7160 if (!Term)
7161 continue;
7162 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7163 ExitInstrs.insert(CondI);
7164 }
7165 }
7166 // Compute the cost of all instructions only feeding the exit conditions.
7167 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7168 Instruction *CondI = ExitInstrs[I];
7169 if (!OrigLoop->contains(CondI) ||
7170 !CostCtx.SkipCostComputation.insert(CondI).second)
7171 continue;
7172 Cost += CostCtx.getLegacyCost(CondI, VF);
7173 for (Value *Op : CondI->operands()) {
7174 auto *OpI = dyn_cast<Instruction>(Op);
7175 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7176 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7177 !ExitInstrs.contains(cast<Instruction>(U));
7178 }))
7179 continue;
7180 ExitInstrs.insert(OpI);
7181 }
7182 }
7183
7184 // The legacy cost model has special logic to compute the cost of in-loop
7185 // reductions, which may be smaller than the sum of all instructions involved
7186 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7187 // which the legacy cost model uses to assign cost. Pre-compute their costs
7188 // for now.
7189 // TODO: Switch to costing based on VPlan once the logic has been ported.
7190 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7191 if (!CM.isInLoopReduction(RedPhi) &&
7193 RdxDesc.getRecurrenceKind()))
7194 continue;
7195
7196 // AnyOf reduction codegen may remove the select. To match the legacy cost
7197 // model, pre-compute the cost for AnyOf reductions here.
7199 RdxDesc.getRecurrenceKind())) {
7200 auto *Select = cast<SelectInst>(*find_if(
7201 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7202 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7203 "reduction op visited multiple times");
7204 CostCtx.SkipCostComputation.insert(Select);
7205 auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7206 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7207 << ":\n any-of reduction " << *Select << "\n");
7208 Cost += ReductionCost;
7209 continue;
7210 }
7211
7212 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7213 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7214 ChainOps.end());
7215 // Also include the operands of instructions in the chain, as the cost-model
7216 // may mark extends as free.
7217 for (auto *ChainOp : ChainOps) {
7218 for (Value *Op : ChainOp->operands()) {
7219 if (auto *I = dyn_cast<Instruction>(Op))
7220 ChainOpsAndOperands.insert(I);
7221 }
7222 }
7223
7224 // Pre-compute the cost for I, if it has a reduction pattern cost.
7225 for (Instruction *I : ChainOpsAndOperands) {
7226 auto ReductionCost = CM.getReductionPatternCost(
7227 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7228 if (!ReductionCost)
7229 continue;
7230
7231 assert(!CostCtx.SkipCostComputation.contains(I) &&
7232 "reduction op visited multiple times");
7233 CostCtx.SkipCostComputation.insert(I);
7234 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7235 << ":\n in-loop reduction " << *I << "\n");
7236 Cost += *ReductionCost;
7237 }
7238 }
7239
7240 // Pre-compute the costs for branches except for the backedge, as the number
7241 // of replicate regions in a VPlan may not directly match the number of
7242 // branches, which would lead to different decisions.
7243 // TODO: Compute cost of branches for each replicate region in the VPlan,
7244 // which is more accurate than the legacy cost model.
7245 for (BasicBlock *BB : OrigLoop->blocks()) {
7246 if (BB == OrigLoop->getLoopLatch())
7247 continue;
7248 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7249 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7250 Cost += BranchCost;
7251 }
7252 // Now compute and add the VPlan-based cost.
7253 Cost += Plan.cost(VF, CostCtx);
7254 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7255 return Cost;
7256}
7257
7259 // If there is a single VPlan with a single VF, return it directly.
7260 VPlan &FirstPlan = *VPlans[0];
7261 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7262 return *FirstPlan.vectorFactors().begin();
7263
7265 assert(hasPlanWithVF(ScalarVF) &&
7266 "More than a single plan/VF w/o any plan having scalar VF");
7267
7268 // TODO: Compute scalar cost using VPlan-based cost model.
7269 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7270 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7271 VectorizationFactor BestFactor = ScalarFactor;
7272
7273 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7274 if (ForceVectorization) {
7275 // Ignore scalar width, because the user explicitly wants vectorization.
7276 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7277 // evaluation.
7278 BestFactor.Cost = InstructionCost::getMax();
7279 }
7280
7281 for (auto &P : VPlans) {
7282 for (ElementCount VF : P->vectorFactors()) {
7283 if (VF.isScalar())
7284 continue;
7285 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7286 LLVM_DEBUG(
7287 dbgs()
7288 << "LV: Not considering vector loop of width " << VF
7289 << " because it will not generate any vector instructions.\n");
7290 continue;
7291 }
7292
7293 InstructionCost Cost = cost(*P, VF);
7294 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7295 if (isMoreProfitable(CurrentFactor, BestFactor))
7296 BestFactor = CurrentFactor;
7297
7298 // If profitable add it to ProfitableVF list.
7299 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7300 ProfitableVFs.push_back(CurrentFactor);
7301 }
7302 }
7303 return BestFactor.Width;
7304}
7305
7308 // Reserve first location for self reference to the LoopID metadata node.
7309 MDs.push_back(nullptr);
7310 bool IsUnrollMetadata = false;
7311 MDNode *LoopID = L->getLoopID();
7312 if (LoopID) {
7313 // First find existing loop unrolling disable metadata.
7314 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7315 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7316 if (MD) {
7317 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7318 IsUnrollMetadata =
7319 S && S->getString().starts_with("llvm.loop.unroll.disable");
7320 }
7321 MDs.push_back(LoopID->getOperand(i));
7322 }
7323 }
7324
7325 if (!IsUnrollMetadata) {
7326 // Add runtime unroll disable metadata.
7327 LLVMContext &Context = L->getHeader()->getContext();
7328 SmallVector<Metadata *, 1> DisableOperands;
7329 DisableOperands.push_back(
7330 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7331 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7332 MDs.push_back(DisableNode);
7333 MDNode *NewLoopID = MDNode::get(Context, MDs);
7334 // Set operand 0 to refer to the loop id itself.
7335 NewLoopID->replaceOperandWith(0, NewLoopID);
7336 L->setLoopID(NewLoopID);
7337 }
7338}
7339
7340 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7341// create a merge phi node for it and add it to \p ReductionResumeValues.
7343 VPInstruction *RedResult,
7345 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7346 bool VectorizingEpilogue) {
7347 if (!RedResult ||
7349 return;
7350
7351 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7352 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7353
7354 Value *FinalValue =
7355 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7356 auto *ResumePhi =
7357 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7358 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7359 RdxDesc.getRecurrenceKind())) {
7360 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7361 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7362 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7363 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7364 }
7365 assert((!VectorizingEpilogue || ResumePhi) &&
7366 "when vectorizing the epilogue loop, we need a resume phi from main "
7367 "vector loop");
7368
7369 // TODO: bc.merge.rdx should not be created here, instead it should be
7370 // modeled in VPlan.
7371 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7372 // Create a phi node that merges control-flow from the backedge-taken check
7373 // block and the middle block.
7374 auto *BCBlockPhi =
7375 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7376 LoopScalarPreHeader->getTerminator()->getIterator());
7377
7378 // If we are fixing reductions in the epilogue loop then we should already
7379 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7380 // we carry over the incoming values correctly.
7381 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7382 if (Incoming == LoopMiddleBlock)
7383 BCBlockPhi->addIncoming(FinalValue, Incoming);
7384 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7385 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7386 Incoming);
7387 else
7388 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7389 }
7390
7391 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7392 // TODO: This fixup should instead be modeled in VPlan.
7393 // Fix the scalar loop reduction variable with the incoming reduction sum
7394 // from the vector body and from the backedge value.
7395 int IncomingEdgeBlockIdx =
7396 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7397 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7398 // Pick the other block.
7399 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7400 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7401 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7402 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7403
7404 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7405}
7406
7407std::pair<DenseMap<const SCEV *, Value *>,
7410 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7411 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7412 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7413 assert(BestVPlan.hasVF(BestVF) &&
7414 "Trying to execute plan with unsupported VF");
7415 assert(BestVPlan.hasUF(BestUF) &&
7416 "Trying to execute plan with unsupported UF");
7417 assert(
7418 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7419 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7420 (void)IsEpilogueVectorization;
7421
7422 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7423
7424 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7425 << ", UF=" << BestUF << '\n');
7426 BestVPlan.setName("Final VPlan");
7427 LLVM_DEBUG(BestVPlan.dump());
7428
7429 // Perform the actual loop transformation.
7430 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7431 OrigLoop->getHeader()->getContext());
7432
7433 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7434 // before making any changes to the CFG.
7435 if (!BestVPlan.getPreheader()->empty()) {
7436 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7438 BestVPlan.getPreheader()->execute(&State);
7439 }
7440 if (!ILV.getTripCount())
7441 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7442 else
7443 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7444 "count during epilogue vectorization");
7445
7446 // 1. Set up the skeleton for vectorization, including vector pre-header and
7447 // middle block. The vector loop is created during VPlan execution.
7448 Value *CanonicalIVStartValue;
7449 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7450 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7451 : State.ExpandedSCEVs);
7452#ifdef EXPENSIVE_CHECKS
7453 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7454#endif
7455
7456 // Only use noalias metadata when using memory checks guaranteeing no overlap
7457 // across all iterations.
7458 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7459 std::unique_ptr<LoopVersioning> LVer = nullptr;
7460 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7462
7463 // We currently don't use LoopVersioning for the actual loop cloning but we
7464 // still use it to add the noalias metadata.
7465 // TODO: Find a better way to re-use LoopVersioning functionality to add
7466 // metadata.
7467 LVer = std::make_unique<LoopVersioning>(
7468 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7469 PSE.getSE());
7470 State.LVer = &*LVer;
7472 }
7473
7475
7476 //===------------------------------------------------===//
7477 //
7478     // Notice: any optimization or new instruction that goes
7479 // into the code below should also be implemented in
7480 // the cost-model.
7481 //
7482 //===------------------------------------------------===//
7483
7484 // 2. Copy and widen instructions from the old loop into the new loop.
7485 BestVPlan.prepareToExecute(ILV.getTripCount(),
7486 ILV.getOrCreateVectorTripCount(nullptr),
7487 CanonicalIVStartValue, State);
7488
7489 BestVPlan.execute(&State);
7490
7491 // 2.5 Collect reduction resume values.
7493 auto *ExitVPBB =
7494 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7495 for (VPRecipeBase &R : *ExitVPBB) {
7497 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7498 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7499 }
7500
7501 // 2.6. Maintain Loop Hints
7502 // Keep all loop hints from the original loop on the vector loop (we'll
7503 // replace the vectorizer-specific hints below).
7504 MDNode *OrigLoopID = OrigLoop->getLoopID();
7505
7506 std::optional<MDNode *> VectorizedLoopID =
7509
7510 VPBasicBlock *HeaderVPBB =
7512 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7513 if (VectorizedLoopID)
7514 L->setLoopID(*VectorizedLoopID);
7515 else {
7516 // Keep all loop hints from the original loop on the vector loop (we'll
7517 // replace the vectorizer-specific hints below).
7518 if (MDNode *LID = OrigLoop->getLoopID())
7519 L->setLoopID(LID);
7520
7521 LoopVectorizeHints Hints(L, true, *ORE);
7522 Hints.setAlreadyVectorized();
7523 }
7525 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7526 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7528
7529 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7530 // predication, updating analyses.
7531 ILV.fixVectorizedLoop(State, BestVPlan);
7532
7534
7535 // 4. Adjust branch weight of the branch in the middle block.
7536 auto *MiddleTerm =
7537 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7538 if (MiddleTerm->isConditional() &&
7539 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7540 // Assume that `Count % VectorTripCount` is equally distributed.
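    // Worked example (hypothetical): with VF = 4 and UF = 2 the vector step is
    // 8, so the weights below are {1, 7}: under the equal-distribution
    // assumption each value of Count % 8 is expected in 1 of 8 cases.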
7541 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7542 assert(TripCount > 0 && "trip count should not be zero");
7543 const uint32_t Weights[] = {1, TripCount - 1};
7544 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7545 }
7546
7547 return {State.ExpandedSCEVs, ReductionResumeValues};
7548}
7549
7550//===--------------------------------------------------------------------===//
7551// EpilogueVectorizerMainLoop
7552//===--------------------------------------------------------------------===//
7553
7554/// This function is partially responsible for generating the control flow
7555/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7556std::pair<BasicBlock *, Value *>
7558 const SCEV2ValueTy &ExpandedSCEVs) {
7560
7561 // Generate the code to check the minimum iteration count of the vector
7562 // epilogue (see below).
7566
7567 // Generate the code to check any assumptions that we've made for SCEV
7568 // expressions.
7570
7571 // Generate the code that checks at runtime if arrays overlap. We put the
7572 // checks into a separate block to make the more common case of few elements
7573 // faster.
7575
7576 // Generate the iteration count check for the main loop, *after* the check
7577 // for the epilogue loop, so that the path-length is shorter for the case
7578   // that goes directly through the vector epilogue. The longer path length for
7579   // the main loop is compensated for by the gain from vectorizing the larger
7580 // trip count. Note: the branch will get updated later on when we vectorize
7581 // the epilogue.
7584
7585 // Generate the induction variable.
7587
7588 // Skip induction resume value creation here because they will be created in
7589 // the second pass for the scalar loop. The induction resume values for the
7590 // inductions in the epilogue loop are created before executing the plan for
7591 // the epilogue loop.
7592
7593 return {LoopVectorPreHeader, nullptr};
7594}
7595
7597 LLVM_DEBUG({
7598 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7599 << "Main Loop VF:" << EPI.MainLoopVF
7600 << ", Main Loop UF:" << EPI.MainLoopUF
7601 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7602 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7603 });
7604}
7605
7608 dbgs() << "intermediate fn:\n"
7609 << *OrigLoop->getHeader()->getParent() << "\n";
7610 });
7611}
7612
7613BasicBlock *
7615 bool ForEpilogue) {
7616 assert(Bypass && "Expected valid bypass basic block.");
7617 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7618 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7619 Value *Count = getTripCount();
7620 // Reuse existing vector loop preheader for TC checks.
7621 // Note that new preheader block is generated for vector loop.
7622 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7623 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7624
7625 // Generate code to check if the loop's trip count is less than VF * UF of the
7626 // main vector loop.
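  // E.g. (illustrative): for a main loop with VF = 8 and UF = 2 this emits a
  // "Count < 16" guard (or "<=" when a scalar epilogue is required) that
  // branches to Bypass for short trip counts.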
7627 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7628 : VF.isVector())
7631
7632 Value *CheckMinIters = Builder.CreateICmp(
7633 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7634 "min.iters.check");
7635
7636 if (!ForEpilogue)
7637 TCCheckBlock->setName("vector.main.loop.iter.check");
7638
7639 // Create new preheader for vector loop.
7640 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7641 DT, LI, nullptr, "vector.ph");
7642
7643 if (ForEpilogue) {
7644 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7645 DT->getNode(Bypass)->getIDom()) &&
7646 "TC check is expected to dominate Bypass");
7647
7648 // Update dominator for Bypass.
7649 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7650 LoopBypassBlocks.push_back(TCCheckBlock);
7651
7652 // Save the trip count so we don't have to regenerate it in the
7653 // vec.epilog.iter.check. This is safe to do because the trip count
7654 // generated here dominates the vector epilog iter check.
7655 EPI.TripCount = Count;
7656 }
7657
7658 BranchInst &BI =
7659 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7661 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7662 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7663
7664 return TCCheckBlock;
7665}
7666
7667//===--------------------------------------------------------------------===//
7668// EpilogueVectorizerEpilogueLoop
7669//===--------------------------------------------------------------------===//
7670
7671/// This function is partially responsible for generating the control flow
7672/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7673std::pair<BasicBlock *, Value *>
7675 const SCEV2ValueTy &ExpandedSCEVs) {
7676 createVectorLoopSkeleton("vec.epilog.");
7677
7678  // Now, compare the remaining count and, if there aren't enough iterations to
7679  // execute the vectorized epilogue, skip to the scalar part.
7680 LoopVectorPreHeader->setName("vec.epilog.ph");
7681 BasicBlock *VecEpilogueIterationCountCheck =
7683 nullptr, "vec.epilog.iter.check", true);
7685 VecEpilogueIterationCountCheck);
7686
7687 // Adjust the control flow taking the state info from the main loop
7688 // vectorization into account.
7690 "expected this to be saved from the previous pass.");
7692 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7693
7696
7698 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7699
7700 if (EPI.SCEVSafetyCheck)
7702 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7703 if (EPI.MemSafetyCheck)
7705 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7706
7708 VecEpilogueIterationCountCheck,
7709 VecEpilogueIterationCountCheck->getSinglePredecessor());
7710
7713 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7714 // If there is an epilogue which must run, there's no edge from the
7715 // middle block to exit blocks and thus no need to update the immediate
7716 // dominator of the exit blocks.
7719
7720 // Keep track of bypass blocks, as they feed start values to the induction and
7721 // reduction phis in the scalar loop preheader.
7722 if (EPI.SCEVSafetyCheck)
7724 if (EPI.MemSafetyCheck)
7727
7728 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7729 // reductions which merge control-flow from the latch block and the middle
7730 // block. Update the incoming values here and move the Phi into the preheader.
7731 SmallVector<PHINode *, 4> PhisInBlock;
7732 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7733 PhisInBlock.push_back(&Phi);
7734
7735 for (PHINode *Phi : PhisInBlock) {
7736 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7737 Phi->replaceIncomingBlockWith(
7738 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7739 VecEpilogueIterationCountCheck);
7740
7741 // If the phi doesn't have an incoming value from the
7742 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7743 // value and also those from other check blocks. This is needed for
7744 // reduction phis only.
7745 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7746 return EPI.EpilogueIterationCountCheck == IncB;
7747 }))
7748 continue;
7749 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7750 if (EPI.SCEVSafetyCheck)
7751 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7752 if (EPI.MemSafetyCheck)
7753 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7754 }
7755
7756 // Generate a resume induction for the vector epilogue and put it in the
7757 // vector epilogue preheader
7758 Type *IdxTy = Legal->getWidestInductionType();
7759 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7761 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7762 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7764
7765 // Generate induction resume values. These variables save the new starting
7766 // indexes for the scalar loop. They are used to test if there are any tail
7767 // iterations left once the vector loop has completed.
7768 // Note that when the vectorized epilogue is skipped due to iteration count
7769 // check, then the resume value for the induction variable comes from
7770 // the trip count of the main vector loop, hence passing the AdditionalBypass
7771 // argument.
7772 createInductionResumeValues(ExpandedSCEVs,
7773 {VecEpilogueIterationCountCheck,
7774 EPI.VectorTripCount} /* AdditionalBypass */);
7775
7776 return {LoopVectorPreHeader, EPResumeVal};
7777}
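
// For orientation, the two skeleton passes together produce roughly the
// following chain of blocks (names are illustrative; see
// https://llvm.org/docs/Vectorizers.html#epilogue-vectorization for the
// authoritative diagram): "iter.check" and "vector.main.loop.iter.check" guard
// the main vector loop; after its middle block, "vec.epilog.iter.check"
// decides whether enough iterations remain to enter "vec.epilog.ph" and the
// epilogue vector loop, or whether to fall through to "scalar.ph" and the
// remainder scalar loop.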
7778
7779BasicBlock *
7781 BasicBlock *Bypass, BasicBlock *Insert) {
7782
7784          "Expected trip count to have been saved in the first pass.");
7785 assert(
7786 (!isa<Instruction>(EPI.TripCount) ||
7787 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7788 "saved trip count does not dominate insertion point.");
7789 Value *TC = EPI.TripCount;
7790 IRBuilder<> Builder(Insert->getTerminator());
7791 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7792
7793 // Generate code to check if the loop's trip count is less than VF * UF of the
7794 // vector epilogue loop.
7795 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7798
7799 Value *CheckMinIters =
7800 Builder.CreateICmp(P, Count,
7803 "min.epilog.iters.check");
7804
7805 BranchInst &BI =
7806 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7808 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7809 unsigned EpilogueLoopStep =
7811    // We assume the remaining `Count` is equally distributed in
7812    // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
7813    // should be
7814    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
7815 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7816 const uint32_t Weights[] = {EstimatedSkipCount,
7817 MainLoopStep - EstimatedSkipCount};
7818 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7819 }
7820 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7821 LoopBypassBlocks.push_back(Insert);
7822 return Insert;
7823}
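
// Worked example of the weight estimate above, with made-up step values: if
// the main loop has MainLoopStep = VF * UF = 8 and the epilogue loop has
// EpilogueLoopStep = 4, the remaining Count is assumed uniform in [0, 8), so
// P(Count < 4) = min(8, 4) / 8 = 1/2 and the resulting branch weights are
// {4, 4}, i.e. the epilogue is skipped about half the time.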
7824
7826 LLVM_DEBUG({
7827 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7828 << "Epilogue Loop VF:" << EPI.EpilogueVF
7829 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7830 });
7831}
7832
7835 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7836 });
7837}
7838
7839iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7841 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7842 if (auto *I = dyn_cast<Instruction>(Op)) {
7843 if (auto *R = Ingredient2Recipe.lookup(I))
7844 return R->getVPSingleValue();
7845 }
7846 return Plan.getOrAddLiveIn(Op);
7847 };
7848 return map_range(Operands, Fn);
7849}
7850
7852 BasicBlock *Src = SI->getParent();
7853 assert(!OrigLoop->isLoopExiting(Src) &&
7854 all_of(successors(Src),
7855 [this](BasicBlock *Succ) {
7856 return OrigLoop->getHeader() != Succ;
7857 }) &&
7858 "unsupported switch either exiting loop or continuing to header");
7859  // Create masks where the terminator in Src is a switch. We create masks for
7860  // all edges at the same time. This is more efficient, as we can create and
7861  // collect the compares for all cases at once.
7862 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan);
7863 BasicBlock *DefaultDst = SI->getDefaultDest();
7865 for (auto &C : SI->cases()) {
7866 BasicBlock *Dst = C.getCaseSuccessor();
7867 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
7868 // Cases whose destination is the same as default are redundant and can be
7869 // ignored - they will get there anyhow.
7870 if (Dst == DefaultDst)
7871 continue;
7872 auto I = Dst2Compares.insert({Dst, {}});
7873 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan);
7874 I.first->second.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
7875 }
7876
7877 // We need to handle 2 separate cases below for all entries in Dst2Compares,
7878 // which excludes destinations matching the default destination.
7879 VPValue *SrcMask = getBlockInMask(Src);
7880 VPValue *DefaultMask = nullptr;
7881 for (const auto &[Dst, Conds] : Dst2Compares) {
7882 // 1. Dst is not the default destination. Dst is reached if any of the cases
7883 // with destination == Dst are taken. Join the conditions for each case
7884 // whose destination == Dst using an OR.
7885 VPValue *Mask = Conds[0];
7886 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
7887 Mask = Builder.createOr(Mask, V);
7888 if (SrcMask)
7889 Mask = Builder.createLogicalAnd(SrcMask, Mask);
7890 EdgeMaskCache[{Src, Dst}] = Mask;
7891
7892    // 2. Accumulate the mask for the default destination, which is reached if
7893    // none of the cases with destination != default destination are taken. The
7894    // masks of all such cases are joined using an OR and negated after the
7895    // loop.
7896 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
7897 }
7898
7899 if (DefaultMask) {
7900 DefaultMask = Builder.createNot(DefaultMask);
7901 if (SrcMask)
7902 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
7903 }
7904 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
7905}
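
// Illustrative example of the masks built above (names are made up): for
//   switch i32 %c, label %dflt [ i32 0, label %a
//                                i32 1, label %a
//                                i32 2, label %b ]
// with block-in mask M for Src, the cached edge masks are equivalent to
//   mask(Src -> a)    = M && ((%c == 0) || (%c == 1))
//   mask(Src -> b)    = M && (%c == 2)
//   mask(Src -> dflt) = M && !(mask(Src -> a) || mask(Src -> b))
// where && denotes the poison-safe logical and created by createLogicalAnd.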
7906
7908 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7909
7910 // Look for cached value.
7911 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7912 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7913 if (ECEntryIt != EdgeMaskCache.end())
7914 return ECEntryIt->second;
7915
7916 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
7918 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
7919 return EdgeMaskCache[Edge];
7920 }
7921
7922 VPValue *SrcMask = getBlockInMask(Src);
7923
7924 // The terminator has to be a branch inst!
7925 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7926 assert(BI && "Unexpected terminator found");
7927 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7928 return EdgeMaskCache[Edge] = SrcMask;
7929
7930 // If source is an exiting block, we know the exit edge is dynamically dead
7931 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7932 // adding uses of an otherwise potentially dead instruction.
7933 if (OrigLoop->isLoopExiting(Src))
7934 return EdgeMaskCache[Edge] = SrcMask;
7935
7936 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7937 assert(EdgeMask && "No Edge Mask found for condition");
7938
7939 if (BI->getSuccessor(0) != Dst)
7940 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7941
7942 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7943 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7944 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7945 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7946 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7947 }
7948
7949 return EdgeMaskCache[Edge] = EdgeMask;
7950}
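
// Illustrative example for the conditional-branch case (names made up): for
//   br i1 %c, label %then, label %else
// with block-in mask M for Src, the masks cached here are
//   mask(Src -> then) = select i1 M, i1 %c, i1 false
//   mask(Src -> else) = select i1 M, i1 %c.not, i1 false
// where %c.not is the negation of %c; using the select-based logical and
// rather than a plain 'and' avoids turning a poison %c into poison in the
// result when M is false.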
7951
7953 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7954
7955 // Look for cached value.
7956 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7957 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7958 assert(ECEntryIt != EdgeMaskCache.end() &&
7959 "looking up mask for edge which has not been created");
7960 return ECEntryIt->second;
7961}
7962
7964 BasicBlock *Header = OrigLoop->getHeader();
7965
7966 // When not folding the tail, use nullptr to model all-true mask.
7967 if (!CM.foldTailByMasking()) {
7968 BlockMaskCache[Header] = nullptr;
7969 return;
7970 }
7971
7972 // Introduce the early-exit compare IV <= BTC to form header block mask.
7973 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7974 // constructing the desired canonical IV in the header block as its first
7975 // non-phi instructions.
7976
7977 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7978 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7979 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7980 HeaderVPBB->insert(IV, NewInsertionPoint);
7981
7982 VPBuilder::InsertPointGuard Guard(Builder);
7983 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7984 VPValue *BlockMask = nullptr;
7986 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7987 BlockMaskCache[Header] = BlockMask;
7988}
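
// Worked example of the header mask, with made-up values: for trip count
// TC = 10, BTC = TC - 1 = 9 and VF = 4, the widened canonical IV and the
// resulting mask are
//   iteration 0: IV = <0,1,2,3>    -> icmp ule BTC -> <1,1,1,1>
//   iteration 1: IV = <4,5,6,7>    -> icmp ule BTC -> <1,1,1,1>
//   iteration 2: IV = <8,9,10,11>  -> icmp ule BTC -> <1,1,0,0>
// so only lanes corresponding to real scalar iterations are enabled. BTC is
// used because a trip count of 2^N would wrap to 0 in N-bit arithmetic, while
// BTC = 2^N - 1 is still representable.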
7989
7991 // Return the cached value.
7992 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7993 assert(BCEntryIt != BlockMaskCache.end() &&
7994 "Trying to access mask for block without one.");
7995 return BCEntryIt->second;
7996}
7997
7999 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8000 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8001 assert(OrigLoop->getHeader() != BB &&
8002 "Loop header must have cached block mask");
8003
8004 // All-one mask is modelled as no-mask following the convention for masked
8005 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8006 VPValue *BlockMask = nullptr;
8007 // This is the block mask. We OR all unique incoming edges.
8008 for (auto *Predecessor :
8010 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8011 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8012 BlockMaskCache[BB] = EdgeMask;
8013 return;
8014 }
8015
8016 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8017 BlockMask = EdgeMask;
8018 continue;
8019 }
8020
8021 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8022 }
8023
8024 BlockMaskCache[BB] = BlockMask;
8025}
8026
8028VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8029 VFRange &Range) {
8030 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8031 "Must be called with either a load or store");
8032
8033 auto willWiden = [&](ElementCount VF) -> bool {
8035 CM.getWideningDecision(I, VF);
8037 "CM decision should be taken at this point.");
8039 return true;
8040 if (CM.isScalarAfterVectorization(I, VF) ||
8041 CM.isProfitableToScalarize(I, VF))
8042 return false;
8044 };
8045
8047 return nullptr;
8048
8049 VPValue *Mask = nullptr;
8050 if (Legal->isMaskRequired(I))
8051 Mask = getBlockInMask(I->getParent());
8052
8053 // Determine if the pointer operand of the access is either consecutive or
8054 // reverse consecutive.
8056 CM.getWideningDecision(I, Range.Start);
8058 bool Consecutive =
8060
8061 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8062 if (Consecutive) {
8063 auto *GEP = dyn_cast<GetElementPtrInst>(
8064 Ptr->getUnderlyingValue()->stripPointerCasts());
8065 auto *VectorPtr = new VPVectorPointerRecipe(
8066 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8067 I->getDebugLoc());
8068 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8069 Ptr = VectorPtr;
8070 }
8071 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8072 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8073 I->getDebugLoc());
8074
8075 StoreInst *Store = cast<StoreInst>(I);
8076 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8077 Reverse, I->getDebugLoc());
8078}
8079
8080/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8081/// insert a recipe to expand the step for the induction recipe.
8084 VPValue *Start, const InductionDescriptor &IndDesc,
8085 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8086 assert(IndDesc.getStartValue() ==
8087 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8088 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8089 "step must be loop invariant");
8090
8091 VPValue *Step =
8093 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8094 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8095 }
8096 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8097 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8098}
8099
8100VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8102
8103 // Check if this is an integer or fp induction. If so, build the recipe that
8104 // produces its scalar and vector values.
8105 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8106 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8107 *PSE.getSE(), *OrigLoop);
8108
8109 // Check if this is pointer induction. If so, build the recipe for it.
8110 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8111 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8112 *PSE.getSE());
8114 Phi, Operands[0], Step, *II,
8116 [&](ElementCount VF) {
8117 return CM.isScalarAfterVectorization(Phi, VF);
8118 },
8119 Range));
8120 }
8121 return nullptr;
8122}
8123
8124VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8126 // Optimize the special case where the source is a constant integer
8127 // induction variable. Notice that we can only optimize the 'trunc' case
8128 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8129 // (c) other casts depend on pointer size.
8130
8131 // Determine whether \p K is a truncation based on an induction variable that
8132 // can be optimized.
8133 auto isOptimizableIVTruncate =
8134 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8135 return [=](ElementCount VF) -> bool {
8136 return CM.isOptimizableIVTruncate(K, VF);
8137 };
8138 };
8139
8141 isOptimizableIVTruncate(I), Range)) {
8142
8143 auto *Phi = cast<PHINode>(I->getOperand(0));
8145 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8146 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8147 *OrigLoop);
8148 }
8149 return nullptr;
8150}
8151
8152VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8154 unsigned NumIncoming = Phi->getNumIncomingValues();
8155
8156 // We know that all PHIs in non-header blocks are converted into selects, so
8157 // we don't have to worry about the insertion order and we can just use the
8158 // builder. At this point we generate the predication tree. There may be
8159 // duplications since this is a simple recursive scan, but future
8160 // optimizations will clean it up.
8161 // TODO: At the moment the first mask is always skipped, but it would be
8162 // better to skip the most expensive mask.
8163 SmallVector<VPValue *, 2> OperandsWithMask;
8164
8165 for (unsigned In = 0; In < NumIncoming; In++) {
8166 OperandsWithMask.push_back(Operands[In]);
8167 VPValue *EdgeMask =
8168 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8169 if (!EdgeMask) {
8170 assert(In == 0 && "Both null and non-null edge masks found");
8172 "Distinct incoming values with one having a full mask");
8173 break;
8174 }
8175 if (In == 0)
8176 continue;
8177 OperandsWithMask.push_back(EdgeMask);
8178 }
8179 return new VPBlendRecipe(Phi, OperandsWithMask);
8180}
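
// Illustrative example (names made up): a phi
//   %p = phi [ %v0, %bb0 ], [ %v1, %bb1 ], [ %v2, %bb2 ]
// becomes BLEND %v0, %v1/mask1, %v2/mask2, which is later lowered to a select
// chain equivalent to
//   select(mask2, %v2, select(mask1, %v1, %v0))
// The mask of the first incoming value can be skipped because exactly one
// incoming edge is taken, so the first value serves as the fall-through.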
8181
8182VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8184 VFRange &Range) {
8186 [this, CI](ElementCount VF) {
8187 return CM.isScalarWithPredication(CI, VF);
8188 },
8189 Range);
8190
8191 if (IsPredicated)
8192 return nullptr;
8193
8195 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8196 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8197 ID == Intrinsic::pseudoprobe ||
8198 ID == Intrinsic::experimental_noalias_scope_decl))
8199 return nullptr;
8200
8201 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8202 Ops.push_back(Operands.back());
8203
8204 // Is it beneficial to perform intrinsic call compared to lib call?
8205 bool ShouldUseVectorIntrinsic =
8207 [&](ElementCount VF) -> bool {
8208 return CM.getCallWideningDecision(CI, VF).Kind ==
8210 },
8211 Range);
8212 if (ShouldUseVectorIntrinsic)
8213 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8214 CI->getDebugLoc());
8215
8216 Function *Variant = nullptr;
8217 std::optional<unsigned> MaskPos;
8218  // Is it better to call a vectorized version of the function than to
8219  // scalarize the call?
8220 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8221 [&](ElementCount VF) -> bool {
8222 // The following case may be scalarized depending on the VF.
8223        // The flag shows whether we can use a usual Call for the vectorized
8224        // version of the instruction.
8225
8226 // If we've found a variant at a previous VF, then stop looking. A
8227 // vectorized variant of a function expects input in a certain shape
8228 // -- basically the number of input registers, the number of lanes
8229 // per register, and whether there's a mask required.
8230 // We store a pointer to the variant in the VPWidenCallRecipe, so
8231 // once we have an appropriate variant it's only valid for that VF.
8232 // This will force a different vplan to be generated for each VF that
8233 // finds a valid variant.
8234 if (Variant)
8235 return false;
8237 CM.getCallWideningDecision(CI, VF);
8239 Variant = Decision.Variant;
8240 MaskPos = Decision.MaskPos;
8241 return true;
8242 }
8243
8244 return false;
8245 },
8246 Range);
8247 if (ShouldUseVectorCall) {
8248 if (MaskPos.has_value()) {
8249 // We have 2 cases that would require a mask:
8250 // 1) The block needs to be predicated, either due to a conditional
8251 // in the scalar loop or use of an active lane mask with
8252 // tail-folding, and we use the appropriate mask for the block.
8253 // 2) No mask is required for the block, but the only available
8254 // vector variant at this VF requires a mask, so we synthesize an
8255 // all-true mask.
8256 VPValue *Mask = nullptr;
8257 if (Legal->isMaskRequired(CI))
8258 Mask = getBlockInMask(CI->getParent());
8259 else
8261 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8262
8263 Ops.insert(Ops.begin() + *MaskPos, Mask);
8264 }
8265
8266 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8268 Variant);
8269 }
8270
8271 return nullptr;
8272}
8273
8274bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8275 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8276 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8277 // Instruction should be widened, unless it is scalar after vectorization,
8278 // scalarization is profitable or it is predicated.
8279 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8280 return CM.isScalarAfterVectorization(I, VF) ||
8281 CM.isProfitableToScalarize(I, VF) ||
8282 CM.isScalarWithPredication(I, VF);
8283 };
8285 Range);
8286}
8287
8288VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8290 VPBasicBlock *VPBB) {
8291 switch (I->getOpcode()) {
8292 default:
8293 return nullptr;
8294 case Instruction::SDiv:
8295 case Instruction::UDiv:
8296 case Instruction::SRem:
8297 case Instruction::URem: {
8298 // If not provably safe, use a select to form a safe divisor before widening the
8299 // div/rem operation itself. Otherwise fall through to general handling below.
8300 if (CM.isPredicatedInst(I)) {
8302 VPValue *Mask = getBlockInMask(I->getParent());
8303 VPValue *One =
8304 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8305 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8306 Ops[1] = SafeRHS;
8307 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8308 }
8309 [[fallthrough]];
8310 }
8311 case Instruction::Add:
8312 case Instruction::And:
8313 case Instruction::AShr:
8314 case Instruction::FAdd:
8315 case Instruction::FCmp:
8316 case Instruction::FDiv:
8317 case Instruction::FMul:
8318 case Instruction::FNeg:
8319 case Instruction::FRem:
8320 case Instruction::FSub:
8321 case Instruction::ICmp:
8322 case Instruction::LShr:
8323 case Instruction::Mul:
8324 case Instruction::Or:
8325 case Instruction::Select:
8326 case Instruction::Shl:
8327 case Instruction::Sub:
8328 case Instruction::Xor:
8329 case Instruction::Freeze:
8330 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8331 };
8332}
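
// Illustrative sketch of the safe-divisor transform above, with made-up
// operands: a predicated 'udiv' in a block with mask %m is widened roughly as
//   %safe.rhs = select <4 x i1> %m, <4 x i32> %rhs,
//                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %wide.div = udiv <4 x i32> %lhs, %safe.rhs
// so masked-off lanes divide by 1 instead of a potentially zero divisor,
// keeping the widened division safe to execute unconditionally.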
8333
8335 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8336 for (VPHeaderPHIRecipe *R : PhisToFix) {
8337 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8338 VPRecipeBase *IncR =
8339 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8340 R->addOperand(IncR->getVPSingleValue());
8341 }
8342}
8343
8345 VFRange &Range) {
8347 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8348 Range);
8349
8350 bool IsPredicated = CM.isPredicatedInst(I);
8351
8352 // Even if the instruction is not marked as uniform, there are certain
8353 // intrinsic calls that can be effectively treated as such, so we check for
8354 // them here. Conservatively, we only do this for scalable vectors, since
8355 // for fixed-width VFs we can always fall back on full scalarization.
8356 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8357 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8358 case Intrinsic::assume:
8359 case Intrinsic::lifetime_start:
8360 case Intrinsic::lifetime_end:
8361 // For scalable vectors if one of the operands is variant then we still
8362 // want to mark as uniform, which will generate one instruction for just
8363 // the first lane of the vector. We can't scalarize the call in the same
8364 // way as for fixed-width vectors because we don't know how many lanes
8365 // there are.
8366 //
8367 // The reasons for doing it this way for scalable vectors are:
8368    //  1. For the assume intrinsic, generating the instruction for the first
8369    //     lane is still better than not generating any at all. For
8370    //     example, the input may be a splat across all lanes.
8371 // 2. For the lifetime start/end intrinsics the pointer operand only
8372 // does anything useful when the input comes from a stack object,
8373 // which suggests it should always be uniform. For non-stack objects
8374 // the effect is to poison the object, which still allows us to
8375 // remove the call.
8376 IsUniform = true;
8377 break;
8378 default:
8379 break;
8380 }
8381 }
8382 VPValue *BlockInMask = nullptr;
8383 if (!IsPredicated) {
8384 // Finalize the recipe for Instr, first if it is not predicated.
8385 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8386 } else {
8387 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8388 // Instructions marked for predication are replicated and a mask operand is
8389 // added initially. Masked replicate recipes will later be placed under an
8390 // if-then construct to prevent side-effects. Generate recipes to compute
8391 // the block mask for this region.
8392 BlockInMask = getBlockInMask(I->getParent());
8393 }
8394
8395 // Note that there is some custom logic to mark some intrinsics as uniform
8396 // manually above for scalable vectors, which this assert needs to account for
8397 // as well.
8398 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8399 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8400 "Should not predicate a uniform recipe");
8401 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8402 IsUniform, BlockInMask);
8403 return Recipe;
8404}
8405
8409 VFRange &Range, VPBasicBlock *VPBB) {
8410 // First, check for specific widening recipes that deal with inductions, Phi
8411 // nodes, calls and memory operations.
8412 VPRecipeBase *Recipe;
8413 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8414 if (Phi->getParent() != OrigLoop->getHeader())
8415 return tryToBlend(Phi, Operands);
8416
8417 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8418 return Recipe;
8419
8420 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8421 assert((Legal->isReductionVariable(Phi) ||
8422 Legal->isFixedOrderRecurrence(Phi)) &&
8423 "can only widen reductions and fixed-order recurrences here");
8424 VPValue *StartV = Operands[0];
8425 if (Legal->isReductionVariable(Phi)) {
8426 const RecurrenceDescriptor &RdxDesc =
8427 Legal->getReductionVars().find(Phi)->second;
8428 assert(RdxDesc.getRecurrenceStartValue() ==
8429 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8430 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8431 CM.isInLoopReduction(Phi),
8432 CM.useOrderedReductions(RdxDesc));
8433 } else {
8434 // TODO: Currently fixed-order recurrences are modeled as chains of
8435 // first-order recurrences. If there are no users of the intermediate
8436 // recurrences in the chain, the fixed order recurrence should be modeled
8437 // directly, enabling more efficient codegen.
8438 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8439 }
8440
8441 PhisToFix.push_back(PhiRecipe);
8442 return PhiRecipe;
8443 }
8444
8445 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8446 cast<TruncInst>(Instr), Operands, Range)))
8447 return Recipe;
8448
8449 // All widen recipes below deal only with VF > 1.
8451 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8452 return nullptr;
8453
8454 if (auto *CI = dyn_cast<CallInst>(Instr))
8455 return tryToWidenCall(CI, Operands, Range);
8456
8457 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8458 return tryToWidenMemory(Instr, Operands, Range);
8459
8460 if (!shouldWiden(Instr, Range))
8461 return nullptr;
8462
8463 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8464 return new VPWidenGEPRecipe(GEP,
8465 make_range(Operands.begin(), Operands.end()));
8466
8467 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8468 return new VPWidenSelectRecipe(
8469 *SI, make_range(Operands.begin(), Operands.end()));
8470 }
8471
8472 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8473 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8474 *CI);
8475 }
8476
8477 return tryToWiden(Instr, Operands, VPBB);
8478}
8479
8480void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8481 ElementCount MaxVF) {
8482 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8483
8484 auto MaxVFTimes2 = MaxVF * 2;
8485 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8486 VFRange SubRange = {VF, MaxVFTimes2};
8487 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8488 // Now optimize the initial VPlan.
8489 if (!Plan->hasVF(ElementCount::getFixed(1)))
8491 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8492 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8493 // TODO: try to put it close to addActiveLaneMask().
8494 // Discard the plan if it is not EVL-compatible
8495 if (CM.foldTailWithEVL() &&
8497 break;
8498 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8499 VPlans.push_back(std::move(Plan));
8500 }
8501 VF = SubRange.End;
8502 }
8503}
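
// Worked example of the range iteration above, with made-up bounds: for
// MinVF = 4 and MaxVF = 16 the candidate range is [4, 32). If the first call
// to tryToBuildVPlanWithVPRecipes clamps SubRange.End to 8, that plan covers
// only VF = 4; the loop then continues with [8, 32) and so on, so each VPlan
// ends up covering a maximal sub-range of VFs for which all widening
// decisions agree.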
8504
8505// Add the necessary canonical IV and branch recipes required to control the
8506// loop.
8507static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8508 DebugLoc DL) {
8509 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8510 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8511
8512 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8513 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8514 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8515 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8516 Header->insert(CanonicalIVPHI, Header->begin());
8517
8518 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8519 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8520 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8521 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8522 "index.next");
8523 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8524
8525 // Add the BranchOnCount VPInstruction to the latch.
8527 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8528}
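
// In VPlan's textual form the recipes added above look roughly like the
// following (value names are illustrative, and 'nuw' is only present when
// HasNUW is true):
//   vector.body:
//     EMIT vp<%iv> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
//     ...
//     EMIT vp<%index.next> = add nuw vp<%iv>, vp<%VFxUF>
//     EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>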
8529
8530// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8531// original exit block.
8533 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8535 auto MiddleVPBB =
8536 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
8537 // No edge from the middle block to the unique exit block has been inserted
8538  // and there is nothing to fix from the vector loop; phis should have incoming
8539  // values from the scalar loop only.
8540 if (MiddleVPBB->getNumSuccessors() != 2)
8541 return;
8542
8543 // Introduce VPUsers modeling the exit values.
8544 BasicBlock *ExitBB =
8545 cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock();
8546 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8547 for (PHINode &ExitPhi : ExitBB->phis()) {
8548 Value *IncomingValue =
8549 ExitPhi.getIncomingValueForBlock(ExitingBB);
8550 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8551 // Exit values for inductions are computed and updated outside of VPlan and
8552 // independent of induction recipes.
8553 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8554 // live-outs.
8555 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8556 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8557 isa<VPWidenPointerInductionRecipe>(V) ||
8558 (isa<Instruction>(IncomingValue) &&
8559 any_of(IncomingValue->users(), [&Inductions](User *U) {
8560 auto *P = dyn_cast<PHINode>(U);
8561 return P && Inductions.contains(P);
8562 })))
8563 continue;
8564 Plan.addLiveOut(&ExitPhi, V);
8565 }
8566}
8567
8568/// Feed a resume value for every FOR from the vector loop to the scalar loop,
8569/// if the middle block branches to the scalar preheader, by introducing
8570/// ExtractFromEnd and ResumePhi recipes in each, respectively, and a VPLiveOut
8571/// which uses the latter and corresponds to the scalar header.
8573 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8574
8575  // Start by finding out if the middle block branches to the scalar preheader,
8576  // which, unlike the exit block (the other possible successor of the middle
8577  // block), is not a VPIRBasicBlock.
8578 // TODO: Should be replaced by
8579 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8580 // scalar region is modeled as well.
8581 VPBasicBlock *ScalarPHVPBB = nullptr;
8582 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8583 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8584 if (isa<VPIRBasicBlock>(Succ))
8585 continue;
8586 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8587 ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8588 }
8589 if (!ScalarPHVPBB)
8590 return;
8591
8592 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8593 VPBuilder MiddleBuilder(MiddleVPBB);
8594  // Reset the insert point so that new recipes are inserted before the
8595  // terminator, and before its condition if that is also defined in this block.
8596 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8597 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8598 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8599 "Condition expected in MiddleVPBB");
8600 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8601 }
8602 VPValue *OneVPV = Plan.getOrAddLiveIn(
8603 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8604
8605 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8606 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8607 if (!FOR)
8608 continue;
8609
8610 // Extract the resume value and create a new VPLiveOut for it.
8611 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8612 {FOR->getBackedgeValue(), OneVPV},
8613 {}, "vector.recur.extract");
8614 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8615 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8616 "scalar.recur.init");
8617 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8618 }
8619}
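
// Illustrative example (made-up source loop): for a first-order recurrence
//   for (i = 0; i < n; i++) { use(prev); prev = a[i]; }
// the recipes added above extract the last element of the vectorized 'prev'
// at the end of the vector loop (ExtractFromEnd with offset 1) and feed it
// through a ResumePhi in the scalar preheader, so any remaining scalar
// iterations observe the same 'prev' value as in the original loop.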
8620
8622LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8623
8625
8626 // ---------------------------------------------------------------------------
8627 // Build initial VPlan: Scan the body of the loop in a topological order to
8628 // visit each basic block after having visited its predecessor basic blocks.
8629 // ---------------------------------------------------------------------------
8630
8631 // Create initial VPlan skeleton, having a basic block for the pre-header
8632 // which contains SCEV expansions that need to happen before the CFG is
8633 // modified; a basic block for the vector pre-header, followed by a region for
8634 // the vector loop, followed by the middle basic block. The skeleton vector
8635 // loop region contains a header and latch basic blocks.
8636
8637 bool RequiresScalarEpilogueCheck =
8639 [this](ElementCount VF) {
8640 return !CM.requiresScalarEpilogue(VF.isVector());
8641 },
8642 Range);
8644 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8645 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8646 OrigLoop);
8647
8648  // Don't use getDecisionAndClampRange here, because we don't know the UF, so
8649  // it is better to be conservative here rather than to split the range up
8650  // into different VPlans.
8651 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8652 bool IVUpdateMayOverflow = false;
8653 for (ElementCount VF : Range)
8654 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8655
8657 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8658 // When not folding the tail, we know that the induction increment will not
8659 // overflow.
8660 bool HasNUW = Style == TailFoldingStyle::None;
8661 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8662
8663 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8664
8665 // ---------------------------------------------------------------------------
8666 // Pre-construction: record ingredients whose recipes we'll need to further
8667 // process after constructing the initial VPlan.
8668 // ---------------------------------------------------------------------------
8669
8670 // For each interleave group which is relevant for this (possibly trimmed)
8671 // Range, add it to the set of groups to be later applied to the VPlan and add
8672 // placeholders for its members' Recipes which we'll be replacing with a
8673 // single VPInterleaveRecipe.
8675 auto applyIG = [IG, this](ElementCount VF) -> bool {
8676 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8677 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8679 // For scalable vectors, the only interleave factor currently supported
8680 // is 2 since we require the (de)interleave2 intrinsics instead of
8681 // shufflevectors.
8682 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8683 "Unsupported interleave factor for scalable vectors");
8684 return Result;
8685 };
8686 if (!getDecisionAndClampRange(applyIG, Range))
8687 continue;
8688 InterleaveGroups.insert(IG);
8689 };
8690
8691 // ---------------------------------------------------------------------------
8692 // Construct recipes for the instructions in the loop
8693 // ---------------------------------------------------------------------------
8694
8695 // Scan the body of the loop in a topological order to visit each basic block
8696 // after having visited its predecessor basic blocks.
8697 LoopBlocksDFS DFS(OrigLoop);
8698 DFS.perform(LI);
8699
8700 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8701 VPBasicBlock *VPBB = HeaderVPBB;
8702 BasicBlock *HeaderBB = OrigLoop->getHeader();
8703 bool NeedsMasks =
8704 CM.foldTailByMasking() ||
8705 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8706 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8707 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8708 });
8709 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8710 // Relevant instructions from basic block BB will be grouped into VPRecipe
8711 // ingredients and fill a new VPBasicBlock.
8712 if (VPBB != HeaderVPBB)
8713 VPBB->setName(BB->getName());
8714 Builder.setInsertPoint(VPBB);
8715
8716 if (VPBB == HeaderVPBB)
8717 RecipeBuilder.createHeaderMask();
8718 else if (NeedsMasks)
8719 RecipeBuilder.createBlockInMask(BB);
8720
8721 // Introduce each ingredient into VPlan.
8722 // TODO: Model and preserve debug intrinsics in VPlan.
8723 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8724 Instruction *Instr = &I;
8726 auto *Phi = dyn_cast<PHINode>(Instr);
8727 if (Phi && Phi->getParent() == HeaderBB) {
8728 Operands.push_back(Plan->getOrAddLiveIn(
8729 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8730 } else {
8731 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8732 Operands = {OpRange.begin(), OpRange.end()};
8733 }
8734
8735    // Invariant stores inside the loop will be deleted and a single store
8736    // with the final reduction value will be added to the exit block.
8737 StoreInst *SI;
8738 if ((SI = dyn_cast<StoreInst>(&I)) &&
8739 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8740 continue;
8741
8742 VPRecipeBase *Recipe =
8743 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8744 if (!Recipe)
8745 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8746
8747 RecipeBuilder.setRecipe(Instr, Recipe);
8748 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8749 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8750 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8751 // recipes and need to be moved to the phi section of HeaderVPBB:
8752 // * tail-folding (non-phi recipes computing the header mask are
8753 // introduced earlier than regular header phi recipes, and should appear
8754 // after them)
8755 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8756
8757 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8758 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8759 "unexpected recipe needs moving");
8760 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8761 } else
8762 VPBB->appendRecipe(Recipe);
8763 }
8764
8766 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8767 }
8768
8769 // After here, VPBB should not be used.
8770 VPBB = nullptr;
8771
8772 addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
8773 Legal->getInductionVars());
8774
8775 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8776 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8777 "entry block must be set to a VPRegionBlock having a non-empty entry "
8778 "VPBasicBlock");
8779 RecipeBuilder.fixHeaderPhis();
8780
8782
8783 // ---------------------------------------------------------------------------
8784 // Transform initial VPlan: Apply previously taken decisions, in order, to
8785 // bring the VPlan to its final state.
8786 // ---------------------------------------------------------------------------
8787
8788 // Adjust the recipes for any inloop reductions.
8789 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8790
8791 // Interleave memory: for each Interleave Group we marked earlier as relevant
8792 // for this VPlan, replace the Recipes widening its memory instructions with a
8793 // single VPInterleaveRecipe at its insertion point.
8794 for (const auto *IG : InterleaveGroups) {
8795 auto *Recipe =
8796 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8797 SmallVector<VPValue *, 4> StoredValues;
8798 for (unsigned i = 0; i < IG->getFactor(); ++i)
8799 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8800 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8801 StoredValues.push_back(StoreR->getStoredValue());
8802 }
8803
8804 bool NeedsMaskForGaps =
8805 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8806 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8807 "masked interleaved groups are not allowed.");
8808 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8809 Recipe->getMask(), NeedsMaskForGaps);
8810 VPIG->insertBefore(Recipe);
8811 unsigned J = 0;
8812 for (unsigned i = 0; i < IG->getFactor(); ++i)
8813 if (Instruction *Member = IG->getMember(i)) {
8814 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8815 if (!Member->getType()->isVoidTy()) {
8816 VPValue *OriginalV = MemberR->getVPSingleValue();
8817 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8818 J++;
8819 }
8820 MemberR->eraseFromParent();
8821 }
8822 }
8823
8824 for (ElementCount VF : Range)
8825 Plan->addVF(VF);
8826 Plan->setName("Initial VPlan");
8827
8828  // Replace VPValues for known constant strides guaranteed by predicated
8829  // scalar evolution.
8830 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8831 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8832 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8833 // Only handle constant strides for now.
8834 if (!ScevStride)
8835 continue;
8836
8837 auto *CI = Plan->getOrAddLiveIn(
8838 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8839 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8840 StrideVPV->replaceAllUsesWith(CI);
8841
8842 // The versioned value may not be used in the loop directly but through a
8843 // sext/zext. Add new live-ins in those cases.
8844 for (Value *U : StrideV->users()) {
8845 if (!isa<SExtInst, ZExtInst>(U))
8846 continue;
8847 VPValue *StrideVPV = Plan->getLiveIn(U);
8848 if (!StrideVPV)
8849 continue;
8850 unsigned BW = U->getType()->getScalarSizeInBits();
8851 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8852 : ScevStride->getAPInt().zext(BW);
8853 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8854 StrideVPV->replaceAllUsesWith(CI);
8855 }
8856 }
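
  // Illustrative example (made-up names): if the loop indexes a[i * %stride]
  // and versioning guarantees %stride == 1 at runtime, the live-in VPValue for
  // %stride is replaced by the constant 1 above; a widened use such as
  //   %stride.ext = zext i32 %stride to i64
  // is likewise redirected to a 64-bit constant-1 live-in, so later VPlan
  // simplifications can treat the access as consecutive.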
8857
8859 return Legal->blockNeedsPredication(BB);
8860 });
8861
8862 // Sink users of fixed-order recurrence past the recipe defining the previous
8863 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8865 return nullptr;
8866
8867 if (useActiveLaneMask(Style)) {
8868 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8869 // TailFoldingStyle is visible there.
8870 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8871 bool WithoutRuntimeCheck =
8873 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8874 WithoutRuntimeCheck);
8875 }
8876 return Plan;
8877}
8878
8879VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8880 // Outer loop handling: They may require CFG and instruction level
8881 // transformations before even evaluating whether vectorization is profitable.
8882 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8883 // the vectorization pipeline.
8884 assert(!OrigLoop->isInnermost());
8885 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8886
8887 // Create new empty VPlan
8888 auto Plan = VPlan::createInitialVPlan(
8889 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8890 *PSE.getSE(), true, false, OrigLoop);
8891
8892 // Build hierarchical CFG
8893 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8894 HCFGBuilder.buildHierarchicalCFG();
8895
8896 for (ElementCount VF : Range)
8897 Plan->addVF(VF);
8898
8900 Plan,
8901 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8902 *PSE.getSE(), *TLI);
8903
8904 // Remove the existing terminator of the exiting block of the top-most region.
8905 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8906 auto *Term =
8907 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8908 Term->eraseFromParent();
8909
8910 // Tail folding is not supported for outer loops, so the induction increment
8911 // is guaranteed to not wrap.
8912 bool HasNUW = true;
8913 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8914 DebugLoc());
8915 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8916 return Plan;
8917}
8918
8919// Adjust the recipes for reductions. For in-loop reductions the chain of
8920// instructions leading from the loop exit instr to the phi needs to be converted
8921// to reductions, with one operand being vector and the other being the scalar
8922// reduction chain. For other reductions, a select is introduced between the phi
8923// and live-out recipes when folding the tail.
8924//
8925// A ComputeReductionResult recipe is added to the middle block, also for
8926// in-loop reductions which compute their result in-loop, because generating
8927// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8928//
8929// Adjust AnyOf reductions; replace the reduction phi for the selected value
8930// with a boolean reduction phi node to check if the condition is true in any
8931// iteration. The final value is selected by the final ComputeReductionResult.
8932void LoopVectorizationPlanner::adjustRecipesForReductions(
8933 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8934 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8935 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8936  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8937  // sunk outside of the loop keep the same order as they had in the
8938  // original loop.
8939 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8940 for (VPRecipeBase &R : Header->phis()) {
8941 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8942 ReductionPHIList.emplace_back(ReductionPhi);
8943 }
8944 bool HasIntermediateStore = false;
8945 stable_sort(ReductionPHIList,
8946 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8947 const VPReductionPHIRecipe *R2) {
8948 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8949 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8950 HasIntermediateStore |= IS1 || IS2;
8951
8952 // If neither of the recipes has an intermediate store, keep the
8953 // order the same.
8954 if (!IS1 && !IS2)
8955 return false;
8956
8957 // If only one of the recipes has an intermediate store, then
8958 // move it towards the beginning of the list.
8959 if (IS1 && !IS2)
8960 return true;
8961
8962 if (!IS1 && IS2)
8963 return false;
8964
8965 // If both recipes have an intermediate store, then the recipe
8966 // with the later store should be processed earlier. So it
8967 // should go to the beginning of the list.
8968 return DT->dominates(IS2, IS1);
8969 });
8970
8971 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8972 for (VPRecipeBase *R : ReductionPHIList)
8973 R->moveBefore(*Header, Header->getFirstNonPhi());
8974
8975 for (VPRecipeBase &R : Header->phis()) {
8976 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8977 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8978 continue;
8979
8980 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8981 RecurKind Kind = RdxDesc.getRecurrenceKind();
8983 "AnyOf reductions are not allowed for in-loop reductions");
8984
8985 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8987 Worklist.insert(PhiR);
8988 for (unsigned I = 0; I != Worklist.size(); ++I) {
8989 VPSingleDefRecipe *Cur = Worklist[I];
8990 for (VPUser *U : Cur->users()) {
8991 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8992 if (!UserRecipe) {
8993 assert(isa<VPLiveOut>(U) &&
8994 "U must either be a VPSingleDef or VPLiveOut");
8995 continue;
8996 }
8997 Worklist.insert(UserRecipe);
8998 }
8999 }
9000
9001 // Visit operation "Links" along the reduction chain top-down starting from
9002 // the phi until LoopExitValue. We keep track of the previous item
9003 // (PreviousLink) to tell which of the two operands of a Link will remain
9004 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9005    // the select instruction. Blend recipes of in-loop reduction phi's will
9006 // get folded to their non-phi operand, as the reduction recipe handles the
9007 // condition directly.
9008 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9009 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9010 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9011
9012 // Index of the first operand which holds a non-mask vector operand.
9013 unsigned IndexOfFirstOperand;
9014 // Recognize a call to the llvm.fmuladd intrinsic.
9015 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9016 VPValue *VecOp;
9017 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9018 if (IsFMulAdd) {
9019 assert(
9021 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9022 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9023 isa<VPWidenCallRecipe>(CurrentLink)) &&
9024 CurrentLink->getOperand(2) == PreviousLink &&
9025 "expected a call where the previous link is the added operand");
9026
9027 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9028 // need to create an fmul recipe (multiplying the first two operands of
9029 // the fmuladd together) to use as the vector operand for the fadd
9030 // reduction.
9031 VPInstruction *FMulRecipe = new VPInstruction(
9032 Instruction::FMul,
9033 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9034 CurrentLinkI->getFastMathFlags());
9035 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9036 VecOp = FMulRecipe;
9037 } else {
9038 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9039 if (PhiR->isInLoop() && Blend) {
9040 assert(Blend->getNumIncomingValues() == 2 &&
9041 "Blend must have 2 incoming values");
9042 if (Blend->getIncomingValue(0) == PhiR)
9043 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9044 else {
9045 assert(Blend->getIncomingValue(1) == PhiR &&
9046 "PhiR must be an operand of the blend");
9047 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9048 }
9049 continue;
9050 }
9051
9053 if (isa<VPWidenRecipe>(CurrentLink)) {
9054 assert(isa<CmpInst>(CurrentLinkI) &&
9055 "need to have the compare of the select");
9056 continue;
9057 }
9058 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9059 "must be a select recipe");
9060 IndexOfFirstOperand = 1;
9061 } else {
9062 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9063 "Expected to replace a VPWidenSC");
9064 IndexOfFirstOperand = 0;
9065 }
9066 // Note that for non-commutable operands (cmp-selects), the semantics of
9067 // the cmp-select are captured in the recurrence kind.
9068 unsigned VecOpId =
9069 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9070 ? IndexOfFirstOperand + 1
9071 : IndexOfFirstOperand;
9072 VecOp = CurrentLink->getOperand(VecOpId);
9073 assert(VecOp != PreviousLink &&
9074 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9075 (VecOpId - IndexOfFirstOperand)) ==
9076 PreviousLink &&
9077 "PreviousLink must be the operand other than VecOp");
9078 }
9079
9080 BasicBlock *BB = CurrentLinkI->getParent();
9081 VPValue *CondOp = nullptr;
9083 CondOp = RecipeBuilder.getBlockInMask(BB);
9084
9085 VPReductionRecipe *RedRecipe =
9086 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9087 CondOp, CM.useOrderedReductions(RdxDesc));
9088 // Append the recipe to the end of the VPBasicBlock because we need to
9089      // ensure that it comes after all of its inputs, including CondOp.
9090 // Note that this transformation may leave over dead recipes (including
9091 // CurrentLink), which will be cleaned by a later VPlan transform.
9092 LinkVPBB->appendRecipe(RedRecipe);
9093 CurrentLink->replaceAllUsesWith(RedRecipe);
9094 PreviousLink = RedRecipe;
9095 }
9096 }
9097 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9098 Builder.setInsertPoint(&*LatchVPBB->begin());
9099 VPBasicBlock *MiddleVPBB =
9100 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
9101 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9102 for (VPRecipeBase &R :
9103 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9104 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9105 if (!PhiR)
9106 continue;
9107
9108 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9109 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9110 // with a boolean reduction phi node to check if the condition is true in
9111 // any iteration. The final value is selected by the final
9112 // ComputeReductionResult.
9114 RdxDesc.getRecurrenceKind())) {
9115 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9116 return isa<VPWidenSelectRecipe>(U) ||
9117 (isa<VPReplicateRecipe>(U) &&
9118 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9119 Instruction::Select);
9120 }));
9121 VPValue *Cmp = Select->getOperand(0);
9122 // If the compare is checking the reduction PHI node, adjust it to check
9123 // the start value.
9124 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9125 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9126 if (CmpR->getOperand(I) == PhiR)
9127 CmpR->setOperand(I, PhiR->getStartValue());
9128 }
9129 VPBuilder::InsertPointGuard Guard(Builder);
9130 Builder.setInsertPoint(Select);
9131
9132 // If the true value of the select is the reduction phi, the new value is
9133 // selected if the negated condition is true in any iteration.
9134 if (Select->getOperand(1) == PhiR)
9135 Cmp = Builder.createNot(Cmp);
9136 VPValue *Or = Builder.createOr(PhiR, Cmp);
9137 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9138
9139 // Convert the reduction phi to operate on bools.
9140 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9141 OrigLoop->getHeader()->getContext())));
9142 }
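// Illustrative sketch: for a source loop like "if (a[i] > 3) r = 42;", the
// phi no longer carries the selected value; after this rewrite the vector
// loop only tracks an i1 "condition held in some iteration" value via the
// or above, and ComputeReductionResult selects between 42 and the start
// value of r in the middle block.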
9143
9144 // If tail is folded by masking, introduce selects between the phi
9145 // and the live-out instruction of each reduction, at the beginning of the
9146 // dedicated latch block.
9147 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9148 auto *NewExitingVPV = PhiR->getBackedgeValue();
9149 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9150 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9151 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9152 "reduction recipe must be defined before latch");
9153 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9154 std::optional<FastMathFlags> FMFs =
9155 PhiTy->isFloatingPointTy()
9156 ? std::make_optional(RdxDesc.getFastMathFlags())
9157 : std::nullopt;
9158 NewExitingVPV =
9159 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9160 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9161 return isa<VPInstruction>(&U) &&
9162 cast<VPInstruction>(&U)->getOpcode() ==
9163 VPInstruction::ComputeReductionResult;
9164 });
9165 if (PreferPredicatedReductionSelect ||
9166 TTI.preferPredicatedReductionSelect(
9167 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9168 TargetTransformInfo::ReductionFlags()))
9169 PhiR->setOperand(1, NewExitingVPV);
9170 }
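// Illustrative sketch: for a sum reduction whose update is
//   rdx.next = rdx + wide.load
// the select created above turns the exiting value into
//   rdx.next = header.mask ? (rdx + wide.load) : rdx
// so lanes masked off by tail folding keep the incoming value and do not
// perturb the final result.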
9171
9172 // If the vector reduction can be performed in a smaller type, we truncate
9173 // then extend the loop exit value to enable InstCombine to evaluate the
9174 // entire expression in the smaller type.
9175 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9176 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9178 RdxDesc.getRecurrenceKind())) {
9179 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9180 Type *RdxTy = RdxDesc.getRecurrenceType();
9181 auto *Trunc =
9182 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9183 auto *Extnd =
9184 RdxDesc.isSigned()
9185 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9186 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9187
9188 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9189 Extnd->insertAfter(Trunc);
9190 if (PhiR->getOperand(1) == NewExitingVPV)
9191 PhiR->setOperand(1, Extnd->getVPSingleValue());
9192 NewExitingVPV = Extnd;
9193 }
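// Illustrative sketch: if the phi is i32 but the recurrence descriptor says
// the reduction can be evaluated in i8, the exiting value is truncated to i8
// and sign- or zero-extended back to i32, which lets InstCombine narrow the
// whole reduction chain to i8 arithmetic.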
9194
9195 // We want code in the middle block to appear to execute on the location of
9196 // the scalar loop's latch terminator because: (a) it is all compiler
9197 // generated, (b) these instructions are always executed after evaluating
9198 // the latch conditional branch, and (c) other passes may add new
9199 // predecessors which terminate on this line. This is the easiest way to
9200 // ensure we don't accidentally cause an extra step back into the loop while
9201 // debugging.
9202 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9203
9204 // TODO: At the moment ComputeReductionResult also drives creation of the
9205 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9206 // even for in-loop reductions, until the reduction resume value handling is
9207 // also modeled in VPlan.
9208 auto *FinalReductionResult = new VPInstruction(
9209 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9210 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9211 OrigExitingVPV->replaceUsesWithIf(
9212 FinalReductionResult,
9213 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9214 }
9215
9217}
9218
9220 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9221
9222 // Fast-math-flags propagate from the original induction instruction.
9224 if (FPBinOp)
9225 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9226
9227 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9228 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9229 Value *DerivedIV = emitTransformedIndex(
9230 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9231 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9232 DerivedIV->setName("offset.idx");
9233 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9234
9235 State.set(this, DerivedIV, VPIteration(0, 0));
9236}
9237
9240 if (State.Instance) { // Generate a single instance.
9241 assert((State.VF.isScalar() || !isUniform()) &&
9242 "uniform recipe shouldn't be predicated");
9243 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9244 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9245 // Insert scalar instance packing it into a vector.
9246 if (State.VF.isVector() && shouldPack()) {
9247 // If we're constructing lane 0, initialize to start from poison.
9248 if (State.Instance->Lane.isFirstLane()) {
9249 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9250 Value *Poison = PoisonValue::get(
9251 VectorType::get(UI->getType(), State.VF));
9252 State.set(this, Poison, State.Instance->Part);
9253 }
9254 State.packScalarIntoVectorValue(this, *State.Instance);
9255 }
9256 return;
9257 }
9258
9259 if (IsUniform) {
9260 // If the recipe is uniform across all parts (instead of just per VF), only
9261 // generate a single instance.
9262 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9263 all_of(operands(), [](VPValue *Op) {
9264 return Op->isDefinedOutsideVectorRegions();
9265 })) {
9266 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9267 if (user_begin() != user_end()) {
9268 for (unsigned Part = 1; Part < State.UF; ++Part)
9269 State.set(this, State.get(this, VPIteration(0, 0)),
9270 VPIteration(Part, 0));
9271 }
9272 return;
9273 }
9274
9275 // Uniform within VL means we need to generate lane 0 only for each
9276 // unrolled copy.
9277 for (unsigned Part = 0; Part < State.UF; ++Part)
9278 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9279 return;
9280 }
9281
9282 // A store of a loop varying value to a uniform address only needs the last
9283 // copy of the store.
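// Illustrative example: for "*p = a[i];" with a loop-invariant pointer p,
// only the last lane of the last unrolled part is stored, since that is the
// value observable after the loop.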
9284 if (isa<StoreInst>(UI) &&
9286 auto Lane = VPLane::getLastLaneForVF(State.VF);
9287 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9288 State);
9289 return;
9290 }
9291
9292 // Generate scalar instances for all VF lanes of all UF parts.
9293 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9294 const unsigned EndLane = State.VF.getKnownMinValue();
9295 for (unsigned Part = 0; Part < State.UF; ++Part)
9296 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9297 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9298}
9299
9300/// Use all-true mask for reverse rather than actual mask, as it avoids a
9301/// dependence w/o affecting the result.
9303 Value *EVL, const Twine &Name) {
9304 VectorType *ValTy = cast<VectorType>(Operand->getType());
9305 Value *AllTrueMask =
9306 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9307 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9308 {Operand, AllTrueMask, EVL}, nullptr, Name);
9309}
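// Rough shape of the emitted call for a fixed <4 x i32> operand (names and
// types here are illustrative only):
//   %rev = call <4 x i32> @llvm.experimental.vp.reverse.v4i32(
//              <4 x i32> %operand, <4 x i1> splat (i1 true), i32 %evl)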
9310
9312 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9313 "explicit vector length.");
9314 auto *LI = cast<LoadInst>(&Ingredient);
9315
9316 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9317 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9318 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9319 bool CreateGather = !isConsecutive();
9320
9321 auto &Builder = State.Builder;
9323 CallInst *NewLI;
9324 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9325 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9326 Value *Mask = nullptr;
9327 if (VPValue *VPMask = getMask()) {
9328 Mask = State.get(VPMask, 0);
9329 if (isReverse())
9330 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9331 } else {
9332 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9333 }
9334
9335 if (CreateGather) {
9336 NewLI =
9337 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9338 nullptr, "wide.masked.gather");
9339 } else {
9340 VectorBuilder VBuilder(Builder);
9341 VBuilder.setEVL(EVL).setMask(Mask);
9342 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9343 Instruction::Load, DataTy, Addr, "vp.op.load"));
9344 }
9345 NewLI->addParamAttr(
9346 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9347 State.addMetadata(NewLI, LI);
9348 Instruction *Res = NewLI;
9349 if (isReverse())
9350 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9351 State.set(this, Res, 0);
9352}
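// Rough shape of the emitted IR for the consecutive, non-reversed case at a
// scalable VF (types and names are illustrative only):
//   %wide = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//               ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)
// The gather case instead calls @llvm.vp.gather with a vector of pointers.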
9353
9355 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9356 "explicit vector length.");
9357 auto *SI = cast<StoreInst>(&Ingredient);
9358
9359 VPValue *StoredValue = getStoredValue();
9360 bool CreateScatter = !isConsecutive();
9361 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9362
9363 auto &Builder = State.Builder;
9365
9366 CallInst *NewSI = nullptr;
9367 Value *StoredVal = State.get(StoredValue, 0);
9368 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9369 if (isReverse())
9370 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9371 Value *Mask = nullptr;
9372 if (VPValue *VPMask = getMask()) {
9373 Mask = State.get(VPMask, 0);
9374 if (isReverse())
9375 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9376 } else {
9377 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9378 }
9379 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9380 if (CreateScatter) {
9381 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9382 Intrinsic::vp_scatter,
9383 {StoredVal, Addr, Mask, EVL});
9384 } else {
9385 VectorBuilder VBuilder(Builder);
9386 VBuilder.setEVL(EVL).setMask(Mask);
9387 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9388 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9389 {StoredVal, Addr}));
9390 }
9391 NewSI->addParamAttr(
9392 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9393 State.addMetadata(NewSI, SI);
9394}
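// The store counterpart above emits @llvm.vp.store, or @llvm.vp.scatter for
// non-consecutive addresses; roughly (illustrative types):
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val,
//             ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)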
9395
9396// Determine how to lower the scalar epilogue, which depends on 1) optimising
9397// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9398// predication, and 4) a TTI hook that analyses whether the loop is suitable
9399// for predication.
9404 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9405 // don't look at hints or options, and don't request a scalar epilogue.
9406 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9407 // LoopAccessInfo (due to code dependency and not being able to reliably get
9408 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9409 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9410 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9411 // back to the old way and vectorize with versioning when forced. See D81345.)
9412 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9416
9417 // 2) If set, obey the directives
9418 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9426 };
9427 }
9428
9429 // 3) If set, obey the hints
9430 switch (Hints.getPredicate()) {
9435 };
9436
9437 // 4) if the TTI hook indicates this is profitable, request predication.
9438 TailFoldingInfo TFI(TLI, &LVL, IAI);
9441
9443}
9444
9445// Process the loop in the VPlan-native vectorization path. This path builds
9446 // VPlan upfront in the vectorization pipeline, which allows applying
9447// VPlan-to-VPlan transformations from the very beginning without modifying the
9448// input LLVM IR.
9455 LoopVectorizationRequirements &Requirements) {
9456
9457 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9458 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9459 return false;
9460 }
9461 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9462 Function *F = L->getHeader()->getParent();
9463 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9464
9466 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9467
9468 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9469 &Hints, IAI);
9470 // Use the planner for outer loop vectorization.
9471 // TODO: CM is not used at this point inside the planner. Turn CM into an
9472 // optional argument if we don't need it in the future.
9473 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9474 ORE);
9475
9476 // Get user vectorization factor.
9477 ElementCount UserVF = Hints.getWidth();
9478
9480
9481 // Plan how to best vectorize, return the best VF and its cost.
9482 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9483
9484 // If we are stress testing VPlan builds, do not attempt to generate vector
9485 // code. Masked vector code generation support will follow soon.
9486 // Also, do not attempt to vectorize if no vector code will be produced.
9488 return false;
9489
9490 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9491
9492 {
9493 bool AddBranchWeights =
9494 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9495 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9496 F->getDataLayout(), AddBranchWeights);
9497 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9498 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9499 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9500 << L->getHeader()->getParent()->getName() << "\"\n");
9501 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9502 }
9503
9504 reportVectorization(ORE, L, VF, 1);
9505
9506 // Mark the loop as already vectorized to avoid vectorizing again.
9507 Hints.setAlreadyVectorized();
9508 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9509 return true;
9510}
9511
9512// Emit a remark if there are stores to floats that required a floating point
9513 // extension. If the vectorized loop was generated with floating point, there
9514// will be a performance penalty from the conversion overhead and the change in
9515// the vector width.
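// Illustrative trigger (the types are what matters): with "float *A" and
// "double D", a loop body doing "A[i] = A[i] * D;" is widened to <N x double>
// arithmetic with an fpext of the loaded floats and an fptrunc before the
// store, halving the effective vector width; the fpext is what the walk
// below looks for.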
9516 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9517 SmallVector<Instruction *, 4> Worklist;
9518 for (BasicBlock *BB : L->getBlocks()) {
9519 for (Instruction &Inst : *BB) {
9520 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9521 if (S->getValueOperand()->getType()->isFloatTy())
9522 Worklist.push_back(S);
9523 }
9524 }
9525 }
9526
9527 // Traverse the floating point stores upwards, searching for floating point
9528 // conversions.
9531 while (!Worklist.empty()) {
9532 auto *I = Worklist.pop_back_val();
9533 if (!L->contains(I))
9534 continue;
9535 if (!Visited.insert(I).second)
9536 continue;
9537
9538 // Emit a remark if the floating point store required a floating
9539 // point conversion.
9540 // TODO: More work could be done to identify the root cause such as a
9541 // constant or a function return type and point the user to it.
9542 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9543 ORE->emit([&]() {
9544 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9545 I->getDebugLoc(), L->getHeader())
9546 << "floating point conversion changes vector width. "
9547 << "Mixed floating point precision requires an up/down "
9548 << "cast that will negatively impact performance.";
9549 });
9550
9551 for (Use &Op : I->operands())
9552 if (auto *OpI = dyn_cast<Instruction>(Op))
9553 Worklist.push_back(OpI);
9554 }
9555}
9556
9557static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9559 std::optional<unsigned> VScale, Loop *L,
9560 ScalarEvolution &SE,
9562 InstructionCost CheckCost = Checks.getCost();
9563 if (!CheckCost.isValid())
9564 return false;
9565
9566 // When interleaving only, the scalar and vector costs will be equal, which in
9567 // turn would lead to a divide by 0. Fall back to the hard threshold.
9568 if (VF.Width.isScalar()) {
9569 if (CheckCost > VectorizeMemoryCheckThreshold) {
9570 LLVM_DEBUG(
9571 dbgs()
9572 << "LV: Interleaving only is not profitable due to runtime checks\n");
9573 return false;
9574 }
9575 return true;
9576 }
9577
9578 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9579 uint64_t ScalarC = *VF.ScalarCost.getValue();
9580 if (ScalarC == 0)
9581 return true;
9582
9583 // First, compute the minimum iteration count required so that the vector
9584 // loop outperforms the scalar loop.
9585 // The total cost of the scalar loop is
9586 // ScalarC * TC
9587 // where
9588 // * TC is the actual trip count of the loop.
9589 // * ScalarC is the cost of a single scalar iteration.
9590 //
9591 // The total cost of the vector loop is
9592 // RtC + VecC * (TC / VF) + EpiC
9593 // where
9594 // * RtC is the cost of the generated runtime checks
9595 // * VecC is the cost of a single vector iteration.
9596 // * TC is the actual trip count of the loop
9597 // * VF is the vectorization factor
9598 // * EpiCost is the cost of the generated epilogue, including the cost
9599 // of the remaining scalar operations.
9600 //
9601 // Vectorization is profitable once the total vector cost is less than the
9602 // total scalar cost:
9603 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9604 //
9605 // Now we can compute the minimum required trip count TC as
9606 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9607 //
9608 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9609 // the computations below are performed on integers and the division result
9610 // is rounded up, hence we get an upper estimate of the TC.
9611 unsigned IntVF = VF.Width.getKnownMinValue();
9612 if (VF.Width.isScalable()) {
9613 unsigned AssumedMinimumVscale = 1;
9614 if (VScale)
9615 AssumedMinimumVscale = *VScale;
9616 IntVF *= AssumedMinimumVscale;
9617 }
9618 uint64_t RtC = *CheckCost.getValue();
9619 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9620 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9621
9622 // Second, compute a minimum iteration count so that the cost of the
9623 // runtime checks is only a fraction of the total scalar loop cost. This
9624 // adds a loop-dependent bound on the overhead incurred if the runtime
9625 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9626 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9627 // cost, compute
9628 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9629 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9630
9631 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9632 // epilogue is allowed, choose the next closest multiple of VF. This should
9633 // partly compensate for ignoring the epilogue cost.
9634 uint64_t MinTC = std::max(MinTC1, MinTC2);
9635 if (SEL == CM_ScalarEpilogueAllowed)
9636 MinTC = alignTo(MinTC, IntVF);
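// Worked example with made-up costs: for ScalarC = 4, VecC = 10, RtC = 20 and
// IntVF = 4, MinTC1 = ceil(20 * 4 / (4 * 4 - 10)) = 14 and
// MinTC2 = ceil(20 * 10 / 4) = 50, so MinTC = 50, rounded up to 52 when a
// scalar epilogue is allowed.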
9638
9639 LLVM_DEBUG(
9640 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9641 << VF.MinProfitableTripCount << "\n");
9642
9643 // Skip vectorization if the expected trip count is less than the minimum
9644 // required trip count.
9645 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9648 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9649 "trip count < minimum profitable VF ("
9650 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9651 << ")\n");
9652
9653 return false;
9654 }
9655 }
9656 return true;
9657}
9658
9660 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9662 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9664
9666 assert((EnableVPlanNativePath || L->isInnermost()) &&
9667 "VPlan-native path is not enabled. Only process inner loops.");
9668
9669 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9670 << L->getHeader()->getParent()->getName() << "' from "
9671 << L->getLocStr() << "\n");
9672
9673 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9674
9675 LLVM_DEBUG(
9676 dbgs() << "LV: Loop hints:"
9677 << " force="
9679 ? "disabled"
9681 ? "enabled"
9682 : "?"))
9683 << " width=" << Hints.getWidth()
9684 << " interleave=" << Hints.getInterleave() << "\n");
9685
9686 // Function containing loop
9687 Function *F = L->getHeader()->getParent();
9688
9689 // Looking at the diagnostic output is the only way to determine if a loop
9690 // was vectorized (other than looking at the IR or machine code), so it
9691 // is important to generate an optimization remark for each loop. Most of
9692 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9693 // generated as OptimizationRemark and OptimizationRemarkMissed are
9694 // less verbose, reporting vectorized loops and unvectorized loops that may
9695 // benefit from vectorization, respectively.
9696
9697 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9698 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9699 return false;
9700 }
9701
9702 PredicatedScalarEvolution PSE(*SE, *L);
9703
9704 // Check if it is legal to vectorize the loop.
9705 LoopVectorizationRequirements Requirements;
9706 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9707 &Requirements, &Hints, DB, AC, BFI, PSI);
9709 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9710 Hints.emitRemarkWithHints();
9711 return false;
9712 }
9713
9714 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9715 // here. They may require CFG and instruction level transformations before
9716 // even evaluating whether vectorization is profitable. Since we cannot modify
9717 // the incoming IR, we need to build VPlan upfront in the vectorization
9718 // pipeline.
9719 if (!L->isInnermost())
9720 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9721 ORE, BFI, PSI, Hints, Requirements);
9722
9723 assert(L->isInnermost() && "Inner loop expected.");
9724
9725 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9726 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9727
9728 // If an override option has been passed in for interleaved accesses, use it.
9729 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9730 UseInterleaved = EnableInterleavedMemAccesses;
9731
9732 // Analyze interleaved memory accesses.
9733 if (UseInterleaved)
9735
9736 // Check the function attributes and profiles to find out if this function
9737 // should be optimized for size.
9739 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9740
9741 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9742 // count by optimizing for size, to minimize overheads.
9743 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9744 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9745 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9746 << "This loop is worth vectorizing only if no scalar "
9747 << "iteration overheads are incurred.");
9749 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9750 else {
9751 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9752 LLVM_DEBUG(dbgs() << "\n");
9753 // Predicated tail-folded loops are efficient even when the loop
9754 // iteration count is low. However, setting the epilogue policy to
9755 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9756 // with runtime checks. It's more effective to let
9757 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9758 // for the loop.
9761 } else {
9762 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9763 "small to consider vectorizing.\n");
9765 "The trip count is below the minial threshold value.",
9766 "loop trip count is too low, avoiding vectorization",
9767 "LowTripCount", ORE, L);
9768 Hints.emitRemarkWithHints();
9769 return false;
9770 }
9771 }
9772 }
9773
9774 // Check the function attributes to see if implicit floats or vectors are
9775 // allowed.
9776 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9778 "Can't vectorize when the NoImplicitFloat attribute is used",
9779 "loop not vectorized due to NoImplicitFloat attribute",
9780 "NoImplicitFloat", ORE, L);
9781 Hints.emitRemarkWithHints();
9782 return false;
9783 }
9784
9785 // Check if the target supports potentially unsafe FP vectorization.
9786 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9787 // for the target we're vectorizing for, to make sure none of the
9788 // additional fp-math flags can help.
9789 if (Hints.isPotentiallyUnsafe() &&
9792 "Potentially unsafe FP op prevents vectorization",
9793 "loop not vectorized due to unsafe FP support.",
9794 "UnsafeFP", ORE, L);
9795 Hints.emitRemarkWithHints();
9796 return false;
9797 }
9798
9799 bool AllowOrderedReductions;
9800 // If the flag is set, use that instead and override the TTI behaviour.
9801 if (ForceOrderedReductions.getNumOccurrences() > 0)
9802 AllowOrderedReductions = ForceOrderedReductions;
9803 else
9804 AllowOrderedReductions = TTI->enableOrderedReductions();
9805 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9806 ORE->emit([&]() {
9807 auto *ExactFPMathInst = Requirements.getExactFPInst();
9808 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9809 ExactFPMathInst->getDebugLoc(),
9810 ExactFPMathInst->getParent())
9811 << "loop not vectorized: cannot prove it is safe to reorder "
9812 "floating-point operations";
9813 });
9814 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9815 "reorder floating-point operations\n");
9816 Hints.emitRemarkWithHints();
9817 return false;
9818 }
9819
9820 // Use the cost model.
9821 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9822 F, &Hints, IAI);
9823 // Use the planner for vectorization.
9824 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9825 ORE);
9826
9827 // Get user vectorization factor and interleave count.
9828 ElementCount UserVF = Hints.getWidth();
9829 unsigned UserIC = Hints.getInterleave();
9830
9831 // Plan how to best vectorize, return the best VF and its cost.
9832 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9833
9836
9838 unsigned IC = 1;
9839
9840 bool AddBranchWeights =
9841 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9842 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9843 F->getDataLayout(), AddBranchWeights);
9844 if (MaybeVF) {
9845 VF = *MaybeVF;
9846 // Select the interleave count.
9847 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9848
9849 unsigned SelectedIC = std::max(IC, UserIC);
9850 // Optimistically generate runtime checks if they are needed. Drop them if
9851 // they turn out to not be profitable.
9852 if (VF.Width.isVector() || SelectedIC > 1)
9853 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9854
9855 // Check if it is profitable to vectorize with runtime checks.
9856 bool ForceVectorization =
9858 if (!ForceVectorization &&
9860 *PSE.getSE(), SEL)) {
9861 ORE->emit([&]() {
9863 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9864 L->getHeader())
9865 << "loop not vectorized: cannot prove it is safe to reorder "
9866 "memory operations";
9867 });
9868 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9869 Hints.emitRemarkWithHints();
9870 return false;
9871 }
9872 }
9873
9874 // Identify the diagnostic messages that should be produced.
9875 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9876 bool VectorizeLoop = true, InterleaveLoop = true;
9877 if (VF.Width.isScalar()) {
9878 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9879 VecDiagMsg = std::make_pair(
9880 "VectorizationNotBeneficial",
9881 "the cost-model indicates that vectorization is not beneficial");
9882 VectorizeLoop = false;
9883 }
9884
9885 if (!MaybeVF && UserIC > 1) {
9886 // Tell the user interleaving was avoided up-front, despite being explicitly
9887 // requested.
9888 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9889 "interleaving should be avoided up front\n");
9890 IntDiagMsg = std::make_pair(
9891 "InterleavingAvoided",
9892 "Ignoring UserIC, because interleaving was avoided up front");
9893 InterleaveLoop = false;
9894 } else if (IC == 1 && UserIC <= 1) {
9895 // Tell the user interleaving is not beneficial.
9896 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9897 IntDiagMsg = std::make_pair(
9898 "InterleavingNotBeneficial",
9899 "the cost-model indicates that interleaving is not beneficial");
9900 InterleaveLoop = false;
9901 if (UserIC == 1) {
9902 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9903 IntDiagMsg.second +=
9904 " and is explicitly disabled or interleave count is set to 1";
9905 }
9906 } else if (IC > 1 && UserIC == 1) {
9907 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9908 LLVM_DEBUG(
9909 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9910 IntDiagMsg = std::make_pair(
9911 "InterleavingBeneficialButDisabled",
9912 "the cost-model indicates that interleaving is beneficial "
9913 "but is explicitly disabled or interleave count is set to 1");
9914 InterleaveLoop = false;
9915 }
9916
9917 // Override IC if user provided an interleave count.
9918 IC = UserIC > 0 ? UserIC : IC;
9919
9920 // Emit diagnostic messages, if any.
9921 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9922 if (!VectorizeLoop && !InterleaveLoop) {
9923 // Do not vectorize or interleave the loop.
9924 ORE->emit([&]() {
9925 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9926 L->getStartLoc(), L->getHeader())
9927 << VecDiagMsg.second;
9928 });
9929 ORE->emit([&]() {
9930 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9931 L->getStartLoc(), L->getHeader())
9932 << IntDiagMsg.second;
9933 });
9934 return false;
9935 } else if (!VectorizeLoop && InterleaveLoop) {
9936 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9937 ORE->emit([&]() {
9938 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9939 L->getStartLoc(), L->getHeader())
9940 << VecDiagMsg.second;
9941 });
9942 } else if (VectorizeLoop && !InterleaveLoop) {
9943 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9944 << ") in " << L->getLocStr() << '\n');
9945 ORE->emit([&]() {
9946 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9947 L->getStartLoc(), L->getHeader())
9948 << IntDiagMsg.second;
9949 });
9950 } else if (VectorizeLoop && InterleaveLoop) {
9951 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9952 << ") in " << L->getLocStr() << '\n');
9953 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9954 }
9955
9956 bool DisableRuntimeUnroll = false;
9957 MDNode *OrigLoopID = L->getLoopID();
9958 {
9959 using namespace ore;
9960 if (!VectorizeLoop) {
9961 assert(IC > 1 && "interleave count should not be 1 or 0");
9962 // If we decided that it is not profitable to vectorize the loop, then
9963 // interleave it.
9964 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9965 &CM, BFI, PSI, Checks);
9966
9967 ElementCount BestVF = LVP.computeBestVF();
9968 assert(BestVF.isScalar() &&
9969 "VPlan cost model and legacy cost model disagreed");
9970 VPlan &BestPlan = LVP.getPlanFor(BestVF);
9971 LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
9972
9973 ORE->emit([&]() {
9974 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9975 L->getHeader())
9976 << "interleaved loop (interleaved count: "
9977 << NV("InterleaveCount", IC) << ")";
9978 });
9979 } else {
9980 // If we decided that it is *legal* to vectorize the loop, then do it.
9981
9982 ElementCount BestVF = LVP.computeBestVF();
9983 LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
9984 assert(VF.Width == BestVF &&
9985 "VPlan cost model and legacy cost model disagreed");
9986 VPlan &BestPlan = LVP.getPlanFor(BestVF);
9987 // Consider vectorizing the epilogue too if it's profitable.
9988 VectorizationFactor EpilogueVF =
9989 LVP.selectEpilogueVectorizationFactor(BestVF, IC);
9990 if (EpilogueVF.Width.isVector()) {
9991
9992 // The first pass vectorizes the main loop and creates a scalar epilogue
9993 // to be vectorized by executing the plan (potentially with a different
9994 // factor) again shortly afterwards.
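// For example (the factors are illustrative): a main loop vectorized with
// VF=8 can leave up to 7 remaining iterations; an epilogue vectorized with
// VF=4 handles up to 4 of them in one more vector iteration before falling
// through to the final scalar remainder loop.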
9995 EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
9996 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9997 EPI, &LVL, &CM, BFI, PSI, Checks);
9998
9999 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10000 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10001 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10002 ++LoopsVectorized;
10003
10004 // Second pass vectorizes the epilogue and adjusts the control flow
10005 // edges from the first pass.
10006 EPI.MainLoopVF = EPI.EpilogueVF;
10007 EPI.MainLoopUF = EPI.EpilogueUF;
10008 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10009 ORE, EPI, &LVL, &CM, BFI, PSI,
10010 Checks);
10011
10012 VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
10013 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10014 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10015 Header->setName("vec.epilog.vector.body");
10016
10017 // Re-use the trip count and steps expanded for the main loop, as
10018 // skeleton creation needs it as a value that dominates both the scalar
10019 // and vector epilogue loops
10020 // TODO: This is a workaround needed for epilogue vectorization and it
10021 // should be removed once induction resume value creation is done
10022 // directly in VPlan.
10023 EpilogILV.setTripCount(MainILV.getTripCount());
10024 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10025 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10026 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10027 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10028 ExpandR->replaceAllUsesWith(ExpandedVal);
10029 if (BestEpiPlan.getTripCount() == ExpandR)
10030 BestEpiPlan.resetTripCount(ExpandedVal);
10031 ExpandR->eraseFromParent();
10032 }
10033
10034 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10035 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10036 // before vectorizing the epilogue loop.
10037 for (VPRecipeBase &R : Header->phis()) {
10038 if (isa<VPCanonicalIVPHIRecipe>(&R))
10039 continue;
10040
10041 Value *ResumeV = nullptr;
10042 // TODO: Move setting of resume values to prepareToExecute.
10043 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10044 const RecurrenceDescriptor &RdxDesc =
10045 ReductionPhi->getRecurrenceDescriptor();
10046 RecurKind RK = RdxDesc.getRecurrenceKind();
10047 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10049 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10050 // start value; compare the final value from the main vector loop
10051 // to the start value.
10052 IRBuilder<> Builder(
10053 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10054 ResumeV = Builder.CreateICmpNE(ResumeV,
10055 RdxDesc.getRecurrenceStartValue());
10056 }
10057 } else {
10058 // Create induction resume values for both widened pointer and
10059 // integer/fp inductions and update the start value of the induction
10060 // recipes to use the resume value.
10061 PHINode *IndPhi = nullptr;
10062 const InductionDescriptor *ID;
10063 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10064 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10065 ID = &Ind->getInductionDescriptor();
10066 } else {
10067 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10068 IndPhi = WidenInd->getPHINode();
10069 ID = &WidenInd->getInductionDescriptor();
10070 }
10071
10072 ResumeV = MainILV.createInductionResumeValue(
10073 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10075 }
10076 assert(ResumeV && "Must have a resume value");
10077 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10078 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10079 }
10080
10081 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10082 "DT not preserved correctly");
10083 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10084 DT, true, &ExpandedSCEVs);
10085 ++LoopsEpilogueVectorized;
10086
10087 if (!MainILV.areSafetyChecksAdded())
10088 DisableRuntimeUnroll = true;
10089 } else {
10090 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
10091 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10092 PSI, Checks);
10093 LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
10094 ++LoopsVectorized;
10095
10096 // Add metadata to disable runtime unrolling of the scalar loop when there
10097 // are no runtime checks about strides and memory. A scalar loop that is
10098 // rarely used is not worth unrolling.
10099 if (!LB.areSafetyChecksAdded())
10100 DisableRuntimeUnroll = true;
10101 }
10102 // Report the vectorization decision.
10103 reportVectorization(ORE, L, VF, IC);
10104 }
10105
10108 }
10109
10110 std::optional<MDNode *> RemainderLoopID =
10113 if (RemainderLoopID) {
10114 L->setLoopID(*RemainderLoopID);
10115 } else {
10116 if (DisableRuntimeUnroll)
10118
10119 // Mark the loop as already vectorized to avoid vectorizing again.
10120 Hints.setAlreadyVectorized();
10121 }
10122
10123 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10124 return true;
10125}
10126
10128
10129 // Don't attempt if
10130 // 1. the target claims to have no vector registers, and
10131 // 2. interleaving won't help ILP.
10132 //
10133 // The second condition is necessary because, even if the target has no
10134 // vector registers, loop vectorization may still enable scalar
10135 // interleaving.
10138 return LoopVectorizeResult(false, false);
10139
10140 bool Changed = false, CFGChanged = false;
10141
10142 // The vectorizer requires loops to be in simplified form.
10143 // Since simplification may add new inner loops, it has to run before the
10144 // legality and profitability checks. This means running the loop vectorizer
10145 // will simplify all loops, regardless of whether anything ends up being
10146 // vectorized.
10147 for (const auto &L : *LI)
10148 Changed |= CFGChanged |=
10149 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10150
10151 // Build up a worklist of inner-loops to vectorize. This is necessary as
10152 // the act of vectorizing or partially unrolling a loop creates new loops
10153 // and can invalidate iterators across the loops.
10154 SmallVector<Loop *, 8> Worklist;
10155
10156 for (Loop *L : *LI)
10157 collectSupportedLoops(*L, LI, ORE, Worklist);
10158
10159 LoopsAnalyzed += Worklist.size();
10160
10161 // Now walk the identified inner loops.
10162 while (!Worklist.empty()) {
10163 Loop *L = Worklist.pop_back_val();
10164
10165 // For the inner loops we actually process, form LCSSA to simplify the
10166 // transform.
10167 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10168
10169 Changed |= CFGChanged |= processLoop(L);
10170
10171 if (Changed) {
10172 LAIs->clear();
10173
10174#ifndef NDEBUG
10175 if (VerifySCEV)
10176 SE->verify();
10177#endif
10178 }
10179 }
10180
10181 // Process each loop nest in the function.
10182 return LoopVectorizeResult(Changed, CFGChanged);
10183}
10184
10187 LI = &AM.getResult<LoopAnalysis>(F);
10188 // There are no loops in the function. Return before computing other
10189 // expensive analyses.
10190 if (LI->empty())
10191 return PreservedAnalyses::all();
10200
10201 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10202 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10203 BFI = nullptr;
10204 if (PSI && PSI->hasProfileSummary())
10206 LoopVectorizeResult Result = runImpl(F);
10207 if (!Result.MadeAnyChange)
10208 return PreservedAnalyses::all();
10210
10211 if (isAssignmentTrackingEnabled(*F.getParent())) {
10212 for (auto &BB : F)
10214 }
10215
10216 PA.preserve<LoopAnalysis>();
10220
10221 if (Result.MadeCFGChange) {
10222 // Making CFG changes likely means a loop got vectorized. Indicate that
10223 // extra simplification passes should be run.
10224 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10225 // be run if runtime checks have been added.
10228 } else {
10230 }
10231 return PA;
10232}
10233
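// Prints this pass's textual pipeline parameters, e.g. (assuming both flags
// are false): loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>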
10234 void LoopVectorizePass::printPipeline(
10235 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10236 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10237 OS, MapClassName2PassName);
10238
10239 OS << '<';
10240 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10241 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10242 OS << '>';
10243}
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan)
Feed a resume value for every FOR from the vector loop to the scalar loop, if middle block branches t...
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector< PHINode *, InductionDescriptor > &Inductions)
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
#define R2(n)
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: SandboxIR.h:650
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
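A small usage sketch for the APInt entries above, assuming nothing beyond the listed methods: build an all-ones mask and read it back as a signed 64-bit value.

#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

static void apintDemo() {
  APInt Mask = APInt::getAllOnes(/*numBits=*/8); // 0xFF
  // Sign-extending an all-ones value gives -1 regardless of the bit width.
  assert(Mask.getSExtValue() == -1);
}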
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:460
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:233
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:374
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
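The BasicBlock queries above are the building blocks for the block-shape checks in this file. A hedged sketch, with an illustrative helper name: test whether a block has a single predecessor and ends in a conditional branch.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Illustrative helper: single predecessor plus a conditional branch at the
// end, a shape the vectorizer frequently inspects.
static bool isSinglePredWithCondBranch(const BasicBlock &BB) {
  if (!BB.getSinglePredecessor())
    return false;
  // getTerminator() may return null for a malformed block, hence the
  // null-tolerant cast.
  const auto *Br = dyn_cast_or_null<BranchInst>(BB.getTerminator());
  return Br && Br->isConditional();
}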
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:788
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:146
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
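A small sketch exercising the DenseMap operations listed above, in the spirit of the per-VF decision maps kept by the cost model; the key and value types are illustrative.

#include "llvm/ADT/DenseMap.h"
#include <cassert>

using namespace llvm;

static void denseMapDemo() {
  DenseMap<unsigned, int> CostForVF;
  CostForVF.insert({4, 10});        // no-op if the key already exists
  CostForVF[8] = 25;

  assert(CostForVF.contains(4) && CostForVF.count(8) == 1);
  assert(CostForVF.lookup(16) == 0); // missing keys give a default value
  assert(CostForVF.at(8) == 25);     // at() aborts if the key is missing
}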
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
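A hedged sketch around the DominatorTree entries above: register a freshly created block and verify the tree, roughly what the skeleton-building code does after splitting blocks. The helper name and the chosen verification level are illustrative.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include <cassert>

using namespace llvm;

static void registerNewBlock(DominatorTree &DT, BasicBlock *NewBB,
                             BasicBlock *IDom) {
  // Add NewBB to the tree with IDom as its immediate dominator.
  DT.addNewBlock(NewBB, IDom);
  // Cheap consistency check; only runs in assert-enabled builds.
  assert(DT.verify(DominatorTree::VerificationLevel::Fast) &&
         "dominator tree out of date");
}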
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
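A minimal sketch of the ElementCount API above, covering the fixed and scalable vectorization factors the cost model works with.

#include "llvm/Support/TypeSize.h"
#include <cassert>

using namespace llvm;

static void elementCountDemo() {
  ElementCount Fixed4 = ElementCount::getFixed(4);    // <4 x Ty>
  ElementCount Scal2  = ElementCount::getScalable(2); // <vscale x 2 x Ty>
  ElementCount One    = ElementCount::getFixed(1);    // scalar

  assert(Fixed4.isVector() && Scal2.isVector());
  assert(One.isScalar() && !One.isVector());
  assert(ElementCount::get(2, /*Scalable=*/true) == Scal2);
}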
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:705
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:214
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:384
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:769
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:702
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:743
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2261
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2371
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1421
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2686
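A hedged sketch of the IRBuilder calls listed above, roughly the shape used when the loop skeleton rounds the trip count down to a multiple of VF*UF; the function and operand names are illustrative.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"

using namespace llvm;

static Value *emitVectorTripCount(BasicBlock *BB, Value *TripCount,
                                  Value *VFxUF) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append to the end of BB

  // n.vec = N - (N urem VFxUF): round the trip count down to a multiple of VFxUF.
  Value *Rem = Builder.CreateURem(TripCount, VFxUF, "n.mod.vf");
  return Builder.CreateSub(TripCount, Rem, "n.vec");
}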
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
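A small sketch of the InstructionCost behaviour implied by the entries above: invalid costs propagate through arithmetic, and getValue() only yields a number for valid costs.

#include "llvm/Support/InstructionCost.h"
#include <cassert>

using namespace llvm;

static void costDemo() {
  InstructionCost A = 4;
  InstructionCost B = InstructionCost::getInvalid();
  InstructionCost Sum = A + B; // the invalid state is sticky
  assert(!Sum.isValid() && !Sum.getValue().has_value());
  assert(*A.getValue() == 4);
}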
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:276
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:468
uint32_t getFactor() const
Definition: VectorUtils.h:484
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:538
InstTy * getInsertPos() const
Definition: VectorUtils.h:554
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:610
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:655
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:666
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:647
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:630
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:660
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
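A hedged sketch over the InterleaveGroup/InterleavedAccessInfo entries above: once analyzeInterleaving() has populated the analysis, walk each group and its members. The counting helper is illustrative.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static unsigned countInterleavedMembers(InterleavedAccessInfo &IAI) {
  unsigned NumMembers = 0;
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    for (unsigned I = 0, F = Group->getFactor(); I < F; ++I)
      if (Group->getMember(I)) // gaps in the group return null
        ++NumMembers;
  }
  return NumMembers;
}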
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
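A hedged sketch of the LoopBase queries above, in the spirit of the structural checks legality performs on loop shape; the helper name and the exact set of checks are illustrative.

#include "llvm/Analysis/LoopInfo.h"

using namespace llvm;

static bool hasSimpleShape(const Loop *L) {
  // Innermost, with a preheader, a single latch, and a unique exit block.
  if (!L->isInnermost() || !L->getLoopPreheader() || !L->getLoopLatch())
    return false;
  return L->getUniqueExitBlock() != nullptr;
}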
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1683
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
ElementCount computeBestVF()
Compute and return the most profitable vectorization factor.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1671
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1652
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1697
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1542
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
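A hedged sketch of the metadata APIs above, in the shape used by AddRuntimeUnrollDisableMetaData, which appends llvm.loop.unroll.runtime.disable to a loop's llvm.loop node; the helper below is generic and its name is illustrative.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static MDNode *appendLoopProperty(LLVMContext &Ctx, MDNode *LoopID,
                                  StringRef Tag) {
  SmallVector<Metadata *, 4> MDs;
  MDs.push_back(nullptr); // placeholder for the self-referencing operand
  if (LoopID)
    for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I)
      MDs.push_back(LoopID->getOperand(I));
  MDs.push_back(MDNode::get(Ctx, {MDString::get(Ctx, Tag)}));

  MDNode *NewLoopID = MDNode::get(Ctx, MDs);
  NewLoopID->replaceOperandWith(0, NewLoopID); // llvm.loop IDs reference themselves
  return NewLoopID;
}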
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
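A hedged sketch of emitting a remark through the interfaces above, in the same general shape createLVAnalysis feeds into ORE. "loop-vectorize" is this pass's name; the remark name and message below are illustrative.

#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"

using namespace llvm;

static void reportMissed(OptimizationRemarkEmitter &ORE, Loop *TheLoop,
                         StringRef Msg) {
  // Attach the remark to the loop's start location and header block.
  ORE.emit(OptimizationRemarkMissed("loop-vectorize", "VectorizationMissed",
                                    TheLoop->getStartLoc(),
                                    TheLoop->getHeader())
           << Msg);
}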
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:688
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
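A hedged sketch of the PHINode entries above, in the shape of the induction resume values this file creates: a phi with one incoming value per edge into the scalar preheader. All names are illustrative, and InsertBefore is assumed to be the destination block's first non-PHI instruction.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static PHINode *createResumePhi(Type *Ty, Instruction *InsertBefore,
                                BasicBlock *MainEdge, Value *MainVal,
                                BasicBlock *BypassEdge, Value *BypassVal) {
  PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2, "resume.val",
                                    InsertBefore);
  Resume->addIncoming(MainVal, MainEdge);
  Resume->addIncoming(BypassVal, BypassEdge);
  return Resume;
}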
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
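A hedged sketch of the PreservedAnalyses usage implied by the entries above. Which analyses a vectorizing transform actually keeps up to date is an assumption here; the API calls are the point.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

static PreservedAnalyses reportPreserved(bool Changed, bool ChangedCFG) {
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  // Assumed here: the transform updates LoopInfo and the dominator tree itself.
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  if (!ChangedCFG)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}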
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:70
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
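A hedged sketch of the ScalarEvolution queries listed above: ask whether a pointer's SCEV is loop invariant and whether a small constant trip count is known. SE and L are assumed to come from the usual analyses; the helper itself is illustrative.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"

using namespace llvm;

static bool isInvariantWithKnownTrips(ScalarEvolution &SE, const Loop *L,
                                      Value *Ptr, unsigned &TripCount) {
  const SCEV *PtrSCEV = SE.getSCEV(Ptr);
  TripCount = SE.getSmallConstantTripCount(L); // 0 if unknown
  return SE.isLoopInvariant(PtrSCEV, L);
}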
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
size_type size() const
Definition: SmallPtrSet.h:95
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:384
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:460
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
iterator begin() const
Definition: SmallPtrSet.h:455
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
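A small sketch combining the containers listed above (SetVector for a de-duplicating worklist, SmallVector for storage), a pattern used throughout this file; the helper is illustrative.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static SmallVector<int, 8> dedupPreservingOrder(ArrayRef<int> In) {
  SetVector<int> Seen; // rejects duplicates, keeps insertion order
  for (int V : In)
    Seen.insert(V);
  SmallVector<int, 8> Out;
  Out.append(Seen.begin(), Seen.end());
  return Out;
}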
An instruction for storing to memory.
Definition: Instructions.h:290
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
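The TargetTransformInfo entries above are the cost and legality hooks the vectorizer consults. A minimal sketch of how such queries compose, using only the signatures listed above; the helper name chooseLoadCost and its parameters are hypothetical and not part of this file:
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Cost a predicated vector load: use a masked contiguous load if the target
  // supports it, otherwise fall back to a gather.
  static InstructionCost chooseLoadCost(const TargetTransformInfo &TTI,
                                        Type *VecTy, Align Alignment,
                                        unsigned AddrSpace, const Value *Ptr) {
    auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
    if (TTI.isLegalMaskedLoad(VecTy, Alignment))
      return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                       AddrSpace, CostKind);
    return TTI.getGatherScatterOpCost(Instruction::Load, VecTy, Ptr,
                                      /*VariableMask=*/true, Alignment, CostKind);
  }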
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:239
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:221
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
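A short illustration of these Type queries as the cost model tends to use them; elementBitWidth is a hypothetical helper, not part of this file:
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Return the element bit width to reason about for Ty: for a vector type,
  // getScalarType() yields the element type, otherwise Ty itself.
  static unsigned elementBitWidth(Type *Ty) {
    Type *ScalarTy = Ty->getScalarType();
    if (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy())
      return ScalarTy->getScalarSizeInBits();
    return 0; // pointers, tokens, void: not handled in this sketch
  }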
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2986
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3058
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3010
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:488
iterator end()
Definition: VPlan.h:3020
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3018
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3071
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:217
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3049
bool empty() const
Definition: VPlan.h:3029
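A minimal sketch of how these VPBasicBlock insertion APIs combine, assuming NewRecipe is an unlinked recipe created elsewhere; placeAfterPhis is a hypothetical helper:
  // Place NewRecipe right after the block's phi-like recipes.
  void placeAfterPhis(VPBasicBlock *VPBB, VPRecipeBase *NewRecipe) {
    if (VPBB->empty())
      VPBB->appendRecipe(NewRecipe);                    // becomes the only recipe
    else
      VPBB->insert(NewRecipe, VPBB->getFirstNonPhi());  // before the first non-phi
  }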
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2041
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:437
VPRegionBlock * getParent()
Definition: VPlan.h:509
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:182
void setName(const Twine &newName)
Definition: VPlan.h:502
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:160
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:544
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:534
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3594
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
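A sketch of the VPBuilder calls above used together; buildEdgeMask and the operand names are hypothetical, and all VPValue operands are assumed to exist already:
  // Build "BlockMask && !Cond" at the end of VPBB as an edge mask.
  VPValue *buildEdgeMask(VPBuilder &Builder, VPBasicBlock *VPBB,
                         VPValue *BlockMask, VPValue *Cond, DebugLoc DL) {
    Builder.setInsertPoint(VPBB);
    VPValue *NotCond = Builder.createNot(Cond, DL, "not.cond");
    return Builder.createLogicalAnd(BlockMask, NotCond, DL, "edge.mask");
  }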
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2725
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2754
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:396
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1. operand) + IV (2. operand) * StepValue (3. operand).
VPValue * getStepValue() const
Definition: VPlan.h:2924
VPValue * getStartValue() const
Definition: VPlan.h:2923
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1727
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1771
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1760
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1233
unsigned getOpcode() const
Definition: VPlan.h:1345
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1251
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2098
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:196
static VPLane getFirstLane()
Definition: VPlan.h:180
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:766
VPBasicBlock * getParent()
Definition: VPlan.h:791
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:862
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1153
A recipe for handling reduction phis.
Definition: VPlan.h:1982
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2036
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2028
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2189
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3164
const VPBlockBase * getEntry() const
Definition: VPlan.h:3203
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3235
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2304
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2344
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:895
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:959
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:39
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:202
operand_range operands()
Definition: VPlanValue.h:272
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:257
unsigned getNumOperands() const
Definition: VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:252
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:246
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1426
user_iterator user_begin()
Definition: VPlanValue.h:128
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
user_iterator user_end()
Definition: VPlanValue.h:130
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1430
user_range users()
Definition: VPlanValue.h:132
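A sketch of VPValue::replaceUsesWithIf as declared above; the helper name and the Keep parameter are illustrative:
  // Redirect every use of OldV to NewV, except uses inside the user Keep.
  void replaceUsesExcept(VPValue *OldV, VPValue *NewV, VPUser *Keep) {
    OldV->replaceUsesWithIf(NewV, [Keep](VPUser &U, unsigned /*OpIdx*/) {
      return &U != Keep;
    });
  }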
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1663
A recipe for widening Call instructions.
Definition: VPlan.h:1534
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2850
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1447
A recipe for handling GEP instructions.
Definition: VPlan.h:1621
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1784
A common base class for widening memory operations.
Definition: VPlan.h:2461
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2508
Instruction & Ingredient
Definition: VPlan.h:2463
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2522
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2515
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2512
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1910
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1949
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1946
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1410
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3268
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:921
VPBasicBlock * getEntry()
Definition: VPlan.h:3370
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3395
void setName(const Twine &newName)
Definition: VPlan.h:3432
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3398
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3374
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3388
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3415
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1187
VPBasicBlock * getPreheader()
Definition: VPlan.h:3503
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3470
bool hasVF(ElementCount VF)
Definition: VPlan.h:3408
bool hasUF(unsigned UF) const
Definition: VPlan.h:3421
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1091
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3381
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping the original scalar pre-header) which con...
Definition: VPlan.cpp:863
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3436
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1184
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:981
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3478
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3489
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3493
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1230
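A sketch of the kind of query the driver makes against a candidate plan with the accessors above; planCoversVF is hypothetical:
  // Check whether Plan was built for VF and has the expected structure.
  bool planCoversVF(VPlan &Plan, ElementCount VF) {
    if (!Plan.hasVF(VF))
      return false;
    return Plan.getVectorLoopRegion() != nullptr &&
           Plan.getCanonicalIV() != nullptr;
  }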
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:82
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:78
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:671
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:664
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
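A small example of the ElementCount comparison helpers above; clampVF is hypothetical, and both counts are assumed to have the same scalability, since the isKnown* predicates are only conclusive in that case:
  // Clamp a candidate VF to MaxVF.
  ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
    return ElementCount::isKnownGT(VF, MaxVF) ? MaxVF : VF;
  }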
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:826
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
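A sketch using the PatternMatch helpers above to recognize a single-use multiply of two extends, roughly the shape that matters when costing extended multiply-accumulate reductions; isExtendedMul is illustrative, not the file's actual matcher:
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Match "mul (zext|sext A), (zext|sext B)" with a single use.
  static bool isExtendedMul(Value *V) {
    Value *A, *B;
    return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                   m_ZExtOrSExt(m_Value(B)))));
  }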
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3818
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1615
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
pred_iterator pred_end(BasicBlock *BB)
Definition: CFG.h:114
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1894
void stable_sort(R &&Range)
Definition: STLExtras.h:2020
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2431
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7137
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2098
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
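A sketch of the usual reason make_early_inc_range appears in this pass: erasing recipes while iterating a VPBasicBlock; eraseRecipesIf and its predicate are hypothetical:
  // make_early_inc_range advances the iterator before the body runs, so
  // erasing the current recipe does not invalidate the loop.
  void eraseRecipesIf(VPBasicBlock *VPBB,
                      function_ref<bool(VPRecipeBase &)> Pred) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      if (Pred(R))
        R.eraseFromParent();
  }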
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
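A sketch of a depth-first VPlan walk with vp_depth_first_deep as declared above; countRecipes is hypothetical:
  // Visit every VPBasicBlock of Plan, descending into regions, and count
  // the recipes it contains.
  unsigned countRecipes(VPlan &Plan) {
    unsigned NumRecipes = 0;
    for (VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry()))
      if (auto *VPBB = dyn_cast<VPBasicBlock>(Block))
        NumRecipes += std::distance(VPBB->begin(), VPBB->end());
    return NumRecipes;
  }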
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
pred_iterator pred_begin(BasicBlock *BB)
Definition: CFG.h:110
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:56
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:147
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2242
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1701
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
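A sketch showing what createStepForVF provides; emitVFxUF is a hypothetical wrapper:
  // Materialize VF * UF as an IR value of type IdxTy. For a fixed VF this
  // folds to a constant; for a scalable VF it is expressed in terms of vscale.
  Value *emitVFxUF(IRBuilderBase &B, Type *IdxTy, ElementCount VF, int64_t UF) {
    return createStepForVF(B, IdxTy, VF, UF);
  }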
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1952
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2070
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:95
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
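A small worked example of the integer helpers above (illustrative numbers: bit_floor(6) == 4, divideCeil(100, 8) == 13):
  // Round an interleave-count candidate down to a power of two.
  unsigned roundInterleaveCount(unsigned IC) { return llvm::bit_floor(IC); }

  // Number of vector iterations needed to cover TripCount elements when each
  // iteration handles VFTimesIC of them.
  uint64_t vectorIterations(uint64_t TripCount, unsigned VFTimesIC) {
    return llvm::divideCeil(TripCount, VFTimesIC);
  }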
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:86
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:100
ElementCount End
Definition: VPlan.h:105
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:737
LoopVectorizationCostModel & CM
Definition: VPlan.h:742
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:743
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1955
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:238
bool isFirstIteration() const
Definition: VPlan.h:250
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:384
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:392
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:255
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:259
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:429
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:432
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:374
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:425
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:366
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:406
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:307
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:409
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:415
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:412
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:261
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:385
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2588
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2537
A recipe for widening select instructions.
Definition: VPlan.h:1587
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2663
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2666
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2611
static bool tryAddExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.