1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
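// For illustration only (a simplified sketch, not the exact IR this pass
// emits), a loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + 42;
// is conceptually rewritten for a vectorization factor of 4 as
//   for (i = 0; i + 4 <= n; i += 4) A[i..i+3] = B[i..i+3] + <42,42,42,42>;
// with any remaining iterations handled by a scalar epilogue loop (or by
// predication when the tail is folded into the vector body).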
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171 "llvm.loop.vectorize.followup_vectorized";
173 "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
181 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
185 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
191 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
198 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
204 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
208// and that predication is preferred; the options below list the choices. I.e., the
209// vectorizer will try to fold the tail loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
213 enum Option {
217 };
218} // namespace PreferPredicateTy
219
221 "prefer-predicate-over-epilogue",
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
260 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
265 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
271 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
275 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
279 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
283 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
288 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
293 "force-target-instruction-cost", cl::init(0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
299 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
305 "small-loop-cost", cl::init(20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
310 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
317 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
323 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
327 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
331 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
335 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
346 "force-ordered-reductions", cl::init(false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
351 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
362// This flag enables the stress testing of the VPlan H-CFG construction in the
363// VPlan-native vectorization path. It must be used in conjunction with
364// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
365// verification of the H-CFGs built.
367 "vplan-build-stress-test", cl::init(false), cl::Hidden,
368 cl::desc(
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
372
374 "interleave-loops", cl::init(true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
377 "vectorize-loops", cl::init(true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
394// Likelihood of bypassing the vectorized loop because assumptions about SCEV
395// variables not overflowing do not hold. See `emitSCEVChecks`.
396static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because pointers overlap. See
398// `emitMemRuntimeChecks`.
399static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after prolog. See `emitIterationCountCheck`.
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
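// These weights become !prof branch_weights metadata on the corresponding
// bypass branches, e.g. (illustrative):
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !prof !0
//   !0 = !{!"branch_weights", i32 1, i32 127}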
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
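// For example (under a typical data layout, for illustration): i1 has a type
// size of 1 bit but an alloc size of 8 bits, and x86_fp80 stores 80 bits but
// allocates 96 or 128, so both are irregular here, while i32 (32/32) is not.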
413
414/// A helper function that returns the reciprocal of the block probability of
415/// predicated blocks. If we return X, we are assuming the predicated block
416/// will execute once for every X iterations of the loop header.
417///
418/// TODO: We should use actual block probability here, if available. Currently,
419/// we always assume predicated blocks have a 50% chance of executing.
420static unsigned getReciprocalPredBlockProb() { return 2; }
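// For example, a predicated block whose scalarized body costs C is assumed to
// execute once every getReciprocalPredBlockProb() == 2 header iterations, so
// callers discount its contribution to the loop cost to roughly C / 2 (a
// sketch of how this value is used, not an exact formula).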
421
422/// Returns "best known" trip count for the specified loop \p L as defined by
423/// the following procedure:
424/// 1) Returns exact trip count if it is known.
425/// 2) Returns expected trip count according to profile data if any.
426/// 3) Returns upper bound estimate if it is known.
427/// 4) Returns std::nullopt if all of the above failed.
428static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
429 Loop *L) {
430 // Check if exact trip count is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
432 return ExpectedTC;
433
434 // Check if there is an expected trip count available from profile data.
436 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
437 return *EstimatedTC;
438
439 // Check if upper bound estimate is known.
440 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
441 return ExpectedTC;
442
443 return std::nullopt;
444}
445
446/// Return a vector containing interleaved elements from multiple
447/// smaller input vectors.
449 const Twine &Name) {
450 unsigned Factor = Vals.size();
451 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
452
453 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
454#ifndef NDEBUG
455 for (Value *Val : Vals)
456 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
457#endif
458
459 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
460 // we must use intrinsics to interleave.
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.CreateIntrinsic(
464 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
465 /*FMFSource=*/nullptr, Name);
466 }
467
468 // Fixed length. Start by concatenating all vectors into a wide vector.
469 Value *WideVec = concatenateVectors(Builder, Vals);
470
471 // Interleave the elements into the wide vector.
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
473 return Builder.CreateShuffleVector(
474 WideVec, createInterleaveMask(NumElts, Factor), Name);
475}
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found for a given vectorization factor.
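/// A rough sketch of the control flow generated around the original loop
/// (illustrative; the exact set of blocks depends on which checks are
/// needed):
///
///   iteration-count check -> SCEV checks -> memory checks
///     -> vector.ph -> vector.body -> middle.block
///     -> scalar.ph -> original (scalar epilogue) loop -> exit
///
/// with bypass edges from the check blocks to scalar.ph.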
503public:
506 const TargetLibraryInfo *TLI,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
521
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Create a new empty loop that will contain vectorized instructions later
531 /// on, while the old loop will be used as the scalar remainder. Control flow
532 /// is generated around the vectorized (and scalar epilogue) loops consisting
533 /// of various checks and bypasses. Return the pre-header block of the new
534 /// loop and the start value for the canonical induction, if it is != 0. The
535 /// latter is the case when vectorizing the epilogue loop. In the case of
536/// epilogue vectorization, this function is overridden to handle the more
537 /// complex control flow around the loops. \p ExpandedSCEVs is used to
538 /// look up SCEV expansions for expressions needed during skeleton creation.
539 virtual std::pair<BasicBlock *, Value *>
540 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
542 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
543 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
547
548 /// A helper function to scalarize a single Instruction in the innermost loop.
549 /// Generates a sequence of scalar instances for each lane between \p MinLane
550 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
551 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
552 /// Instr's operands.
553 void scalarizeInstruction(const Instruction *Instr,
554 VPReplicateRecipe *RepRecipe,
555 const VPIteration &Instance,
556 VPTransformState &State);
557
558 /// Try to vectorize interleaved access group \p Group with the base address
559 /// given in \p Addr, optionally masking the vector operations if \p
560 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
561 /// values in the vectorized loop.
563 ArrayRef<VPValue *> VPDefs,
565 ArrayRef<VPValue *> StoredValues,
566 VPValue *BlockInMask, bool NeedsMaskForGaps);
567
568 /// Fix the non-induction PHIs in \p Plan.
569 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
570
571 /// Create a new phi node for the induction variable \p OrigPhi to resume
572 /// iteration count in the scalar epilogue, from where the vectorized loop
573 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
574 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
575 /// and the resume values can come from an additional bypass block, the \p
576 /// AdditionalBypass pair provides information about the bypass block and the
577 /// end value on the edge from bypass to this loop.
579 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
580 ArrayRef<BasicBlock *> BypassBlocks,
581 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
582
583 /// Returns the original loop trip count.
584 Value *getTripCount() const { return TripCount; }
585
586 /// Used to set the trip count after ILV's construction and after the
587 /// preheader block has been executed. Note that this always holds the trip
588 /// count of the original loop for both main loop and epilogue vectorization.
589 void setTripCount(Value *TC) { TripCount = TC; }
590
591protected:
593
594 /// A small list of PHINodes.
596
597 /// A type for scalarized values in the new loop. Each value from the
598 /// original loop, when scalarized, is represented by UF x VF scalar values
599 /// in the new unrolled loop, where UF is the unroll factor and VF is the
600 /// vectorization factor.
602
603 /// Set up the values of the IVs correctly when exiting the vector loop.
604 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605 Value *VectorTripCount, Value *EndValue,
606 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
607 VPlan &Plan, VPTransformState &State);
608
609 /// Create the exit value of first order recurrences in the middle block and
610 /// update their users.
612 VPTransformState &State);
613
614 /// Iteratively sink the scalarized operands of a predicated instruction into
615 /// the block that was created for it.
616 void sinkScalarOperands(Instruction *PredInst);
617
618 /// Returns (and creates if needed) the trip count of the widened loop.
620
621 /// Returns a bitcasted value to the requested vector type.
622 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624 const DataLayout &DL);
625
626 /// Emit a bypass check to see if the vector trip count is zero, including if
627 /// it overflows.
629
630 /// Emit a bypass check to see if all of the SCEV assumptions we've
631 /// had to make are correct. Returns the block containing the checks or
632 /// nullptr if no checks have been added.
634
635 /// Emit bypass checks to check any memory assumptions we may have made.
636 /// Returns the block containing the checks or nullptr if no checks have been
637 /// added.
639
640 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
641 /// vector loop preheader, middle block and scalar preheader.
643
644 /// Create new phi nodes for the induction variables to resume iteration count
645 /// in the scalar epilogue, from where the vectorized loop left off.
646 /// In cases where the loop skeleton is more complicated (e.g. epilogue
647 /// vectorization) and the resume values can come from an additional bypass
648 /// block, the \p AdditionalBypass pair provides information about the bypass
649 /// block and the end value on the edge from bypass to this loop.
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
653
654 /// Complete the loop skeleton by adding debug MDs, creating appropriate
655 /// conditional branches in the middle block, preparing the builder and
656 /// running the verifier. Return the preheader of the completed vector loop.
658
659 /// Allow subclasses to override and print debug traces before/after vplan
660 /// execution, when trace information is requested.
661 virtual void printDebugTracesAtStart(){};
662 virtual void printDebugTracesAtEnd(){};
663
664 /// The original loop.
666
667 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
668 /// dynamic knowledge to simplify SCEV expressions and converts them to a
669 /// more usable form.
671
672 /// Loop Info.
674
675 /// Dominator Tree.
677
678 /// Target Library Info.
680
681 /// Target Transform Info.
683
684 /// Assumption Cache.
686
687 /// Interface to emit optimization remarks.
689
690 /// The vectorization SIMD factor to use. Each vector will have this many
691 /// vector elements.
693
695
696 /// The vectorization unroll factor to use. Each scalar is vectorized to this
697 /// many different vector instructions.
698 unsigned UF;
699
700 /// The builder that we use
702
703 // --- Vectorization state ---
704
705 /// The vector-loop preheader.
707
708 /// The scalar-loop preheader.
710
711 /// Middle Block between the vector and the scalar.
713
714 /// The unique ExitBlock of the scalar loop if one exists. Note that
715 /// there can be multiple exiting edges reaching this block.
717
718 /// The scalar loop body.
720
721 /// A list of all bypass blocks. The first block is the entry of the loop.
723
724 /// Store instructions that were predicated.
726
727 /// Trip count of the original loop.
728 Value *TripCount = nullptr;
729
730 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
732
733 /// The legality analysis.
735
736 /// The profitability analysis.
738
739 // Record whether runtime checks are added.
740 bool AddedSafetyChecks = false;
741
742 // Holds the end values for each induction variable. We save the end values
743 // so we can later fix-up the external users of the induction variables.
745
746 /// BFI and PSI are used to check for profile guided size optimizations.
749
750 // Whether this loop should be optimized for size based on profile guided size
751 // optimizations.
753
754 /// Structure to hold information about generated runtime checks, responsible
755 /// for cleaning up the checks if vectorization turns out to be unprofitable.
756 GeneratedRTChecks &RTChecks;
757
758 // Holds the resume values for reductions in the loops, used to set the
759 // correct start value of reduction PHIs when vectorizing the epilogue.
762};
763
765public:
768 const TargetLibraryInfo *TLI,
770 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
773 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
775 ElementCount::getFixed(1),
776 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
777 BFI, PSI, Check) {}
778};
779
780/// Encapsulate information regarding vectorization of a loop and its epilogue.
781/// This information is meant to be updated and used across two stages of
782/// epilogue vectorization.
785 unsigned MainLoopUF = 0;
787 unsigned EpilogueUF = 0;
792 Value *TripCount = nullptr;
794
796 ElementCount EVF, unsigned EUF)
797 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798 assert(EUF == 1 &&
799 "A high UF for the epilogue loop is likely not beneficial.");
800 }
801};
802
803/// An extension of the inner loop vectorizer that creates a skeleton for a
804/// vectorized loop that has its epilogue (residual) also vectorized.
805/// The idea is to run the vplan on a given loop twice, first to set up the
806/// skeleton and vectorize the main loop, and second to complete the skeleton
807/// from the first step and vectorize the epilogue. This is achieved by
808/// deriving two concrete strategy classes from this base class and invoking
809/// them in succession from the loop vectorizer planner.
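/// A sketch of the resulting layout (illustrative):
///
///   main vector loop (MainLoopVF x MainLoopUF)
///     -> epilogue vector loop (EpilogueVF x EpilogueUF)
///       -> scalar remainder loop
///
/// with iteration-count checks deciding how far down this chain a given
/// trip count actually proceeds.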
811public:
819 GeneratedRTChecks &Checks)
821 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
822 CM, BFI, PSI, Checks),
823 EPI(EPI) {}
824
825 // Override this function to handle the more complex control flow around the
826 // three loops.
827 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
828 const SCEV2ValueTy &ExpandedSCEVs) final {
829 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
830 }
831
832 /// The interface for creating a vectorized skeleton using one of two
833 /// different strategies, each corresponding to one execution of the vplan
834 /// as described above.
835 virtual std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
837
838 /// Holds and updates state information required to vectorize the main loop
839 /// and its epilogue in two separate passes. This setup helps us avoid
840 /// regenerating and recomputing runtime safety checks. It also helps us to
841 /// shorten the iteration-count-check path length for the cases where the
842 /// iteration count of the loop is so small that the main vector loop is
843 /// completely skipped.
845};
846
847/// A specialized derived class of inner loop vectorizer that performs
848/// vectorization of *main* loops in the process of vectorizing loops and their
849/// epilogues.
851public:
859 GeneratedRTChecks &Check)
861 EPI, LVL, CM, BFI, PSI, Check) {}
862 /// Implements the interface for creating a vectorized skeleton using the
864 /// *main loop* strategy (i.e. the first pass of vplan execution).
864 std::pair<BasicBlock *, Value *>
865 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
866
867protected:
868 /// Emits an iteration count bypass check once for the main loop (when \p
869 /// ForEpilogue is false) and once for the epilogue loop (when \p
870 /// ForEpilogue is true).
871 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872 void printDebugTracesAtStart() override;
873 void printDebugTracesAtEnd() override;
874};
875
876// A specialized derived class of inner loop vectorizer that performs
877// vectorization of *epilogue* loops in the process of vectorizing loops and
878// their epilogues.
880public:
888 GeneratedRTChecks &Checks)
890 EPI, LVL, CM, BFI, PSI, Checks) {
892 }
893 /// Implements the interface for creating a vectorized skeleton using the
894 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
895 std::pair<BasicBlock *, Value *>
896 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897
898protected:
899 /// Emits an iteration count bypass check after the main vector loop has
900 /// finished to see if there are any iterations left to execute by either
901 /// the vector epilogue or the scalar epilogue.
903 BasicBlock *Bypass,
904 BasicBlock *Insert);
905 void printDebugTracesAtStart() override;
906 void printDebugTracesAtEnd() override;
907};
908} // end namespace llvm
909
910/// Look for a meaningful debug location on the instruction or its
911/// operands.
913 if (!I)
914 return DebugLoc();
915
917 if (I->getDebugLoc() != Empty)
918 return I->getDebugLoc();
919
920 for (Use &Op : I->operands()) {
921 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
922 if (OpInst->getDebugLoc() != Empty)
923 return OpInst->getDebugLoc();
924 }
925
926 return I->getDebugLoc();
927}
928
929/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
930/// is passed, the message relates to that particular instruction.
931#ifndef NDEBUG
932static void debugVectorizationMessage(const StringRef Prefix,
933 const StringRef DebugMsg,
934 Instruction *I) {
935 dbgs() << "LV: " << Prefix << DebugMsg;
936 if (I != nullptr)
937 dbgs() << " " << *I;
938 else
939 dbgs() << '.';
940 dbgs() << '\n';
941}
942#endif
943
944/// Create an analysis remark that explains why vectorization failed
945///
946/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
947/// RemarkName is the identifier for the remark. If \p I is passed it is an
948/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
949/// the location of the remark. \return the remark object that can be
950/// streamed to.
952 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
953 Value *CodeRegion = TheLoop->getHeader();
954 DebugLoc DL = TheLoop->getStartLoc();
955
956 if (I) {
957 CodeRegion = I->getParent();
958 // If there is no debug location attached to the instruction, fall back to
959 // using the loop's.
960 if (I->getDebugLoc())
961 DL = I->getDebugLoc();
962 }
963
964 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
965}
966
967namespace llvm {
968
969/// Return a value for Step multiplied by VF.
971 int64_t Step) {
972 assert(Ty->isIntegerTy() && "Expected an integer step");
973 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
974}
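// For example (illustrative): with a scalable VF of <vscale x 4> and Step = 2
// this emits the runtime value 8 * vscale, while with a fixed VF of 4 it
// folds to the constant 8.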
975
976/// Return the runtime value for VF.
978 return B.CreateElementCount(Ty, VF);
979}
980
982 Loop *OrigLoop) {
983 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
984 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
985
986 ScalarEvolution &SE = *PSE.getSE();
987 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
988}
989
991 const StringRef OREMsg, const StringRef ORETag,
992 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
993 Instruction *I) {
994 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
995 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
996 ORE->emit(
997 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
998 << "loop not vectorized: " << OREMsg);
999}
1000
1001void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1002 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1003 Instruction *I) {
1005 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1006 ORE->emit(
1007 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1008 << Msg);
1009}
1010
1011/// Report successful vectorization of the loop. In case an outer loop is
1012/// vectorized, prepend "outer" to the vectorization remark.
1014 VectorizationFactor VF, unsigned IC) {
1016 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1017 nullptr));
1018 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1019 ORE->emit([&]() {
1020 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1021 TheLoop->getHeader())
1022 << "vectorized " << LoopType << "loop (vectorization width: "
1023 << ore::NV("VectorizationFactor", VF.Width)
1024 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1025 });
1026}
1027
1028} // end namespace llvm
1029
1030#ifndef NDEBUG
1031/// \return string containing a file name and a line # for the given loop.
1032static std::string getDebugLocString(const Loop *L) {
1033 std::string Result;
1034 if (L) {
1035 raw_string_ostream OS(Result);
1036 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1037 LoopDbgLoc.print(OS);
1038 else
1039 // Just print the module name.
1040 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1041 OS.flush();
1042 }
1043 return Result;
1044}
1045#endif
1046
1047namespace llvm {
1048
1049// Loop vectorization cost-model hints for how the scalar epilogue loop should be
1050// lowered.
1052
1053 // The default: allowing scalar epilogues.
1055
1056 // Vectorization with OptForSize: don't allow epilogues.
1058
1059 // A special case of vectorization with OptForSize: loops with a very small
1060 // trip count are considered for vectorization under OptForSize, thereby
1061 // making sure the cost of their loop body is dominant, free of runtime
1062 // guards and scalar iteration overheads.
1064
1065 // Loop hint predicate indicating an epilogue is undesired.
1067
1068 // Directive indicating we must either tail fold or not vectorize
1071
1072using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073
1074/// LoopVectorizationCostModel - estimates the expected speedups due to
1075/// vectorization.
1076/// In many cases vectorization is not profitable. This can happen for a
1077/// number of reasons. In this class we mainly attempt to predict the
1078/// expected speedups/slowdowns due to the supported instruction set. We use the
1079/// TargetTransformInfo to query the different backends for the cost of
1080/// different operations.
1082public:
1086 const TargetTransformInfo &TTI,
1092 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1093 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1094 Hints(Hints), InterleaveInfo(IAI) {}
1095
1096 /// \return An upper bound for the vectorization factors (both fixed and
1097 /// scalable). If the factors are 0, vectorization and interleaving should be
1098 /// avoided up front.
1099 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1100
1101 /// \return True if runtime checks are required for vectorization, and false
1102 /// otherwise.
1103 bool runtimeChecksRequired();
1104
1105 /// Setup cost-based decisions for user vectorization factor.
1106 /// \return true if the UserVF is a feasible VF to be chosen.
1110 return expectedCost(UserVF).first.isValid();
1111 }
1112
1113 /// \return The size (in bits) of the smallest and widest types in the code
1114 /// that needs to be vectorized. We ignore values that remain scalar such as
1115 /// 64 bit loop indices.
1116 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1117
1118 /// \return The desired interleave count.
1119 /// If interleave count has been specified by metadata it will be returned.
1120 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1121 /// are the selected vectorization factor and the cost of the selected VF.
1122 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1123
1124 /// A memory access instruction may be vectorized in more than one way.
1125 /// The form of the instruction after vectorization depends on cost.
1126 /// This function takes cost-based decisions for Load/Store instructions
1127 /// and collects them in a map. This decision map is used for building
1128 /// the lists of loop-uniform and loop-scalar instructions.
1129 /// The calculated cost is saved with widening decision in order to
1130 /// avoid redundant calculations.
1132
1133 /// A call may be vectorized in different ways depending on whether we have
1134 /// vectorized variants available and whether the target supports masking.
1135 /// This function analyzes all calls in the function at the supplied VF,
1136 /// makes a decision based on the costs of available options, and stores that
1137 /// decision in a map for use in planning and plan execution.
1139
1140 /// A struct that represents some properties of the register usage
1141 /// of a loop.
1143 /// Holds the number of loop invariant values that are used in the loop.
1144 /// The key is ClassID of target-provided register class.
1146 /// Holds the maximum number of concurrent live intervals in the loop.
1147 /// The key is ClassID of target-provided register class.
1149 };
1150
1151 /// \return Returns information about the register usages of the loop for the
1152 /// given vectorization factors.
1155
1156 /// Collect values we want to ignore in the cost model.
1157 void collectValuesToIgnore();
1158
1159 /// Collect all element types in the loop for which widening is needed.
1161
1162 /// Split reductions into those that happen in the loop, and those that happen
1163 /// outside. In-loop reductions are collected into InLoopReductions.
1165
1166 /// Returns true if we should use strict in-order reductions for the given
1167 /// RdxDesc. This is true if the -force-ordered-reductions flag is passed,
1168 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1169 /// of FP operations.
1170 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1171 return !Hints->allowReordering() && RdxDesc.isOrdered();
1172 }
1173
1174 /// \returns The smallest bitwidth each instruction can be represented with.
1175 /// The vector equivalents of these instructions should be truncated to this
1176 /// type.
1178 return MinBWs;
1179 }
1180
1181 /// \returns True if it is more profitable to scalarize instruction \p I for
1182 /// vectorization factor \p VF.
1184 assert(VF.isVector() &&
1185 "Profitable to scalarize relevant only for VF > 1.");
1186 assert(
1187 TheLoop->isInnermost() &&
1188 "cost-model should not be used for outer loops (in VPlan-native path)");
1189
1190 auto Scalars = InstsToScalarize.find(VF);
1191 assert(Scalars != InstsToScalarize.end() &&
1192 "VF not yet analyzed for scalarization profitability");
1193 return Scalars->second.contains(I);
1194 }
1195
1196 /// Returns true if \p I is known to be uniform after vectorization.
1198 assert(
1199 TheLoop->isInnermost() &&
1200 "cost-model should not be used for outer loops (in VPlan-native path)");
1201 // Pseudo probe needs to be duplicated for each unrolled iteration and
1202 // vector lane so that profiled loop trip count can be accurately
1203 // accumulated instead of being undercounted.
1204 if (isa<PseudoProbeInst>(I))
1205 return false;
1206
1207 if (VF.isScalar())
1208 return true;
1209
1210 auto UniformsPerVF = Uniforms.find(VF);
1211 assert(UniformsPerVF != Uniforms.end() &&
1212 "VF not yet analyzed for uniformity");
1213 return UniformsPerVF->second.count(I);
1214 }
1215
1216 /// Returns true if \p I is known to be scalar after vectorization.
1218 assert(
1219 TheLoop->isInnermost() &&
1220 "cost-model should not be used for outer loops (in VPlan-native path)");
1221 if (VF.isScalar())
1222 return true;
1223
1224 auto ScalarsPerVF = Scalars.find(VF);
1225 assert(ScalarsPerVF != Scalars.end() &&
1226 "Scalar values are not calculated for VF");
1227 return ScalarsPerVF->second.count(I);
1228 }
1229
1230 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1231 /// for vectorization factor \p VF.
1233 return VF.isVector() && MinBWs.contains(I) &&
1234 !isProfitableToScalarize(I, VF) &&
1236 }
1237
1239 /// Decision that was taken during cost calculation for a memory instruction.
1241 CM_Widen, // For consecutive accesses with stride +1.
1242 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1249
1250 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1251 /// instruction \p I and vector width \p VF.
1254 assert(VF.isVector() && "Expected VF >=2");
1255 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1256 }
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// interleaving group \p Grp and vector width \p VF.
1263 assert(VF.isVector() && "Expected VF >=2");
1264 /// Broadcast this decision to all instructions inside the group.
1265 /// But the cost will be assigned to one instruction only.
1266 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1267 if (auto *I = Grp->getMember(i)) {
1268 if (Grp->getInsertPos() == I)
1269 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1270 else
1271 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1272 }
1273 }
1274 }
1275
1276 /// Return the cost model decision for the given instruction \p I and vector
1277 /// width \p VF. Return CM_Unknown if this instruction did not pass
1278 /// through the cost modeling.
1280 assert(VF.isVector() && "Expected VF to be a vector VF");
1281 assert(
1282 TheLoop->isInnermost() &&
1283 "cost-model should not be used for outer loops (in VPlan-native path)");
1284
1285 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1286 auto Itr = WideningDecisions.find(InstOnVF);
1287 if (Itr == WideningDecisions.end())
1288 return CM_Unknown;
1289 return Itr->second.first;
1290 }
1291
1292 /// Return the vectorization cost for the given instruction \p I and vector
1293 /// width \p VF.
1295 assert(VF.isVector() && "Expected VF >=2");
1296 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1297 assert(WideningDecisions.contains(InstOnVF) &&
1298 "The cost is not calculated");
1299 return WideningDecisions[InstOnVF].second;
1300 }
1301
1306 std::optional<unsigned> MaskPos;
1308 };
1309
1311 Function *Variant, Intrinsic::ID IID,
1312 std::optional<unsigned> MaskPos,
1314 assert(!VF.isScalar() && "Expected vector VF");
1315 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1316 MaskPos, Cost};
1317 }
1318
1320 ElementCount VF) const {
1321 assert(!VF.isScalar() && "Expected vector VF");
1322 return CallWideningDecisions.at(std::make_pair(CI, VF));
1323 }
1324
1325 /// Return True if instruction \p I is an optimizable truncate whose operand
1326 /// is an induction variable. Such a truncate will be removed by adding a new
1327 /// induction variable with the destination type.
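 /// For example (illustrative IR): given a primary i64 induction %iv, a use
 ///   %t = trunc i64 %iv to i32
 /// can instead be served by a new i32 induction variable, so no truncate
 /// remains in the vectorized loop body.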
1329 // If the instruction is not a truncate, return false.
1330 auto *Trunc = dyn_cast<TruncInst>(I);
1331 if (!Trunc)
1332 return false;
1333
1334 // Get the source and destination types of the truncate.
1335 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1336 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1337
1338 // If the truncate is free for the given types, return false. Replacing a
1339 // free truncate with an induction variable would add an induction variable
1340 // update instruction to each iteration of the loop. We exclude from this
1341 // check the primary induction variable since it will need an update
1342 // instruction regardless.
1343 Value *Op = Trunc->getOperand(0);
1344 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1345 return false;
1346
1347 // If the truncated value is not an induction variable, return false.
1348 return Legal->isInductionPhi(Op);
1349 }
1350
1351 /// Collects the instructions to scalarize for each predicated instruction in
1352 /// the loop.
1354
1355 /// Collect Uniform and Scalar values for the given \p VF.
1356 /// The sets depend on CM decision for Load/Store instructions
1357 /// that may be vectorized as interleave, gather-scatter or scalarized.
1358 /// Also make a decision on what to do about call instructions in the loop
1359 /// at that VF -- scalarize, call a known vector routine, or call a
1360 /// vector intrinsic.
1362 // Do the analysis once.
1363 if (VF.isScalar() || Uniforms.contains(VF))
1364 return;
1367 collectLoopUniforms(VF);
1368 collectLoopScalars(VF);
1369 }
1370
1371 /// Returns true if the target machine supports masked store operation
1372 /// for the given \p DataType and kind of access to \p Ptr.
1373 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1374 return Legal->isConsecutivePtr(DataType, Ptr) &&
1375 TTI.isLegalMaskedStore(DataType, Alignment);
1376 }
1377
1378 /// Returns true if the target machine supports masked load operation
1379 /// for the given \p DataType and kind of access to \p Ptr.
1380 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1381 return Legal->isConsecutivePtr(DataType, Ptr) &&
1382 TTI.isLegalMaskedLoad(DataType, Alignment);
1383 }
1384
1385 /// Returns true if the target machine can represent \p V as a masked gather
1386 /// or scatter operation.
1388 bool LI = isa<LoadInst>(V);
1389 bool SI = isa<StoreInst>(V);
1390 if (!LI && !SI)
1391 return false;
1392 auto *Ty = getLoadStoreType(V);
1394 if (VF.isVector())
1395 Ty = VectorType::get(Ty, VF);
1396 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1397 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1398 }
1399
1400 /// Returns true if the target machine supports all of the reduction
1401 /// variables found for the given VF.
1403 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1404 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1405 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1406 }));
1407 }
1408
1409 /// Given costs for both strategies, return true if the scalar predication
1410 /// lowering should be used for div/rem. This incorporates an override
1411 /// option so it is not simply a cost comparison.
1413 InstructionCost SafeDivisorCost) const {
1414 switch (ForceSafeDivisor) {
1415 case cl::BOU_UNSET:
1416 return ScalarCost < SafeDivisorCost;
1417 case cl::BOU_TRUE:
1418 return false;
1419 case cl::BOU_FALSE:
1420 return true;
1421 };
1422 llvm_unreachable("impossible case value");
1423 }
1424
1425 /// Returns true if \p I is an instruction which requires predication and
1426 /// for which our chosen predication strategy is scalarization (i.e. we
1427 /// don't have an alternate strategy such as masking available).
1428 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1430
1431 /// Returns true if \p I is an instruction that needs to be predicated
1432 /// at runtime. The result is independent of the predication mechanism.
1433 /// Superset of instructions that return true for isScalarWithPredication.
1434 bool isPredicatedInst(Instruction *I) const;
1435
1436 /// Return the costs for our two available strategies for lowering a
1437 /// div/rem operation which requires speculating at least one lane.
1438 /// First result is for scalarization (will be invalid for scalable
1439 /// vectors); second is for the safe-divisor strategy.
1440 std::pair<InstructionCost, InstructionCost>
1442 ElementCount VF) const;
1443
1444 /// Returns true if \p I is a memory instruction with consecutive memory
1445 /// access that can be widened.
1447
1448 /// Returns true if \p I is a memory instruction in an interleaved-group
1449 /// of memory accesses that can be vectorized with wide vector loads/stores
1450 /// and shuffles.
1452
1453 /// Check if \p Instr belongs to any interleaved access group.
1455 return InterleaveInfo.isInterleaved(Instr);
1456 }
1457
1458 /// Get the interleaved access group that \p Instr belongs to.
1461 return InterleaveInfo.getInterleaveGroup(Instr);
1462 }
1463
1464 /// Returns true if we're required to use a scalar epilogue for at least
1465 /// the final iteration of the original loop.
1466 bool requiresScalarEpilogue(bool IsVectorizing) const {
1468 return false;
1469 // If we might exit from anywhere but the latch, must run the exiting
1470 // iteration in scalar form.
1472 return true;
1473 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1474 }
1475
1476 /// Returns true if we're required to use a scalar epilogue for at least
1477 /// the final iteration of the original loop for all VFs in \p Range.
1478 /// A scalar epilogue must either be required for all VFs in \p Range or for
1479 /// none.
1481 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1482 return requiresScalarEpilogue(VF.isVector());
1483 };
1484 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1485 assert(
1486 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1487 "all VFs in range must agree on whether a scalar epilogue is required");
1488 return IsRequired;
1489 }
1490
1491 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1492 /// loop hint annotation.
1494 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1495 }
1496
1497 /// Returns the TailFoldingStyle that is best for the current loop.
1498 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1499 if (!ChosenTailFoldingStyle)
1501 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1502 : ChosenTailFoldingStyle->second;
1503 }
1504
1505 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1506 /// update may overflow or not.
1507 /// \param IsScalableVF true if scalable vector factors are enabled.
1508 /// \param UserIC User specific interleave count.
1509 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1510 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1512 ChosenTailFoldingStyle =
1514 return;
1515 }
1516
1517 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1518 ChosenTailFoldingStyle = std::make_pair(
1519 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1520 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1521 return;
1522 }
1523
1524 // Set styles when forced.
1525 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1526 ForceTailFoldingStyle.getValue());
1528 return;
1529 // Override forced styles if needed.
1530 // FIXME: use actual opcode/data type for analysis here.
1531 // FIXME: Investigate opportunity for fixed vector factor.
1532 bool EVLIsLegal =
1533 IsScalableVF && UserIC <= 1 &&
1534 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1536 // FIXME: implement support for max safe dependency distance.
1538 // FIXME: remove this once reductions are supported.
1540 if (!EVLIsLegal) {
1541 // If for some reason EVL mode is unsupported, fall back to
1542 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1543 // in a generic way.
1544 ChosenTailFoldingStyle =
1547 LLVM_DEBUG(
1548 dbgs()
1549 << "LV: Preference for VP intrinsics indicated. Will "
1550 "not try to generate VP Intrinsics "
1551 << (UserIC > 1
1552 ? "since interleave count specified is greater than 1.\n"
1553 : "due to non-interleaving reasons.\n"));
1554 }
1555 }
1556
1557 /// Returns true if all loop blocks should be masked to fold tail loop.
1558 bool foldTailByMasking() const {
1559 // TODO: check if it is possible to check for None style independent of
1560 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1562 }
1563
1564 /// Returns true if the instructions in this block require predication
1565 /// for any reason, e.g. because tail folding now requires a predicate
1566 /// or because the block in the original loop was predicated.
1569 }
1570
1571 /// Returns true if VP intrinsics with explicit vector length support should
1572 /// be generated in the tail folded loop.
1573 bool foldTailWithEVL() const {
1575 // FIXME: remove this once vp_reverse is supported.
1576 none_of(
1577 WideningDecisions,
1578 [](const std::pair<std::pair<Instruction *, ElementCount>,
1579 std::pair<InstWidening, InstructionCost>>
1580 &Data) { return Data.second.first == CM_Widen_Reverse; });
1581 }
1582
1583 /// Returns true if the Phi is part of an in-loop reduction.
1584 bool isInLoopReduction(PHINode *Phi) const {
1585 return InLoopReductions.contains(Phi);
1586 }
1587
1588 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1589 /// with factor VF. Return the cost of the instruction, including
1590 /// scalarization overhead if it's needed.
1592
1593 /// Estimate cost of a call instruction CI if it were vectorized with factor
1594 /// VF. Return the cost of the instruction, including scalarization overhead
1595 /// if it's needed.
1597
1598 /// Invalidates decisions already taken by the cost model.
1600 WideningDecisions.clear();
1601 CallWideningDecisions.clear();
1602 Uniforms.clear();
1603 Scalars.clear();
1604 }
1605
1606 /// The vectorization cost is a combination of the cost itself and a boolean
1607 /// indicating whether any of the contributing operations will actually
1608 /// operate on vector values after type legalization in the backend. If this
1609 /// latter value is false, then all operations will be scalarized (i.e. no
1610 /// vectorization has actually taken place).
1611 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1612
1613 /// Returns the expected execution cost. The unit of the cost does
1614 /// not matter because we use the 'cost' units to compare different
1615 /// vector widths. The cost that is returned is *not* normalized by
1616 /// the factor width. If \p Invalid is not nullptr, this function
1617 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1618 /// each instruction that has an Invalid cost for the given VF.
1622
1623 bool hasPredStores() const { return NumPredStores > 0; }
1624
1625 /// Returns true if epilogue vectorization is considered profitable, and
1626 /// false otherwise.
1627 /// \p VF is the vectorization factor chosen for the original loop.
1629
1630private:
1631 unsigned NumPredStores = 0;
1632
1633 /// \return An upper bound for the vectorization factors for both
1634 /// fixed and scalable vectorization, where the minimum-known number of
1635 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1636 /// disabled or unsupported, then the scalable part will be equal to
1637 /// ElementCount::getScalable(0).
1638 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1639 ElementCount UserVF,
1640 bool FoldTailByMasking);
1641
1642 /// \return the maximized element count based on the target's vector
1643 /// registers and the loop trip-count, but limited to a maximum safe VF.
1644 /// This is a helper function of computeFeasibleMaxVF.
1645 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1646 unsigned SmallestType,
1647 unsigned WidestType,
1648 ElementCount MaxSafeVF,
1649 bool FoldTailByMasking);
1650
1651 /// \return the maximum legal scalable VF, based on the safe max number
1652 /// of elements.
1653 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1654
1655 /// Returns the execution time cost of an instruction for a given vector
1656 /// width. Vector width of one means scalar.
1657 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1658
1659 /// The cost-computation logic from getInstructionCost which provides
1660 /// the vector type as an output parameter.
1661 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1662 Type *&VectorTy);
1663
1664 /// Return the cost of instructions in an inloop reduction pattern, if I is
1665 /// part of that pattern.
1666 std::optional<InstructionCost>
1667 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1669
1670 /// Calculate vectorization cost of memory instruction \p I.
1671 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for scalarized memory instruction.
1674 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1675
1676 /// The cost computation for interleaving group of memory instructions.
1677 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1678
1679 /// The cost computation for Gather/Scatter instruction.
1680 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1681
1682 /// The cost computation for widening instruction \p I with consecutive
1683 /// memory access.
1684 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1685
1686 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1687 /// Load: scalar load + broadcast.
1688 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1689 /// element)
1690 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1691
1692 /// Estimate the overhead of scalarizing an instruction. This is a
1693 /// convenience wrapper for the type-based getScalarizationOverhead API.
1694 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1696
1697 /// Returns true if an artificially high cost for emulated masked memrefs
1698 /// should be used.
1699 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1700
1701 /// Map of scalar integer values to the smallest bitwidth they can be legally
1702 /// represented as. The vector equivalents of these values should be truncated
1703 /// to this type.
1705
1706 /// A type representing the costs for instructions if they were to be
1707 /// scalarized rather than vectorized. The entries are Instruction-Cost
1708 /// pairs.
1709 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1710
1711 /// A set containing all BasicBlocks that are known to be present after
1712 /// vectorization as predicated blocks.
1714 PredicatedBBsAfterVectorization;
1715
1716 /// Records whether it is allowed to have the original scalar loop execute at
1717 /// least once. This may be needed as a fallback loop in case runtime
1718 /// aliasing/dependence checks fail, or to handle the tail/remainder
1719 /// iterations when the trip count is unknown or is not a multiple of the VF,
1720 /// or as a peel-loop to handle gaps in interleave-groups.
1721 /// Under optsize and when the trip count is very small we don't allow any
1722 /// iterations to execute in the scalar loop.
1723 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1724
1725 /// Controls the finally chosen tail folding style. The first element is used
1726 /// if the IV update may overflow; the second if it does not.
1727 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1728 ChosenTailFoldingStyle;
1729
1730 /// A map holding scalar costs for different vectorization factors. The
1731 /// presence of a cost for an instruction in the mapping indicates that the
1732 /// instruction will be scalarized when vectorizing with the associated
1733 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1735
1736 /// Holds the instructions known to be uniform after vectorization.
1737 /// The data is collected per VF.
1739
1740 /// Holds the instructions known to be scalar after vectorization.
1741 /// The data is collected per VF.
1743
1744 /// Holds the instructions (address computations) that are forced to be
1745 /// scalarized.
1747
1748 /// PHINodes of the reductions that should be expanded in-loop.
1749 SmallPtrSet<PHINode *, 4> InLoopReductions;
1750
1751 /// A Map of inloop reduction operations and their immediate chain operand.
1752 /// FIXME: This can be removed once reductions can be costed correctly in
1753 /// VPlan. This was added to allow quick lookup of the inloop operations.
1754 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1755
1756 /// Returns the expected difference in cost from scalarizing the expression
1757 /// feeding a predicated instruction \p PredInst. The instructions to
1758 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1759 /// non-negative return value implies the expression will be scalarized.
1760 /// Currently, only single-use chains are considered for scalarization.
1761 InstructionCost computePredInstDiscount(Instruction *PredInst,
1762 ScalarCostsTy &ScalarCosts,
1763 ElementCount VF);
1764
1765 /// Collect the instructions that are uniform after vectorization. An
1766 /// instruction is uniform if we represent it with a single scalar value in
1767 /// the vectorized loop corresponding to each vector iteration. Examples of
1768 /// uniform instructions include pointer operands of consecutive or
1769 /// interleaved memory accesses. Note that although uniformity implies an
1770 /// instruction will be scalar, the reverse is not true. In general, a
1771 /// scalarized instruction will be represented by VF scalar values in the
1772 /// vectorized loop, each corresponding to an iteration of the original
1773 /// scalar loop.
1774 void collectLoopUniforms(ElementCount VF);
1775
1776 /// Collect the instructions that are scalar after vectorization. An
1777 /// instruction is scalar if it is known to be uniform or will be scalarized
1778 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1779 /// to the list if they are used by a load/store instruction that is marked as
1780 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1781 /// VF values in the vectorized loop, each corresponding to an iteration of
1782 /// the original scalar loop.
1783 void collectLoopScalars(ElementCount VF);
1784
1785 /// Keeps cost model vectorization decision and cost for instructions.
1786 /// Right now it is used for memory instructions only.
1788 std::pair<InstWidening, InstructionCost>>;
1789
1790 DecisionList WideningDecisions;
1791
1792 using CallDecisionList =
1793 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1794
1795 CallDecisionList CallWideningDecisions;
1796
1797 /// Returns true if \p V is expected to be vectorized and it needs to be
1798 /// extracted.
1799 bool needsExtract(Value *V, ElementCount VF) const {
1800 Instruction *I = dyn_cast<Instruction>(V);
1801 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1803 return false;
1804
1805 // Assume we can vectorize V (and hence we need extraction) if the
1806 // scalars are not computed yet. This can happen, because it is called
1807 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1808 // the scalars are collected. That should be a safe assumption in most
1809 // cases, because we check if the operands have vectorizable types
1810 // beforehand in LoopVectorizationLegality.
1811 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1812 };
1813
1814 /// Returns a range containing only operands needing to be extracted.
1815 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1816 ElementCount VF) const {
1818 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1819 }
1820
1821public:
1822 /// The loop that we evaluate.
1824
1825 /// Predicated scalar evolution analysis.
1827
1828 /// Loop Info analysis.
1830
1831 /// Vectorization legality.
1833
1834 /// Vector target information.
1836
1837 /// Target Library Info.
1839
1840 /// Demanded bits analysis.
1842
1843 /// Assumption cache.
1845
1846 /// Interface to emit optimization remarks.
1848
1850
1851 /// Loop Vectorize Hint.
1853
1854 /// The interleaved access information contains groups of interleaved accesses
1855 /// with the same stride that are close to each other.
1857
1858 /// Values to ignore in the cost model.
1860
1861 /// Values to ignore in the cost model when VF > 1.
1863
1864 /// All element types found in the loop.
1866};
1867} // end namespace llvm
1868
1869namespace {
1870/// Helper struct to manage generating runtime checks for vectorization.
1871///
1872 /// The runtime checks are created up-front in temporary blocks to allow better
1873 /// estimation of their cost, and are un-linked from the existing IR. After
1874 /// deciding to vectorize, the checks are moved back. If deciding not to
1875 /// vectorize, the temporary blocks are completely removed.
1876class GeneratedRTChecks {
1877 /// Basic block which contains the generated SCEV checks, if any.
1878 BasicBlock *SCEVCheckBlock = nullptr;
1879
1880 /// The value representing the result of the generated SCEV checks. If it is
1881 /// nullptr, either no SCEV checks have been generated or they have been used.
1882 Value *SCEVCheckCond = nullptr;
1883
1884 /// Basic block which contains the generated memory runtime checks, if any.
1885 BasicBlock *MemCheckBlock = nullptr;
1886
1887 /// The value representing the result of the generated memory runtime checks.
1888 /// If it is nullptr, either no memory runtime checks have been generated or
1889 /// they have been used.
1890 Value *MemRuntimeCheckCond = nullptr;
1891
1892 DominatorTree *DT;
1893 LoopInfo *LI;
1895
1896 SCEVExpander SCEVExp;
1897 SCEVExpander MemCheckExp;
1898
1899 bool CostTooHigh = false;
1900 const bool AddBranchWeights;
1901
1902 Loop *OuterLoop = nullptr;
1903
1904public:
1905 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1907 bool AddBranchWeights)
1908 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1909 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1910
1911 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1912 /// accurately estimate the cost of the runtime checks. The blocks are
1913 /// un-linked from the IR and are added back during vector code generation. If
1914 /// there is no vector code generation, the check blocks are removed
1915 /// completely.
1916 void Create(Loop *L, const LoopAccessInfo &LAI,
1917 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1918
1919 // Hard cutoff to limit compile-time increase in case a very large number of
1920 // runtime checks needs to be generated.
1921 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1922 // profile info.
1923 CostTooHigh =
1925 if (CostTooHigh)
1926 return;
1927
1928 BasicBlock *LoopHeader = L->getHeader();
1929 BasicBlock *Preheader = L->getLoopPreheader();
1930
1931 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1932 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1933 // may be used by SCEVExpander. The blocks will be un-linked from their
1934 // predecessors and removed from LI & DT at the end of the function.
1935 if (!UnionPred.isAlwaysTrue()) {
1936 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1937 nullptr, "vector.scevcheck");
1938
1939 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1940 &UnionPred, SCEVCheckBlock->getTerminator());
1941 }
1942
1943 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1944 if (RtPtrChecking.Need) {
1945 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1946 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1947 "vector.memcheck");
1948
1949 auto DiffChecks = RtPtrChecking.getDiffChecks();
1950 if (DiffChecks) {
1951 Value *RuntimeVF = nullptr;
1952 MemRuntimeCheckCond = addDiffRuntimeChecks(
1953 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1954 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1955 if (!RuntimeVF)
1956 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1957 return RuntimeVF;
1958 },
1959 IC);
1960 } else {
1961 MemRuntimeCheckCond = addRuntimeChecks(
1962 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1964 }
1965 assert(MemRuntimeCheckCond &&
1966 "no RT checks generated although RtPtrChecking "
1967 "claimed checks are required");
1968 }
1969
1970 if (!MemCheckBlock && !SCEVCheckBlock)
1971 return;
1972
1973 // Unhook the temporary blocks with the checks and update various places
1974 // accordingly.
1975 if (SCEVCheckBlock)
1976 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1977 if (MemCheckBlock)
1978 MemCheckBlock->replaceAllUsesWith(Preheader);
1979
1980 if (SCEVCheckBlock) {
1981 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1982 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1983 Preheader->getTerminator()->eraseFromParent();
1984 }
1985 if (MemCheckBlock) {
1986 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1987 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1988 Preheader->getTerminator()->eraseFromParent();
1989 }
1990
1991 DT->changeImmediateDominator(LoopHeader, Preheader);
1992 if (MemCheckBlock) {
1993 DT->eraseNode(MemCheckBlock);
1994 LI->removeBlock(MemCheckBlock);
1995 }
1996 if (SCEVCheckBlock) {
1997 DT->eraseNode(SCEVCheckBlock);
1998 LI->removeBlock(SCEVCheckBlock);
1999 }
2000
2001 // The outer loop is used as part of the later cost calculations.
2002 OuterLoop = L->getParentLoop();
2003 }
2004
2005 InstructionCost getCost() {
2006 if (SCEVCheckBlock || MemCheckBlock)
2007 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2008
2009 if (CostTooHigh) {
2011 Cost.setInvalid();
2012 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2013 return Cost;
2014 }
2015
2016 InstructionCost RTCheckCost = 0;
2017 if (SCEVCheckBlock)
2018 for (Instruction &I : *SCEVCheckBlock) {
2019 if (SCEVCheckBlock->getTerminator() == &I)
2020 continue;
2023 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2024 RTCheckCost += C;
2025 }
2026 if (MemCheckBlock) {
2027 InstructionCost MemCheckCost = 0;
2028 for (Instruction &I : *MemCheckBlock) {
2029 if (MemCheckBlock->getTerminator() == &I)
2030 continue;
2033 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2034 MemCheckCost += C;
2035 }
2036
2037 // If the runtime memory checks are being created inside an outer loop,
2038 // we should find out if these checks are outer-loop invariant. If so,
2039 // the checks will likely be hoisted out and so the effective cost will be
2040 // reduced according to the outer loop trip count.
2041 if (OuterLoop) {
2042 ScalarEvolution *SE = MemCheckExp.getSE();
2043 // TODO: If profitable, we could refine this further by analysing every
2044 // individual memory check, since there could be a mixture of loop
2045 // variant and invariant checks that mean the final condition is
2046 // variant.
2047 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2048 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2049 // It seems reasonable to assume that we can reduce the effective
2050 // cost of the checks even when we know nothing about the trip
2051 // count. Assume that the outer loop executes at least twice.
2052 unsigned BestTripCount = 2;
2053
2054 // If the exact trip count is known, use that.
2055 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2056 BestTripCount = SmallTC;
2058 // Else use profile data if available.
2059 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2060 BestTripCount = *EstimatedTC;
2061 }
2062
2063 BestTripCount = std::max(BestTripCount, 1U);
2064 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2065
2066 // Let's ensure the cost is always at least 1.
2067 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2069
2070 if (BestTripCount > 1)
2072 << "We expect runtime memory checks to be hoisted "
2073 << "out of the outer loop. Cost reduced from "
2074 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2075
2076 MemCheckCost = NewMemCheckCost;
2077 }
2078 }
2079
2080 RTCheckCost += MemCheckCost;
2081 }
2082
2083 if (SCEVCheckBlock || MemCheckBlock)
2084 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2085 << "\n");
2086
2087 return RTCheckCost;
2088 }
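// Worked example for the hoisting adjustment above (values illustrative): if
// the memory checks cost 24 and the enclosing outer loop has an estimated trip
// count of 8, the cost charged against vectorization is 24 / 8 = 3, on the
// assumption that loop-invariant checks are hoisted and amortized over the
// outer-loop iterations; the result is clamped to at least 1.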
2089
2090 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2091 /// unused.
2092 ~GeneratedRTChecks() {
2093 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2094 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2095 if (!SCEVCheckCond)
2096 SCEVCleaner.markResultUsed();
2097
2098 if (!MemRuntimeCheckCond)
2099 MemCheckCleaner.markResultUsed();
2100
2101 if (MemRuntimeCheckCond) {
2102 auto &SE = *MemCheckExp.getSE();
2103 // Memory runtime check generation creates compares that use expanded
2104 // values. Remove them before running the SCEVExpanderCleaners.
2105 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2106 if (MemCheckExp.isInsertedInstruction(&I))
2107 continue;
2108 SE.forgetValue(&I);
2109 I.eraseFromParent();
2110 }
2111 }
2112 MemCheckCleaner.cleanup();
2113 SCEVCleaner.cleanup();
2114
2115 if (SCEVCheckCond)
2116 SCEVCheckBlock->eraseFromParent();
2117 if (MemRuntimeCheckCond)
2118 MemCheckBlock->eraseFromParent();
2119 }
2120
2121 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2122 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2123 /// depending on the generated condition.
2124 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2125 BasicBlock *LoopVectorPreHeader,
2126 BasicBlock *LoopExitBlock) {
2127 if (!SCEVCheckCond)
2128 return nullptr;
2129
2130 Value *Cond = SCEVCheckCond;
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 SCEVCheckCond = nullptr;
2133 if (auto *C = dyn_cast<ConstantInt>(Cond))
2134 if (C->isZero())
2135 return nullptr;
2136
2137 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2138
2139 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2140 // Create new preheader for vector loop.
2141 if (OuterLoop)
2142 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2143
2144 SCEVCheckBlock->getTerminator()->eraseFromParent();
2145 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2146 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2147 SCEVCheckBlock);
2148
2149 DT->addNewBlock(SCEVCheckBlock, Pred);
2150 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2151
2152 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2153 if (AddBranchWeights)
2155 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2156 return SCEVCheckBlock;
2157 }
2158
2159 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2160 /// the branches to branch to the vector preheader or \p Bypass, depending on
2161 /// the generated condition.
2162 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2163 BasicBlock *LoopVectorPreHeader) {
2164 // Check if we generated code that checks at runtime whether arrays overlap.
2165 if (!MemRuntimeCheckCond)
2166 return nullptr;
2167
2168 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2169 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2170 MemCheckBlock);
2171
2172 DT->addNewBlock(MemCheckBlock, Pred);
2173 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2174 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2175
2176 if (OuterLoop)
2177 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2178
2179 BranchInst &BI =
2180 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2181 if (AddBranchWeights) {
2183 }
2184 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2185 MemCheckBlock->getTerminator()->setDebugLoc(
2186 Pred->getTerminator()->getDebugLoc());
2187
2188 // Mark the check as used, to prevent it from being removed during cleanup.
2189 MemRuntimeCheckCond = nullptr;
2190 return MemCheckBlock;
2191 }
2192};
2193} // namespace
2194
2196 return Style == TailFoldingStyle::Data ||
2197 Style == TailFoldingStyle::DataAndControlFlow ||
2198 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2199}
2200
2202 return Style == TailFoldingStyle::DataAndControlFlow ||
2203 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2204}
2205
2206// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2207// vectorization. The loop needs to be annotated with #pragma omp simd
2208// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2209// vector length information is not provided, vectorization is not considered
2210// explicit. Interleave hints are not allowed either. These limitations will be
2211// relaxed in the future.
2212 // Please note that we are currently forced to abuse the pragma 'clang
2213// vectorize' semantics. This pragma provides *auto-vectorization hints*
2214// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2215// provides *explicit vectorization hints* (LV can bypass legal checks and
2216// assume that vectorization is legal). However, both hints are implemented
2217// using the same metadata (llvm.loop.vectorize, processed by
2218// LoopVectorizeHints). This will be fixed in the future when the native IR
2219// representation for pragma 'omp simd' is introduced.
2220static bool isExplicitVecOuterLoop(Loop *OuterLp,
2222 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2223 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2224
2225 // Only outer loops with an explicit vectorization hint are supported.
2226 // Unannotated outer loops are ignored.
2228 return false;
2229
2230 Function *Fn = OuterLp->getHeader()->getParent();
2231 if (!Hints.allowVectorization(Fn, OuterLp,
2232 true /*VectorizeOnlyWhenForced*/)) {
2233 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2234 return false;
2235 }
2236
2237 if (Hints.getInterleave() > 1) {
2238 // TODO: Interleave support is future work.
2239 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2240 "outer loops.\n");
2241 Hints.emitRemarkWithHints();
2242 return false;
2243 }
2244
2245 return true;
2246}
2247
2251 // Collect inner loops and outer loops without irreducible control flow. For
2252 // now, only collect outer loops that have explicit vectorization hints. If we
2253 // are stress testing the VPlan H-CFG construction, we collect the outermost
2254 // loop of every loop nest.
2255 if (L.isInnermost() || VPlanBuildStressTest ||
2257 LoopBlocksRPO RPOT(&L);
2258 RPOT.perform(LI);
2259 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2260 V.push_back(&L);
2261 // TODO: Collect inner loops inside marked outer loops in case
2262 // vectorization fails for the outer loop. Do not invoke
2263 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2264 // already known to be reducible. We can use an inherited attribute for
2265 // that.
2266 return;
2267 }
2268 }
2269 for (Loop *InnerL : L)
2270 collectSupportedLoops(*InnerL, LI, ORE, V);
2271}
2272
2273//===----------------------------------------------------------------------===//
2274// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2275// LoopVectorizationCostModel and LoopVectorizationPlanner.
2276//===----------------------------------------------------------------------===//
2277
2278/// Compute the transformed value of Index at offset StartValue using step
2279/// StepValue.
2280/// For integer induction, returns StartValue + Index * StepValue.
2281/// For pointer induction, returns StartValue[Index * StepValue].
2282/// FIXME: The newly created binary instructions should contain nsw/nuw
2283/// flags, which can be found from the original scalar operations.
2284static Value *
2286 Value *Step,
2288 const BinaryOperator *InductionBinOp) {
2289 Type *StepTy = Step->getType();
2290 Value *CastedIndex = StepTy->isIntegerTy()
2291 ? B.CreateSExtOrTrunc(Index, StepTy)
2292 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2293 if (CastedIndex != Index) {
2294 CastedIndex->setName(CastedIndex->getName() + ".cast");
2295 Index = CastedIndex;
2296 }
2297
2298 // Note: the IR at this point is broken. We cannot use SE to create any new
2299 // SCEV and then expand it, hoping that SCEV's simplification will give us
2300 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2301 // lead to various SCEV crashes. So all we can do is use the builder and rely
2302 // on InstCombine for future simplifications. Here we handle only some
2303 // trivial cases.
2304 auto CreateAdd = [&B](Value *X, Value *Y) {
2305 assert(X->getType() == Y->getType() && "Types don't match!");
2306 if (auto *CX = dyn_cast<ConstantInt>(X))
2307 if (CX->isZero())
2308 return Y;
2309 if (auto *CY = dyn_cast<ConstantInt>(Y))
2310 if (CY->isZero())
2311 return X;
2312 return B.CreateAdd(X, Y);
2313 };
2314
2315 // We allow X to be a vector type, in which case Y will potentially be
2316 // splatted into a vector with the same element count.
2317 auto CreateMul = [&B](Value *X, Value *Y) {
2318 assert(X->getType()->getScalarType() == Y->getType() &&
2319 "Types don't match!");
2320 if (auto *CX = dyn_cast<ConstantInt>(X))
2321 if (CX->isOne())
2322 return Y;
2323 if (auto *CY = dyn_cast<ConstantInt>(Y))
2324 if (CY->isOne())
2325 return X;
2326 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2327 if (XVTy && !isa<VectorType>(Y->getType()))
2328 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2329 return B.CreateMul(X, Y);
2330 };
2331
2332 switch (InductionKind) {
2334 assert(!isa<VectorType>(Index->getType()) &&
2335 "Vector indices not supported for integer inductions yet");
2336 assert(Index->getType() == StartValue->getType() &&
2337 "Index type does not match StartValue type");
2338 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2339 return B.CreateSub(StartValue, Index);
2340 auto *Offset = CreateMul(Index, Step);
2341 return CreateAdd(StartValue, Offset);
2342 }
2344 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2346 assert(!isa<VectorType>(Index->getType()) &&
2347 "Vector indices not supported for FP inductions yet");
2348 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2349 assert(InductionBinOp &&
2350 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2351 InductionBinOp->getOpcode() == Instruction::FSub) &&
2352 "Original bin op should be defined for FP induction");
2353
2354 Value *MulExp = B.CreateFMul(Step, Index);
2355 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2356 "induction");
2357 }
2359 return nullptr;
2360 }
2361 llvm_unreachable("invalid enum");
2362}
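// Illustrative examples of the transformation above (all values assumed):
//   Integer induction: StartValue = 5, Step = 3, Index = 4
//     => 5 + 4 * 3 = 17 (the add/mul helpers fold the zero/one cases).
//   Pointer induction: StartValue = %base, Step = 8, Index = %i
//     => ptradd %base, (%i * 8)
//   FP induction (FAdd): StartValue = 1.0, Step = 0.5, Index = 4.0
//     => 1.0 fadd (0.5 fmul 4.0) = 3.0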
2363
2364std::optional<unsigned> getMaxVScale(const Function &F,
2365 const TargetTransformInfo &TTI) {
2366 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2367 return MaxVScale;
2368
2369 if (F.hasFnAttribute(Attribute::VScaleRange))
2370 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2371
2372 return std::nullopt;
2373}
2374
2375 /// For the given VF, UF, and maximum trip count computed for the loop, return
2376 /// whether the induction variable might overflow in the vectorized loop. If not,
2377 /// then we know a runtime overflow check always evaluates to false and can be
2378 /// removed.
2381 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2382 // Always be conservative if we don't know the exact unroll factor.
2383 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2384
2385 Type *IdxTy = Cost->Legal->getWidestInductionType();
2386 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2387
2388 // We know the runtime overflow check is known false iff the (max) trip-count
2389 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2390 // the vector loop induction variable.
2391 if (unsigned TC =
2392 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2393 uint64_t MaxVF = VF.getKnownMinValue();
2394 if (VF.isScalable()) {
2395 std::optional<unsigned> MaxVScale =
2396 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2397 if (!MaxVScale)
2398 return false;
2399 MaxVF *= *MaxVScale;
2400 }
2401
2402 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2403 }
2404
2405 return false;
2406}
2407
2408// Return whether we allow using masked interleave-groups (for dealing with
2409// strided loads/stores that reside in predicated blocks, or for dealing
2410// with gaps).
2412 // If an override option has been passed in for interleaved accesses, use it.
2413 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2415
2417}
2418
2419// Try to vectorize the interleave group that \p Instr belongs to.
2420//
2421// E.g. Translate following interleaved load group (factor = 3):
2422// for (i = 0; i < N; i+=3) {
2423// R = Pic[i]; // Member of index 0
2424// G = Pic[i+1]; // Member of index 1
2425// B = Pic[i+2]; // Member of index 2
2426// ... // do something to R, G, B
2427// }
2428// To:
2429// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2430// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2431// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2432// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2433//
2434// Or translate following interleaved store group (factor = 3):
2435// for (i = 0; i < N; i+=3) {
2436// ... do something to R, G, B
2437// Pic[i] = R; // Member of index 0
2438// Pic[i+1] = G; // Member of index 1
2439// Pic[i+2] = B; // Member of index 2
2440// }
2441// To:
2442// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2443// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2444// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2445// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2446// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
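// For the masked case (an illustrative sketch, factor = 3, VF = 4, values
// assumed): a per-lane block mask <m0, m1, m2, m3> is widened with a
// replicated shuffle mask so that all members of a tuple share their lane's
// predicate:
//   %interleaved.mask = shuffle %blockmask, poison,
//                               <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>
// If the group also has gaps, this mask is further ANDed with a mask that
// disables the missing members.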
2449 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2450 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2451 Instruction *Instr = Group->getInsertPos();
2452 const DataLayout &DL = Instr->getModule()->getDataLayout();
2453
2454 // Prepare for the vector type of the interleaved load/store.
2455 Type *ScalarTy = getLoadStoreType(Instr);
2456 unsigned InterleaveFactor = Group->getFactor();
2457 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2458
2459 // Prepare for the new pointers.
2460 SmallVector<Value *, 2> AddrParts;
2461 unsigned Index = Group->getIndex(Instr);
2462
2463 // TODO: extend the masked interleaved-group support to reversed access.
2464 assert((!BlockInMask || !Group->isReverse()) &&
2465 "Reversed masked interleave-group not supported.");
2466
2467 Value *Idx;
2468 // If the group is reverse, adjust the index to refer to the last vector lane
2469 // instead of the first. We adjust the index from the first vector lane,
2470 // rather than directly getting the pointer for lane VF - 1, because the
2471 // pointer operand of the interleaved access is supposed to be uniform. For
2472 // uniform instructions, we're only required to generate a value for the
2473 // first vector lane in each unroll iteration.
2474 if (Group->isReverse()) {
2475 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2476 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2480 } else
2482
2483 for (unsigned Part = 0; Part < UF; Part++) {
2484 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2485 if (auto *I = dyn_cast<Instruction>(AddrPart))
2486 State.setDebugLocFrom(I->getDebugLoc());
2487
2488 // Note that the current instruction could be at any member index. We need to
2489 // adjust the address to the member of index 0.
2490 //
2491 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2492 // b = A[i]; // Member of index 0
2493 // The current pointer points to A[i+1]; adjust it to A[i].
2494 //
2495 // E.g. A[i+1] = a; // Member of index 1
2496 // A[i] = b; // Member of index 0
2497 // A[i+2] = c; // Member of index 2 (Current instruction)
2498 // The current pointer points to A[i+2]; adjust it to A[i].
2499
2500 bool InBounds = false;
2501 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2502 InBounds = gep->isInBounds();
2503 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2504 AddrParts.push_back(AddrPart);
2505 }
2506
2507 State.setDebugLocFrom(Instr->getDebugLoc());
2508 Value *PoisonVec = PoisonValue::get(VecTy);
2509
2510 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2511 unsigned Part, Value *MaskForGaps) -> Value * {
2512 if (VF.isScalable()) {
2513 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2514 assert(InterleaveFactor == 2 &&
2515 "Unsupported deinterleave factor for scalable vectors");
2516 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2517 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2518 auto *MaskTy =
2520 return Builder.CreateIntrinsic(
2521 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2522 /*FMFSource=*/nullptr, "interleaved.mask");
2523 }
2524
2525 if (!BlockInMask)
2526 return MaskForGaps;
2527
2528 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2529 Value *ShuffledMask = Builder.CreateShuffleVector(
2530 BlockInMaskPart,
2531 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2532 "interleaved.mask");
2533 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2534 MaskForGaps)
2535 : ShuffledMask;
2536 };
2537
2538 // Vectorize the interleaved load group.
2539 if (isa<LoadInst>(Instr)) {
2540 Value *MaskForGaps = nullptr;
2541 if (NeedsMaskForGaps) {
2542 MaskForGaps =
2544 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2545 }
2546
2547 // For each unroll part, create a wide load for the group.
2548 SmallVector<Value *, 2> NewLoads;
2549 for (unsigned Part = 0; Part < UF; Part++) {
2550 Instruction *NewLoad;
2551 if (BlockInMask || MaskForGaps) {
2553 "masked interleaved groups are not allowed.");
2554 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2555 NewLoad =
2556 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2557 GroupMask, PoisonVec, "wide.masked.vec");
2558 }
2559 else
2560 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2561 Group->getAlign(), "wide.vec");
2562 Group->addMetadata(NewLoad);
2563 NewLoads.push_back(NewLoad);
2564 }
2565
2566 if (VecTy->isScalableTy()) {
2567 assert(InterleaveFactor == 2 &&
2568 "Unsupported deinterleave factor for scalable vectors");
2569
2570 for (unsigned Part = 0; Part < UF; ++Part) {
2571 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2572 // so must use intrinsics to deinterleave.
2574 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2575 /*FMFSource=*/nullptr, "strided.vec");
2576 unsigned J = 0;
2577 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2578 Instruction *Member = Group->getMember(I);
2579
2580 if (!Member)
2581 continue;
2582
2583 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2584 // If this member has a different type, cast the result to that type.
2585 if (Member->getType() != ScalarTy) {
2586 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2587 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2588 }
2589
2590 if (Group->isReverse())
2591 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2592
2593 State.set(VPDefs[J], StridedVec, Part);
2594 ++J;
2595 }
2596 }
2597
2598 return;
2599 }
2600
2601 // For each member in the group, shuffle out the appropriate data from the
2602 // wide loads.
2603 unsigned J = 0;
2604 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2605 Instruction *Member = Group->getMember(I);
2606
2607 // Skip the gaps in the group.
2608 if (!Member)
2609 continue;
2610
2611 auto StrideMask =
2612 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2613 for (unsigned Part = 0; Part < UF; Part++) {
2614 Value *StridedVec = Builder.CreateShuffleVector(
2615 NewLoads[Part], StrideMask, "strided.vec");
2616
2617 // If this member has a different type, cast the result to that type.
2618 if (Member->getType() != ScalarTy) {
2619 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2620 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2621 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2622 }
2623
2624 if (Group->isReverse())
2625 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2626
2627 State.set(VPDefs[J], StridedVec, Part);
2628 }
2629 ++J;
2630 }
2631 return;
2632 }
2633
2634 // The sub-vector type for the current instruction.
2635 auto *SubVT = VectorType::get(ScalarTy, VF);
2636
2637 // Vectorize the interleaved store group.
2638 Value *MaskForGaps =
2640 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2641 "masked interleaved groups are not allowed.");
2642 assert((!MaskForGaps || !VF.isScalable()) &&
2643 "masking gaps for scalable vectors is not yet supported.");
2644 for (unsigned Part = 0; Part < UF; Part++) {
2645 // Collect the stored vector from each member.
2646 SmallVector<Value *, 4> StoredVecs;
2647 unsigned StoredIdx = 0;
2648 for (unsigned i = 0; i < InterleaveFactor; i++) {
2649 assert((Group->getMember(i) || MaskForGaps) &&
2650 "Fail to get a member from an interleaved store group");
2651 Instruction *Member = Group->getMember(i);
2652
2653 // Skip the gaps in the group.
2654 if (!Member) {
2655 Value *Undef = PoisonValue::get(SubVT);
2656 StoredVecs.push_back(Undef);
2657 continue;
2658 }
2659
2660 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2661 ++StoredIdx;
2662
2663 if (Group->isReverse())
2664 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2665
2666 // If this member has a different type, cast it to the unified type.
2667
2668 if (StoredVec->getType() != SubVT)
2669 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2670
2671 StoredVecs.push_back(StoredVec);
2672 }
2673
2674 // Interleave all the smaller vectors into one wider vector.
2675 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2676 Instruction *NewStoreInstr;
2677 if (BlockInMask || MaskForGaps) {
2678 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2679 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2680 Group->getAlign(), GroupMask);
2681 } else
2682 NewStoreInstr =
2683 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2684
2685 Group->addMetadata(NewStoreInstr);
2686 }
2687}
2688
2690 VPReplicateRecipe *RepRecipe,
2691 const VPIteration &Instance,
2692 VPTransformState &State) {
2693 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2694
2695 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2696 // the first lane and part.
2697 if (isa<NoAliasScopeDeclInst>(Instr))
2698 if (!Instance.isFirstIteration())
2699 return;
2700
2701 // Does this instruction return a value?
2702 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2703
2704 Instruction *Cloned = Instr->clone();
2705 if (!IsVoidRetTy) {
2706 Cloned->setName(Instr->getName() + ".cloned");
2707#if !defined(NDEBUG)
2708 // Verify that VPlan type inference results agree with the type of the
2709 // generated values.
2710 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2711 "inferred type and type from generated instructions do not match");
2712#endif
2713 }
2714
2715 RepRecipe->setFlags(Cloned);
2716
2717 if (auto DL = Instr->getDebugLoc())
2718 State.setDebugLocFrom(DL);
2719
2720 // Replace the operands of the cloned instructions with their scalar
2721 // equivalents in the new loop.
2722 for (const auto &I : enumerate(RepRecipe->operands())) {
2723 auto InputInstance = Instance;
2724 VPValue *Operand = I.value();
2726 InputInstance.Lane = VPLane::getFirstLane();
2727 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2728 }
2729 State.addNewMetadata(Cloned, Instr);
2730
2731 // Place the cloned scalar in the new loop.
2732 State.Builder.Insert(Cloned);
2733
2734 State.set(RepRecipe, Cloned, Instance);
2735
2736 // If we just cloned a new assumption, add it the assumption cache.
2737 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2739
2740 // End if-block.
2741 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2742 if (IfPredicateInstr)
2743 PredicatedInstructions.push_back(Cloned);
2744}
2745
2746Value *
2748 if (VectorTripCount)
2749 return VectorTripCount;
2750
2751 Value *TC = getTripCount();
2752 IRBuilder<> Builder(InsertBlock->getTerminator());
2753
2754 Type *Ty = TC->getType();
2755 // This is where we can make the step a runtime constant.
2756 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2757
2758 // If the tail is to be folded by masking, round the number of iterations N
2759 // up to a multiple of Step instead of rounding down. This is done by first
2760 // adding Step-1 and then rounding down. Note that it's ok if this addition
2761 // overflows: the vector induction variable will eventually wrap to zero given
2762 // that it starts at zero and its Step is a power of two; the loop will then
2763 // exit, with the last early-exit vector comparison also producing all-true.
2764 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2765 // is accounted for in emitIterationCountCheck that adds an overflow check.
2766 if (Cost->foldTailByMasking()) {
2768 "VF*UF must be a power of 2 when folding tail by masking");
2769 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2770 TC = Builder.CreateAdd(
2771 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2772 }
2773
2774 // Now we need to generate the expression for the part of the loop that the
2775 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2776 // iterations are not required for correctness, or N - Step, otherwise. Step
2777 // is equal to the vectorization factor (number of SIMD elements) times the
2778 // unroll factor (number of SIMD instructions).
2779 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2780
2781 // There are cases where we *must* run at least one iteration in the remainder
2782 // loop. See the cost model for when this can happen. If the step evenly
2783 // divides the trip count, we set the remainder to be equal to the step. If
2784 // the step does not evenly divide the trip count, no adjustment is necessary
2785 // since there will already be scalar iterations. Note that the minimum
2786 // iterations check ensures that N >= Step.
2787 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2788 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2789 R = Builder.CreateSelect(IsZero, Step, R);
2790 }
2791
2792 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2793
2794 return VectorTripCount;
2795}
2796
2798 const DataLayout &DL) {
2799 // Verify that V is a vector type with same number of elements as DstVTy.
2800 auto *DstFVTy = cast<VectorType>(DstVTy);
2801 auto VF = DstFVTy->getElementCount();
2802 auto *SrcVecTy = cast<VectorType>(V->getType());
2803 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2804 Type *SrcElemTy = SrcVecTy->getElementType();
2805 Type *DstElemTy = DstFVTy->getElementType();
2806 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2807 "Vector elements must have same size");
2808
2809 // Do a direct cast if element types are castable.
2810 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2811 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2812 }
2813 // V cannot be directly cast to the desired vector type. This may happen when
2814 // V is a floating point vector but DstVTy is a vector of pointers, or
2815 // vice-versa. Handle this with a two-step cast through an intermediate
2816 // integer type, i.e. Ptr <-> Int <-> Float.
2817 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2818 "Only one type should be a pointer type");
2819 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2820 "Only one type should be a floating point type");
2821 Type *IntTy =
2822 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2823 auto *VecIntTy = VectorType::get(IntTy, VF);
2824 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2825 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2826}
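// E.g. (assuming 64-bit pointers): casting V : <2 x double> to DstVTy :
// <2 x ptr> cannot be done with a single cast, so the value goes through
// <2 x i64>: bitcast <2 x double> to <2 x i64>, then inttoptr <2 x i64> to
// <2 x ptr>. Element counts and element sizes must already match.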
2827
2829 Value *Count = getTripCount();
2830 // Reuse existing vector loop preheader for TC checks.
2831 // Note that a new preheader block is generated for the vector loop.
2832 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2833 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2834
2835 // Generate code to check if the loop's trip count is less than VF * UF, or
2836 // equal to it in case a scalar epilogue is required; this implies that the
2837 // vector trip count is zero. This check also covers the case where adding one
2838 // to the backedge-taken count overflowed, leading to an incorrect trip count
2839 // of zero. In this case we will also jump to the scalar loop.
2840 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2842
2843 // If tail is to be folded, vector loop takes care of all iterations.
2844 Type *CountTy = Count->getType();
2845 Value *CheckMinIters = Builder.getFalse();
2846 auto CreateStep = [&]() -> Value * {
2847 // Create step with max(MinProTripCount, UF * VF).
2849 return createStepForVF(Builder, CountTy, VF, UF);
2850
2851 Value *MinProfTC =
2853 if (!VF.isScalable())
2854 return MinProfTC;
2856 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2857 };
2858
2859 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2860 if (Style == TailFoldingStyle::None)
2861 CheckMinIters =
2862 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2863 else if (VF.isScalable() &&
2866 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2867 // an overflow to zero when updating induction variables and so an
2868 // additional overflow check is required before entering the vector loop.
2869
2870 // Get the maximum unsigned value for the type.
2871 Value *MaxUIntTripCount =
2872 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2873 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2874
2875 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2876 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2877 }
2878
2879 // Create new preheader for vector loop.
2881 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2882 "vector.ph");
2883
2884 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2885 DT->getNode(Bypass)->getIDom()) &&
2886 "TC check is expected to dominate Bypass");
2887
2888 // Update dominator for Bypass & LoopExit (if needed).
2889 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2890 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2891 // If there is an epilogue which must run, there's no edge from the
2892 // middle block to exit blocks and thus no need to update the immediate
2893 // dominator of the exit blocks.
2895
2896 BranchInst &BI =
2897 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2900 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2901 LoopBypassBlocks.push_back(TCCheckBlock);
2902}
2903
2905 BasicBlock *const SCEVCheckBlock =
2906 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2907 if (!SCEVCheckBlock)
2908 return nullptr;
2909
2910 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2912 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2913 "Cannot SCEV check stride or overflow when optimizing for size");
2914
2915
2916 // Update dominator only if this is first RT check.
2917 if (LoopBypassBlocks.empty()) {
2918 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2919 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2920 // If there is an epilogue which must run, there's no edge from the
2921 // middle block to exit blocks and thus no need to update the immediate
2922 // dominator of the exit blocks.
2923 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2924 }
2925
2926 LoopBypassBlocks.push_back(SCEVCheckBlock);
2927 AddedSafetyChecks = true;
2928 return SCEVCheckBlock;
2929}
2930
2932 // VPlan-native path does not do any analysis for runtime checks currently.
2934 return nullptr;
2935
2936 BasicBlock *const MemCheckBlock =
2937 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2938
2939 // Check if we generated code that checks at runtime whether arrays overlap. We
2940 // put the checks into a separate block to make the more common case of few
2941 // elements faster.
2942 if (!MemCheckBlock)
2943 return nullptr;
2944
2945 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2946 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2947 "Cannot emit memory checks when optimizing for size, unless forced "
2948 "to vectorize.");
2949 ORE->emit([&]() {
2950 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2953 << "Code-size may be reduced by not forcing "
2954 "vectorization, or by source-code modifications "
2955 "eliminating the need for runtime checks "
2956 "(e.g., adding 'restrict').";
2957 });
2958 }
2959
2960 LoopBypassBlocks.push_back(MemCheckBlock);
2961
2962 AddedSafetyChecks = true;
2963
2964 return MemCheckBlock;
2965}
2966
2970 assert(LoopVectorPreHeader && "Invalid loop structure");
2971 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2972 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2973 "multiple exit loop without required epilogue?");
2974
2977 LI, nullptr, Twine(Prefix) + "middle.block");
2980 nullptr, Twine(Prefix) + "scalar.ph");
2981
2982 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2983
2984 // Set up the middle block terminator. Two cases:
2985 // 1) If we know that we must execute the scalar epilogue, emit an
2986 // unconditional branch.
2987 // 2) Otherwise, we must have a single unique exit block (due to how we
2988 // implement the multiple exit case). In this case, set up a conditional
2989 // branch from the middle block to the loop scalar preheader, and the
2990 // exit block. completeLoopSkeleton will update the condition to use an
2991 // iteration check, if required to decide whether to execute the remainder.
2992 BranchInst *BrInst =
2993 Cost->requiresScalarEpilogue(VF.isVector())
2996 Builder.getTrue());
2997 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2999
3000 // Update dominator for loop exit. During skeleton creation, only the vector
3001 // pre-header and the middle block are created. The vector loop is entirely
3002 // created during VPlan execution.
3003 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3004 // If there is an epilogue which must run, there's no edge from the
3005 // middle block to exit blocks and thus no need to update the immediate
3006 // dominator of the exit blocks.
3008}
3009
3011 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3012 ArrayRef<BasicBlock *> BypassBlocks,
3013 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3015 assert(VectorTripCount && "Expected valid arguments");
3016
3017 Instruction *OldInduction = Legal->getPrimaryInduction();
3018 Value *&EndValue = IVEndValues[OrigPhi];
3019 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3020 if (OrigPhi == OldInduction) {
3021 // We know what the end value is.
3022 EndValue = VectorTripCount;
3023 } else {
3025
3026 // Fast-math-flags propagate from the original induction instruction.
3027 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3028 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3029
3031 Step, II.getKind(), II.getInductionBinOp());
3032 EndValue->setName("ind.end");
3033
3034 // Compute the end value for the additional bypass (if applicable).
3035 if (AdditionalBypass.first) {
3036 B.SetInsertPoint(AdditionalBypass.first,
3037 AdditionalBypass.first->getFirstInsertionPt());
3038 EndValueFromAdditionalBypass =
3039 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3040 Step, II.getKind(), II.getInductionBinOp());
3041 EndValueFromAdditionalBypass->setName("ind.end");
3042 }
3043 }
3044
3045 // Create phi nodes to merge from the backedge-taken check block.
3046 PHINode *BCResumeVal =
3047 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3049 // Copy original phi DL over to the new one.
3050 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3051
3052 // The new PHI merges the original incoming value, in case of a bypass,
3053 // or the value at the end of the vectorized loop.
3054 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3055
3056 // Fix the scalar body counter (PHI node).
3057 // The old induction's phi node in the scalar body needs the truncated
3058 // value.
3059 for (BasicBlock *BB : BypassBlocks)
3060 BCResumeVal->addIncoming(II.getStartValue(), BB);
3061
3062 if (AdditionalBypass.first)
3063 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3064 EndValueFromAdditionalBypass);
3065 return BCResumeVal;
3066}
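// E.g. for the primary induction starting at 0 (names illustrative), the
// resulting merge phi in the scalar preheader looks like
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ 0, %vector.scevcheck ], [ 0, %vector.memcheck ]
// taking the vector-loop end value when arriving from the middle block and the
// original start value when arriving from any bypass block.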
3067
3068/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3069/// expansion results.
3071 const SCEV2ValueTy &ExpandedSCEVs) {
3072 const SCEV *Step = ID.getStep();
3073 if (auto *C = dyn_cast<SCEVConstant>(Step))
3074 return C->getValue();
3075 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3076 return U->getValue();
3077 auto I = ExpandedSCEVs.find(Step);
3078 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3079 return I->second;
3080}
3081
3083 const SCEV2ValueTy &ExpandedSCEVs,
3084 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3085 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3086 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3087 "Inconsistent information about additional bypass.");
3088 // We are going to resume the execution of the scalar loop.
3089 // Go over all of the induction variables that we found and fix the
3090 // PHIs that are left in the scalar version of the loop.
3091 // The starting values of PHI nodes depend on the counter of the last
3092 // iteration in the vectorized loop.
3093 // If we come from a bypass edge then we need to start from the original
3094 // start value.
3095 for (const auto &InductionEntry : Legal->getInductionVars()) {
3096 PHINode *OrigPhi = InductionEntry.first;
3097 const InductionDescriptor &II = InductionEntry.second;
3098 PHINode *BCResumeVal = createInductionResumeValue(
3099 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3100 AdditionalBypass);
3101 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3102 }
3103}
3104
3106 // The trip counts should be cached by now.
3107 Value *Count = getTripCount();
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Add a check in the middle block to see if we have completed
3113 // all of the iterations in the first vector loop. Three cases:
3114 // 1) If we require a scalar epilogue, there is no conditional branch as
3115 // we unconditionally branch to the scalar preheader. Do nothing.
3116 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3117 // Thus if tail is to be folded, we know we don't need to run the
3118 // remainder and we can use the previous value for the condition (true).
3119 // 3) Otherwise, construct a runtime check.
3120 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3121 !Cost->foldTailByMasking()) {
3122 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3123 // of the corresponding compare because they may have ended up with
3124 // different line numbers and we want to avoid awkward line stepping while
3125 // debugging. E.g., if the compare got a line number inside the loop.
3126 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3127 // operands. Perform simplification directly on VPlan once the branch is
3128 // modeled there.
3130 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3131 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3132 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3133 BI.setCondition(CmpN);
3134 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3135 // Assume that `Count % VectorTripCount` is equally distributed.
3136 unsigned TripCount = UF * VF.getKnownMinValue();
3137 assert(TripCount > 0 && "trip count should not be zero");
3138 const uint32_t Weights[] = {1, TripCount - 1};
3139 setBranchWeights(BI, Weights);
3140 }
3141 }
3142
3143#ifdef EXPENSIVE_CHECKS
3144 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3145#endif
3146
3147 return LoopVectorPreHeader;
3148}
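// Illustrative sketch (not part of LoopVectorize.cpp): how the middle-block
// check and the branch weights set above behave for concrete values, assuming
// a fixed VF. For N = 103, VF = 8, UF = 1 the vector trip count is
// N - N % (VF * UF) = 96, so CmpN ("cmp.n") is false and the scalar remainder
// runs; the weights {1, VF * UF - 1} = {1, 7} model the assumption that the
// residue N % (VF * UF) is non-zero in roughly 7 out of every 8 cases.
#include <cstdint>
static bool remainderLoopNeeded(uint64_t N, unsigned VF, unsigned UF) {
  uint64_t VectorTripCount = N - N % (uint64_t(VF) * UF);
  return VectorTripCount != N; // mirrors the CmpN == false case above
}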
3149
3150std::pair<BasicBlock *, Value *>
3152 const SCEV2ValueTy &ExpandedSCEVs) {
3153 /*
3154 In this function we generate a new loop. The new loop will contain
3155 the vectorized instructions while the old loop will continue to run the
3156 scalar remainder.
3157
3158 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3159 / | preheader are expanded here. Eventually all required SCEV
3160 / | expansion should happen here.
3161 / v
3162 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3163 | / |
3164 | / v
3165 || [ ] <-- vector pre header.
3166 |/ |
3167 | v
3168 | [ ] \
3169 | [ ]_| <-- vector loop (created during VPlan execution).
3170 | |
3171 | v
3172 \ -[ ] <--- middle-block.
3173 \/ |
3174 /\ v
3175 | ->[ ] <--- new preheader.
3176 | |
3177 (opt) v <-- edge from middle to exit iff epilogue is not required.
3178 | [ ] \
3179 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3180 \ |
3181 \ v
3182 >[ ] <-- exit block(s).
3183 ...
3184 */
3185
3186 // Create an empty vector loop, and prepare basic blocks for the runtime
3187 // checks.
3189
3190 // Now, compare the new count to zero. If it is zero skip the vector loop and
3191 // jump to the scalar loop. This check also covers the case where the
3192 // backedge-taken count is uint##_max: adding one to it will overflow leading
3193 // to an incorrect trip count of zero. In this (rare) case we will also jump
3194 // to the scalar loop.
3196
3197 // Generate the code to check any assumptions that we've made for SCEV
3198 // expressions.
3200
3201 // Generate the code that checks in runtime if arrays overlap. We put the
3202 // checks into a separate block to make the more common case of few elements
3203 // faster.
3205
3206 // Emit phis for the new starting index of the scalar loop.
3207 createInductionResumeValues(ExpandedSCEVs);
3208
3209 return {completeLoopSkeleton(), nullptr};
3210}
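// Illustrative sketch (not part of LoopVectorize.cpp): the overflow case the
// iteration-count check above guards against. The trip count is derived as
// the backedge-taken count plus one, so a 32-bit BTC of 0xFFFFFFFF wraps to 0
// and must be treated as "run the scalar loop", not as a zero-iteration
// vector loop.
#include <cstdint>
static uint32_t tripCountFromBTC(uint32_t BackedgeTakenCount) {
  return BackedgeTakenCount + 1; // wraps to 0 when BTC == UINT32_MAX
}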
3211
3212// Fix up external users of the induction variable. At this point, we are
3213// in LCSSA form, with all external PHIs that use the IV having one input value,
3214// coming from the remainder loop. We need those PHIs to also have a correct
3215// value for the IV when arriving directly from the middle block.
3217 const InductionDescriptor &II,
3218 Value *VectorTripCount, Value *EndValue,
3219 BasicBlock *MiddleBlock,
3220 BasicBlock *VectorHeader, VPlan &Plan,
3221 VPTransformState &State) {
3222 // There are two kinds of external IV usages - those that use the value
3223 // computed in the last iteration (the PHI) and those that use the penultimate
3224 // value (the value that feeds into the phi from the loop latch).
3225 // We allow both, but they, obviously, have different values.
3226
3227 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3228
3229 DenseMap<Value *, Value *> MissingVals;
3230
3231 // An external user of the last iteration's value should see the value that
3232 // the remainder loop uses to initialize its own IV.
3234 for (User *U : PostInc->users()) {
3235 Instruction *UI = cast<Instruction>(U);
3236 if (!OrigLoop->contains(UI)) {
3237 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3238 MissingVals[UI] = EndValue;
3239 }
3240 }
3241
3242 // An external user of the penultimate value needs to see EndValue - Step.
3243 // The simplest way to get this is to recompute it from the constituent SCEVs,
3244 // that is Start + (Step * (CRD - 1)).
3245 for (User *U : OrigPhi->users()) {
3246 auto *UI = cast<Instruction>(U);
3247 if (!OrigLoop->contains(UI)) {
3248 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3249 IRBuilder<> B(MiddleBlock->getTerminator());
3250
3251 // Fast-math-flags propagate from the original induction instruction.
3252 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3253 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3254
3255 Value *CountMinusOne = B.CreateSub(
3256 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3257 CountMinusOne->setName("cmo");
3258
3259 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3260 assert(StepVPV && "step must have been expanded during VPlan execution");
3261 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3262 : State.get(StepVPV, {0, 0});
3263 Value *Escape =
3264 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3265 II.getKind(), II.getInductionBinOp());
3266 Escape->setName("ind.escape");
3267 MissingVals[UI] = Escape;
3268 }
3269 }
3270
3271 for (auto &I : MissingVals) {
3272 PHINode *PHI = cast<PHINode>(I.first);
3273 // One corner case we have to handle is two IVs "chasing" each other,
3274 // that is %IV2 = phi [...], [ %IV1, %latch ]
3275 // In this case, if IV1 has an external use, we need to avoid adding both
3276 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3277 // don't already have an incoming value for the middle block.
3278 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3279 PHI->addIncoming(I.second, MiddleBlock);
3280 Plan.removeLiveOut(PHI);
3281 }
3282 }
3283}
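// Illustrative sketch (not part of LoopVectorize.cpp): the "ind.escape" value
// computed above for an external user of the penultimate IV value. For a plain
// integer induction i = Start + k * Step, once the vector loop has covered
// VectorTripCount iterations the escaping value is
// Start + (VectorTripCount - 1) * Step; emitTransformedIndex generalizes this
// to pointer and floating-point inductions. Names below are hypothetical.
#include <cstdint>
static int64_t penultimateIVValue(int64_t Start, int64_t Step,
                                  int64_t VectorTripCount) {
  return Start + (VectorTripCount - 1) * Step; // "cmo" = VectorTripCount - 1
}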
3284
3285namespace {
3286
3287struct CSEDenseMapInfo {
3288 static bool canHandle(const Instruction *I) {
3289 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3290 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3291 }
3292
3293 static inline Instruction *getEmptyKey() {
3295 }
3296
3297 static inline Instruction *getTombstoneKey() {
3299 }
3300
3301 static unsigned getHashValue(const Instruction *I) {
3302 assert(canHandle(I) && "Unknown instruction!");
3303 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3304 I->value_op_end()));
3305 }
3306
3307 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3308 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3309 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3310 return LHS == RHS;
3311 return LHS->isIdenticalTo(RHS);
3312 }
3313};
3314
3315} // end anonymous namespace
3316
3317 /// Perform CSE of induction variable instructions.
3318 static void cse(BasicBlock *BB) {
3319 // Perform simple CSE.
3321 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3322 if (!CSEDenseMapInfo::canHandle(&In))
3323 continue;
3324
3325 // Check if we can replace this instruction with any of the
3326 // visited instructions.
3327 if (Instruction *V = CSEMap.lookup(&In)) {
3328 In.replaceAllUsesWith(V);
3329 In.eraseFromParent();
3330 continue;
3331 }
3332
3333 CSEMap[&In] = &In;
3334 }
3335}
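// Illustrative sketch (not part of LoopVectorize.cpp): the idea behind cse()
// above in plain C++ -- key each expression by (opcode, operands) and reuse
// the first instruction seen with that key. This is only a conceptual model;
// the real code hashes Instruction operands via hash_combine_range and
// compares candidates with Instruction::isIdenticalTo.
#include <map>
#include <string>
#include <utility>
#include <vector>
struct ToyInst {
  std::string Opcode;
  std::vector<int> OperandIds;
  int Id;
};
static int cseLookupOrInsert(
    std::map<std::pair<std::string, std::vector<int>>, int> &Seen,
    const ToyInst &I) {
  // emplace keeps the first id recorded for a given (opcode, operands) key.
  auto It = Seen.emplace(std::make_pair(I.Opcode, I.OperandIds), I.Id).first;
  return It->second; // an earlier duplicate's id, or I.Id if this is the first
}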
3336
3339 ElementCount VF) const {
3340 // We only need to calculate a cost if the VF is scalar; for actual vectors
3341 // we should already have a pre-calculated cost at each VF.
3342 if (!VF.isScalar())
3343 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3344
3346 Type *RetTy = CI->getType();
3348 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3349 return *RedCost;
3350
3352 for (auto &ArgOp : CI->args())
3353 Tys.push_back(ArgOp->getType());
3354
3355 InstructionCost ScalarCallCost =
3357
3358 // If this is an intrinsic we may have a lower cost for it.
3360 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3361 return std::min(ScalarCallCost, IntrinsicCost);
3362 }
3363 return ScalarCallCost;
3364}
3365
3367 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3368 return Elt;
3369 return VectorType::get(Elt, VF);
3370}
3371
3374 ElementCount VF) const {
3376 assert(ID && "Expected intrinsic call!");
3377 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3378 FastMathFlags FMF;
3379 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3380 FMF = FPMO->getFastMathFlags();
3381
3384 SmallVector<Type *> ParamTys;
3385 std::transform(FTy->param_begin(), FTy->param_end(),
3386 std::back_inserter(ParamTys),
3387 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3388
3389 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3390 dyn_cast<IntrinsicInst>(CI));
3391 return TTI.getIntrinsicInstrCost(CostAttrs,
3393}
3394
3396 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3397 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3398 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3399}
3400
3402 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3403 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3404 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3405}
3406
3408 VPlan &Plan) {
3409 // Fix widened non-induction PHIs by setting up the PHI operands.
3411 fixNonInductionPHIs(Plan, State);
3412
3413 // At this point every instruction in the original loop is widened to a
3414 // vector form. Now we need to fix the recurrences in the loop. These PHI
3415 // nodes are currently empty because we did not want to introduce cycles.
3416 // This is the second stage of vectorizing recurrences. Note that fixing
3417 // reduction phis is already modeled in VPlan.
3418 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3419 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3420 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3421 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3422 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3423 fixFixedOrderRecurrence(FOR, State);
3424 }
3425
3426 // Forget the original basic block.
3429
3430 // After vectorization, the exit blocks of the original loop will have
3431 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3432 // looked through single-entry phis.
3433 SmallVector<BasicBlock *> ExitBlocks;
3434 OrigLoop->getExitBlocks(ExitBlocks);
3435 for (BasicBlock *Exit : ExitBlocks)
3436 for (PHINode &PN : Exit->phis())
3438
3439 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3440 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3441 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3442 // No edge from the middle block to the unique exit block has been inserted
3443 // and there is nothing to fix from the vector loop; phis should have incoming
3444 // values from the scalar loop only.
3445 } else {
3446 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3447 // the cost model.
3448
3449 // If we inserted an edge from the middle block to the unique exit block,
3450 // update uses outside the loop (phis) to account for the newly inserted
3451 // edge.
3452
3453 // Fix-up external users of the induction variables.
3454 for (const auto &Entry : Legal->getInductionVars())
3455 fixupIVUsers(Entry.first, Entry.second,
3457 IVEndValues[Entry.first], LoopMiddleBlock,
3458 VectorLoop->getHeader(), Plan, State);
3459 }
3460
3461 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3462 // in the exit block, so update the builder.
3463 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3464 State.CFG.ExitBB->getFirstNonPHIIt());
3465 for (const auto &KV : Plan.getLiveOuts())
3466 KV.second->fixPhi(Plan, State);
3467
3469 sinkScalarOperands(&*PI);
3470
3471 // Remove redundant induction instructions.
3472 cse(VectorLoop->getHeader());
3473
3474 // Set/update profile weights for the vector and remainder loops as original
3475 // loop iterations are now distributed among them. Note that the original loop
3476 // represented by LoopScalarBody becomes the remainder loop after vectorization.
3477 //
3478 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3479 // end up getting a slightly inaccurate result, but that should be OK since
3480 // the profile is not inherently precise anyway. Note also that a possible
3481 // bypass of vector code caused by legality checks is ignored, assigning all the weight
3482 // to the vector loop, optimistically.
3483 //
3484 // For scalable vectorization we can't know at compile time how many iterations
3485 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3486 // vscale of '1'.
3489 VF.getKnownMinValue() * UF);
3490}
3491
3494 // This is the second phase of vectorizing first-order recurrences. An
3495 // overview of the transformation is described below. Suppose we have the
3496 // following loop.
3497 //
3498 // for (int i = 0; i < n; ++i)
3499 // b[i] = a[i] - a[i - 1];
3500 //
3501 // There is a first-order recurrence on "a". For this loop, the shorthand
3502 // scalar IR looks like:
3503 //
3504 // scalar.ph:
3505 // s_init = a[-1]
3506 // br scalar.body
3507 //
3508 // scalar.body:
3509 // i = phi [0, scalar.ph], [i+1, scalar.body]
3510 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3511 // s2 = a[i]
3512 // b[i] = s2 - s1
3513 // br cond, scalar.body, ...
3514 //
3515 // In this example, s1 is a recurrence because its value depends on the
3516 // previous iteration. In the first phase of vectorization, we created a
3517 // vector phi v1 for s1. We now complete the vectorization and produce the
3518 // shorthand vector IR shown below (for VF = 4, UF = 1).
3519 //
3520 // vector.ph:
3521 // v_init = vector(..., ..., ..., a[-1])
3522 // br vector.body
3523 //
3524 // vector.body
3525 // i = phi [0, vector.ph], [i+4, vector.body]
3526 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3527 // v2 = a[i, i+1, i+2, i+3];
3528 // v3 = vector(v1(3), v2(0, 1, 2))
3529 // b[i, i+1, i+2, i+3] = v2 - v3
3530 // br cond, vector.body, middle.block
3531 //
3532 // middle.block:
3533 // x = v2(3)
3534 // br scalar.ph
3535 //
3536 // scalar.ph:
3537 // s_init = phi [x, middle.block], [a[-1], otherwise]
3538 // br scalar.body
3539 //
3540 // After execution completes the vector loop, we extract the next value of
3541 // the recurrence (x) to use as the initial value in the scalar loop.
3542
3543 // Extract the last vector element in the middle block. This will be the
3544 // initial value for the recurrence when jumping to the scalar loop.
3545 VPValue *PreviousDef = PhiR->getBackedgeValue();
3546 Value *Incoming = State.get(PreviousDef, UF - 1);
3547 auto *ExtractForScalar = Incoming;
3548 auto *IdxTy = Builder.getInt32Ty();
3549 Value *RuntimeVF = nullptr;
3550 if (VF.isVector()) {
3551 auto *One = ConstantInt::get(IdxTy, 1);
3553 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3554 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3555 ExtractForScalar =
3556 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3557 }
3558
3559 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3560 assert(PhiR->getNumUsers() == 1 &&
3561 RecurSplice->getOpcode() ==
3563 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3564 SmallVector<VPLiveOut *> LiveOuts;
3565 for (VPUser *U : RecurSplice->users())
3566 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3567 LiveOuts.push_back(LiveOut);
3568
3569 if (!LiveOuts.empty()) {
3570 // Extract the second-to-last element in the middle block if the
3571 // Phi is used outside the loop. We need to extract the phi itself
3572 // and not the last element (the phi update in the current iteration). This
3573 // will be the value when jumping to the exit block from the
3574 // LoopMiddleBlock, when the scalar loop is not run at all.
3575 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3576 if (VF.isVector()) {
3577 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3578 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3579 Incoming, Idx, "vector.recur.extract.for.phi");
3580 } else {
3581 assert(UF > 1 && "VF and UF cannot both be 1");
3582 // When the loop is unrolled without vectorizing, initialize
3583 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3584 // `Incoming`. This is analogous to the vectorized case above: extracting
3585 // the second-to-last element when VF > 1.
3586 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3587 }
3588
3589 for (VPLiveOut *LiveOut : LiveOuts) {
3590 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3591 PHINode *LCSSAPhi = LiveOut->getPhi();
3592 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3593 State.Plan->removeLiveOut(LCSSAPhi);
3594 }
3595 }
3596
3597 // Fix the initial value of the original recurrence in the scalar loop.
3599 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3600 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3601 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3602 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3603 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3604 Start->addIncoming(Incoming, BB);
3605 }
3606
3607 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3608 Phi->setName("scalar.recur");
3609}
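// Illustrative sketch (not part of LoopVectorize.cpp): which lanes of the last
// vector value of the recurrence are extracted above. For a runtime vector
// length RuntimeVF (VF, or vscale * VF for scalable vectors):
//   - lane RuntimeVF - 1 seeds the scalar loop's phi ("vector.recur.extract"),
//   - lane RuntimeVF - 2 feeds LCSSA phis that use the recurrence phi itself
//     ("vector.recur.extract.for.phi").
static unsigned recurResumeLane(unsigned RuntimeVF) { return RuntimeVF - 1; }
static unsigned recurExitPhiLane(unsigned RuntimeVF) { return RuntimeVF - 2; }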
3610
3612 // The basic block and loop containing the predicated instruction.
3613 auto *PredBB = PredInst->getParent();
3614 auto *VectorLoop = LI->getLoopFor(PredBB);
3615
3616 // Initialize a worklist with the operands of the predicated instruction.
3617 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3618
3619 // Holds instructions that we need to analyze again. An instruction may be
3620 // reanalyzed if we don't yet know if we can sink it or not.
3621 SmallVector<Instruction *, 8> InstsToReanalyze;
3622
3623 // Returns true if a given use occurs in the predicated block. Phi nodes use
3624 // their operands in their corresponding predecessor blocks.
3625 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3626 auto *I = cast<Instruction>(U.getUser());
3627 BasicBlock *BB = I->getParent();
3628 if (auto *Phi = dyn_cast<PHINode>(I))
3629 BB = Phi->getIncomingBlock(
3630 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3631 return BB == PredBB;
3632 };
3633
3634 // Iteratively sink the scalarized operands of the predicated instruction
3635 // into the block we created for it. When an instruction is sunk, its
3636 // operands are then added to the worklist. The algorithm ends after one pass
3637 // through the worklist doesn't sink a single instruction.
3638 bool Changed;
3639 do {
3640 // Add the instructions that need to be reanalyzed to the worklist, and
3641 // reset the changed indicator.
3642 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3643 InstsToReanalyze.clear();
3644 Changed = false;
3645
3646 while (!Worklist.empty()) {
3647 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3648
3649 // We can't sink an instruction if it is a phi node, is not in the loop,
3650 // may have side effects or may read from memory.
3651 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3652 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3653 I->mayHaveSideEffects() || I->mayReadFromMemory())
3654 continue;
3655
3656 // If the instruction is already in PredBB, check if we can sink its
3657 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3658 // sinking the scalar instruction I, hence it appears in PredBB; but it
3659 // may have failed to sink I's operands (recursively), which we try
3660 // (again) here.
3661 if (I->getParent() == PredBB) {
3662 Worklist.insert(I->op_begin(), I->op_end());
3663 continue;
3664 }
3665
3666 // It's legal to sink the instruction if all its uses occur in the
3667 // predicated block. Otherwise, there's nothing to do yet, and we may
3668 // need to reanalyze the instruction.
3669 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3670 InstsToReanalyze.push_back(I);
3671 continue;
3672 }
3673
3674 // Move the instruction to the beginning of the predicated block, and add
3675 // its operands to the worklist.
3676 I->moveBefore(&*PredBB->getFirstInsertionPt());
3677 Worklist.insert(I->op_begin(), I->op_end());
3678
3679 // The sinking may have enabled other instructions to be sunk, so we will
3680 // need to iterate.
3681 Changed = true;
3682 }
3683 } while (Changed);
3684}
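// Illustrative sketch (not part of LoopVectorize.cpp): the shape of the
// fixpoint loop above -- repeatedly pass over a worklist, defer items that
// cannot be decided yet, and stop once a full pass changes nothing. A
// hypothetical, generic version:
#include <utility>
#include <vector>
template <typename T, typename TryFn>
static void runToFixpoint(std::vector<T> Worklist, TryFn TrySink) {
  bool Changed;
  do {
    Changed = false;
    std::vector<T> Deferred;
    for (T &Item : Worklist) {
      if (TrySink(Item))
        Changed = true; // success may enable more work on the next pass
      else
        Deferred.push_back(Item); // reanalyze later, like InstsToReanalyze
    }
    Worklist = std::move(Deferred);
  } while (Changed);
}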
3685
3687 VPTransformState &State) {
3688 auto Iter = vp_depth_first_deep(Plan.getEntry());
3689 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3690 for (VPRecipeBase &P : VPBB->phis()) {
3691 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3692 if (!VPPhi)
3693 continue;
3694 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3695 // Make sure the builder has a valid insert point.
3696 Builder.SetInsertPoint(NewPhi);
3697 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3698 VPValue *Inc = VPPhi->getIncomingValue(i);
3699 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3700 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3701 }
3702 }
3703 }
3704}
3705
3706void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3707 // We should not collect Scalars more than once per VF. Right now, this
3708 // function is called from collectUniformsAndScalars(), which already does
3709 // this check. Collecting Scalars for VF=1 does not make any sense.
3710 assert(VF.isVector() && !Scalars.contains(VF) &&
3711 "This function should not be visited twice for the same VF");
3712
3713 // This avoids any chances of creating a REPLICATE recipe during planning
3714 // since that would result in generation of scalarized code during execution,
3715 // which is not supported for scalable vectors.
3716 if (VF.isScalable()) {
3717 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3718 return;
3719 }
3720
3722
3723 // These sets are used to seed the analysis with pointers used by memory
3724 // accesses that will remain scalar.
3726 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3727 auto *Latch = TheLoop->getLoopLatch();
3728
3729 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3730 // The pointer operands of loads and stores will be scalar as long as the
3731 // memory access is not a gather or scatter operation. The value operand of a
3732 // store will remain scalar if the store is scalarized.
3733 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3734 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3735 assert(WideningDecision != CM_Unknown &&
3736 "Widening decision should be ready at this moment");
3737 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3738 if (Ptr == Store->getValueOperand())
3739 return WideningDecision == CM_Scalarize;
3740 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3741 "Ptr is neither a value nor a pointer operand");
3742 return WideningDecision != CM_GatherScatter;
3743 };
3744
3745 // A helper that returns true if the given value is a bitcast or
3746 // getelementptr instruction contained in the loop.
3747 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3748 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3749 isa<GetElementPtrInst>(V)) &&
3751 };
3752
3753 // A helper that evaluates a memory access's use of a pointer. If the use will
3754 // be a scalar use and the pointer is only used by memory accesses, we place
3755 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3756 // PossibleNonScalarPtrs.
3757 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3758 // We only care about bitcast and getelementptr instructions contained in
3759 // the loop.
3760 if (!isLoopVaryingBitCastOrGEP(Ptr))
3761 return;
3762
3763 // If the pointer has already been identified as scalar (e.g., if it was
3764 // also identified as uniform), there's nothing to do.
3765 auto *I = cast<Instruction>(Ptr);
3766 if (Worklist.count(I))
3767 return;
3768
3769 // If the use of the pointer will be a scalar use, and all users of the
3770 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3771 // place the pointer in PossibleNonScalarPtrs.
3772 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3773 return isa<LoadInst>(U) || isa<StoreInst>(U);
3774 }))
3775 ScalarPtrs.insert(I);
3776 else
3777 PossibleNonScalarPtrs.insert(I);
3778 };
3779
3780 // We seed the scalars analysis with three classes of instructions: (1)
3781 // instructions marked uniform-after-vectorization, (2) bitcast,
3782 // getelementptr and (pointer) phi instructions used by memory accesses
3783 // requiring a scalar use, and (3) instructions forced to be scalar.
3784 //
3785 // (1) Add to the worklist all instructions that have been identified as
3786 // uniform-after-vectorization.
3787 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3788
3789 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3790 // memory accesses requiring a scalar use. The pointer operands of loads and
3791 // stores will be scalar as long as the memory access is not a gather or
3792 // scatter operation. The value operand of a store will remain scalar if the
3793 // store is scalarized.
3794 for (auto *BB : TheLoop->blocks())
3795 for (auto &I : *BB) {
3796 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3797 evaluatePtrUse(Load, Load->getPointerOperand());
3798 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3799 evaluatePtrUse(Store, Store->getPointerOperand());
3800 evaluatePtrUse(Store, Store->getValueOperand());
3801 }
3802 }
3803 for (auto *I : ScalarPtrs)
3804 if (!PossibleNonScalarPtrs.count(I)) {
3805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3806 Worklist.insert(I);
3807 }
3808
3809 // Insert the forced scalars.
3810 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3811 // induction variable when the PHI user is scalarized.
3812 auto ForcedScalar = ForcedScalars.find(VF);
3813 if (ForcedScalar != ForcedScalars.end())
3814 for (auto *I : ForcedScalar->second) {
3815 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3816 Worklist.insert(I);
3817 }
3818
3819 // Expand the worklist by looking through any bitcasts and getelementptr
3820 // instructions we've already identified as scalar. This is similar to the
3821 // expansion step in collectLoopUniforms(); however, here we're only
3822 // expanding to include additional bitcasts and getelementptr instructions.
3823 unsigned Idx = 0;
3824 while (Idx != Worklist.size()) {
3825 Instruction *Dst = Worklist[Idx++];
3826 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3827 continue;
3828 auto *Src = cast<Instruction>(Dst->getOperand(0));
3829 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3830 auto *J = cast<Instruction>(U);
3831 return !TheLoop->contains(J) || Worklist.count(J) ||
3832 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3833 isScalarUse(J, Src));
3834 })) {
3835 Worklist.insert(Src);
3836 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3837 }
3838 }
3839
3840 // An induction variable will remain scalar if all users of the induction
3841 // variable and induction variable update remain scalar.
3842 for (const auto &Induction : Legal->getInductionVars()) {
3843 auto *Ind = Induction.first;
3844 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3845
3846 // If tail-folding is applied, the primary induction variable will be used
3847 // to feed a vector compare.
3848 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3849 continue;
3850
3851 // Returns true if \p Indvar is a pointer induction that is used directly by
3852 // load/store instruction \p I.
3853 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3854 Instruction *I) {
3855 return Induction.second.getKind() ==
3857 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3858 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3859 };
3860
3861 // Determine if all users of the induction variable are scalar after
3862 // vectorization.
3863 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3864 auto *I = cast<Instruction>(U);
3865 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3866 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3867 });
3868 if (!ScalarInd)
3869 continue;
3870
3871 // Determine if all users of the induction variable update instruction are
3872 // scalar after vectorization.
3873 auto ScalarIndUpdate =
3874 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3875 auto *I = cast<Instruction>(U);
3876 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3877 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3878 });
3879 if (!ScalarIndUpdate)
3880 continue;
3881
3882 // The induction variable and its update instruction will remain scalar.
3883 Worklist.insert(Ind);
3884 Worklist.insert(IndUpdate);
3885 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3886 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3887 << "\n");
3888 }
3889
3890 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3891}
3892
3894 Instruction *I, ElementCount VF) const {
3895 if (!isPredicatedInst(I))
3896 return false;
3897
3898 // Do we have a non-scalar lowering for this predicated
3899 // instruction? No - it is scalar with predication.
3900 switch(I->getOpcode()) {
3901 default:
3902 return true;
3903 case Instruction::Call:
3904 if (VF.isScalar())
3905 return true;
3906 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3907 .Kind == CM_Scalarize;
3908 case Instruction::Load:
3909 case Instruction::Store: {
3911 auto *Ty = getLoadStoreType(I);
3912 Type *VTy = Ty;
3913 if (VF.isVector())
3914 VTy = VectorType::get(Ty, VF);
3915 const Align Alignment = getLoadStoreAlignment(I);
3916 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3917 TTI.isLegalMaskedGather(VTy, Alignment))
3918 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3919 TTI.isLegalMaskedScatter(VTy, Alignment));
3920 }
3921 case Instruction::UDiv:
3922 case Instruction::SDiv:
3923 case Instruction::SRem:
3924 case Instruction::URem: {
3925 // We have the option to use the safe-divisor idiom to avoid predication.
3926 // The cost based decision here will always select safe-divisor for
3927 // scalable vectors as scalarization isn't legal.
3928 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3929 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3930 }
3931 }
3932}
3933
3935 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3936 return false;
3937
3938 // Can we prove this instruction is safe to unconditionally execute?
3939 // If not, we must use some form of predication.
3940 switch(I->getOpcode()) {
3941 default:
3942 return false;
3943 case Instruction::Load:
3944 case Instruction::Store: {
3945 if (!Legal->isMaskRequired(I))
3946 return false;
3947 // When we know the load's address is loop invariant and the instruction
3948 // in the original scalar loop was unconditionally executed, then we
3949 // don't need to mark it as a predicated instruction. Tail folding may
3950 // introduce additional predication, but we're guaranteed to always have
3951 // at least one active lane. We call Legal->blockNeedsPredication here
3952 // because it doesn't query tail-folding. For stores, we need to prove not
3953 // only speculation safety (which follows from the same argument as loads)
3954 // but also that the value being stored is correct. The easiest
3955 // form of the latter is to require that all values stored are the same.
3957 (isa<LoadInst>(I) ||
3958 (isa<StoreInst>(I) &&
3959 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3960 !Legal->blockNeedsPredication(I->getParent()))
3961 return false;
3962 return true;
3963 }
3964 case Instruction::UDiv:
3965 case Instruction::SDiv:
3966 case Instruction::SRem:
3967 case Instruction::URem:
3968 // TODO: We can use the loop-preheader as context point here and get
3969 // context sensitive reasoning
3971 case Instruction::Call:
3972 return Legal->isMaskRequired(I);
3973 }
3974}
3975
3976std::pair<InstructionCost, InstructionCost>
3978 ElementCount VF) const {
3979 assert(I->getOpcode() == Instruction::UDiv ||
3980 I->getOpcode() == Instruction::SDiv ||
3981 I->getOpcode() == Instruction::SRem ||
3982 I->getOpcode() == Instruction::URem);
3984
3986
3987 // Scalarization isn't legal for scalable vector types
3988 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3989 if (!VF.isScalable()) {
3990 // Get the scalarization cost and scale this amount by the probability of
3991 // executing the predicated block. If the instruction is not predicated,
3992 // we fall through to the next case.
3993 ScalarizationCost = 0;
3994
3995 // These instructions have a non-void type, so account for the phi nodes
3996 // that we will create. This cost is likely to be zero. The phi node
3997 // cost, if any, should be scaled by the block probability because it
3998 // models a copy at the end of each predicated block.
3999 ScalarizationCost += VF.getKnownMinValue() *
4000 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4001
4002 // The cost of the non-predicated instruction.
4003 ScalarizationCost += VF.getKnownMinValue() *
4004 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4005
4006 // The cost of insertelement and extractelement instructions needed for
4007 // scalarization.
4008 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4009
4010 // Scale the cost by the probability of executing the predicated blocks.
4011 // This assumes the predicated block for each vector lane is equally
4012 // likely.
4013 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4014 }
4015 InstructionCost SafeDivisorCost = 0;
4016
4017 auto *VecTy = ToVectorTy(I->getType(), VF);
4018
4019 // The cost of the select guard to ensure all lanes are well defined
4020 // after we speculate above any internal control flow.
4021 SafeDivisorCost += TTI.getCmpSelInstrCost(
4022 Instruction::Select, VecTy,
4023 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4025
4026 // Certain instructions can be cheaper to vectorize if they have a constant
4027 // second vector operand. One example of this are shifts on x86.
4028 Value *Op2 = I->getOperand(1);
4029 auto Op2Info = TTI.getOperandInfo(Op2);
4030 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4031 Legal->isInvariant(Op2))
4033
4034 SmallVector<const Value *, 4> Operands(I->operand_values());
4035 SafeDivisorCost += TTI.getArithmeticInstrCost(
4036 I->getOpcode(), VecTy, CostKind,
4037 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4038 Op2Info, Operands, I);
4039 return {ScalarizationCost, SafeDivisorCost};
4040}
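// Illustrative sketch (not part of LoopVectorize.cpp): the "safe divisor"
// idiom whose cost is modeled above. Instead of predicating or scalarizing a
// division, lanes that would not execute get a benign divisor of 1 via a
// select, so the whole vector divide can be speculated. Scalar model with
// hypothetical names:
static int safeDivLane(bool LaneActive, int Dividend, int Divisor) {
  int SafeDivisor = LaneActive ? Divisor : 1; // the select the cost accounts for
  return Dividend / SafeDivisor; // result of inactive lanes is ignored
}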
4041
4043 Instruction *I, ElementCount VF) {
4044 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4046 "Decision should not be set yet.");
4047 auto *Group = getInterleavedAccessGroup(I);
4048 assert(Group && "Must have a group.");
4049
4050 // If the instruction's allocated size doesn't equal its type size, it
4051 // requires padding and will be scalarized.
4052 auto &DL = I->getModule()->getDataLayout();
4053 auto *ScalarTy = getLoadStoreType(I);
4054 if (hasIrregularType(ScalarTy, DL))
4055 return false;
4056
4057 // If the group involves a non-integral pointer, we may not be able to
4058 // losslessly cast all values to a common type.
4059 unsigned InterleaveFactor = Group->getFactor();
4060 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4061 for (unsigned i = 0; i < InterleaveFactor; i++) {
4062 Instruction *Member = Group->getMember(i);
4063 if (!Member)
4064 continue;
4065 auto *MemberTy = getLoadStoreType(Member);
4066 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4067 // Don't coerce non-integral pointers to integers or vice versa.
4068 if (MemberNI != ScalarNI) {
4069 // TODO: Consider adding special nullptr value case here
4070 return false;
4071 } else if (MemberNI && ScalarNI &&
4072 ScalarTy->getPointerAddressSpace() !=
4073 MemberTy->getPointerAddressSpace()) {
4074 return false;
4075 }
4076 }
4077
4078 // Check if masking is required.
4079 // A Group may need masking for one of two reasons: it resides in a block that
4080 // needs predication, or it was decided to use masking to deal with gaps
4081 // (either a gap at the end of a load-access that may result in a speculative
4082 // load, or any gaps in a store-access).
4083 bool PredicatedAccessRequiresMasking =
4084 blockNeedsPredicationForAnyReason(I->getParent()) &&
4086 bool LoadAccessWithGapsRequiresEpilogMasking =
4087 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4089 bool StoreAccessWithGapsRequiresMasking =
4090 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4091 if (!PredicatedAccessRequiresMasking &&
4092 !LoadAccessWithGapsRequiresEpilogMasking &&
4093 !StoreAccessWithGapsRequiresMasking)
4094 return true;
4095
4096 // If masked interleaving is required, we expect that the user/target had
4097 // enabled it, because otherwise it either wouldn't have been created or
4098 // it should have been invalidated by the CostModel.
4100 "Masked interleave-groups for predicated accesses are not enabled.");
4101
4102 if (Group->isReverse())
4103 return false;
4104
4105 auto *Ty = getLoadStoreType(I);
4106 const Align Alignment = getLoadStoreAlignment(I);
4107 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4108 : TTI.isLegalMaskedStore(Ty, Alignment);
4109}
4110
4112 Instruction *I, ElementCount VF) {
4113 // Get and ensure we have a valid memory instruction.
4114 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4115
4117 auto *ScalarTy = getLoadStoreType(I);
4118
4119 // In order to be widened, the pointer should be consecutive, first of all.
4120 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4121 return false;
4122
4123 // If the instruction is a store located in a predicated block, it will be
4124 // scalarized.
4125 if (isScalarWithPredication(I, VF))
4126 return false;
4127
4128 // If the instruction's allocated size doesn't equal its type size, it
4129 // requires padding and will be scalarized.
4130 auto &DL = I->getModule()->getDataLayout();
4131 if (hasIrregularType(ScalarTy, DL))
4132 return false;
4133
4134 return true;
4135}
4136
4137void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4138 // We should not collect Uniforms more than once per VF. Right now,
4139 // this function is called from collectUniformsAndScalars(), which
4140 // already does this check. Collecting Uniforms for VF=1 does not make any
4141 // sense.
4142
4143 assert(VF.isVector() && !Uniforms.contains(VF) &&
4144 "This function should not be visited twice for the same VF");
4145
4146 // Visit the list of Uniforms. Even if we find no uniform value, we will
4147 // not analyze again: Uniforms.count(VF) will return 1.
4148 Uniforms[VF].clear();
4149
4150 // We now know that the loop is vectorizable!
4151 // Collect instructions inside the loop that will remain uniform after
4152 // vectorization.
4153
4154 // Global values, params and instructions outside of current loop are out of
4155 // scope.
4156 auto isOutOfScope = [&](Value *V) -> bool {
4157 Instruction *I = dyn_cast<Instruction>(V);
4158 return (!I || !TheLoop->contains(I));
4159 };
4160
4161 // Worklist containing uniform instructions demanding lane 0.
4162 SetVector<Instruction *> Worklist;
4163 BasicBlock *Latch = TheLoop->getLoopLatch();
4164
4165 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4166 // that are scalar with predication must not be considered uniform after
4167 // vectorization, because that would create an erroneous replicating region
4168 // where only a single instance out of VF should be formed.
4169 // TODO: optimize such seldom cases if found important, see PR40816.
4170 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4171 if (isOutOfScope(I)) {
4172 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4173 << *I << "\n");
4174 return;
4175 }
4176 if (isScalarWithPredication(I, VF)) {
4177 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4178 << *I << "\n");
4179 return;
4180 }
4181 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4182 Worklist.insert(I);
4183 };
4184
4185 // Start with the conditional branch. If the branch condition is an
4186 // instruction contained in the loop that is only used by the branch, it is
4187 // uniform.
4188 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4189 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4190 addToWorklistIfAllowed(Cmp);
4191
4192 auto PrevVF = VF.divideCoefficientBy(2);
4193 // Return true if all lanes perform the same memory operation, and we can
4194 // thus choose to execute only one.
4195 auto isUniformMemOpUse = [&](Instruction *I) {
4196 // If the value was already known to not be uniform for the previous
4197 // (smaller VF), it cannot be uniform for the larger VF.
4198 if (PrevVF.isVector()) {
4199 auto Iter = Uniforms.find(PrevVF);
4200 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4201 return false;
4202 }
4203 if (!Legal->isUniformMemOp(*I, VF))
4204 return false;
4205 if (isa<LoadInst>(I))
4206 // Loading the same address always produces the same result - at least
4207 // assuming aliasing and ordering which have already been checked.
4208 return true;
4209 // Storing the same value on every iteration.
4210 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4211 };
4212
4213 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4214 InstWidening WideningDecision = getWideningDecision(I, VF);
4215 assert(WideningDecision != CM_Unknown &&
4216 "Widening decision should be ready at this moment");
4217
4218 if (isUniformMemOpUse(I))
4219 return true;
4220
4221 return (WideningDecision == CM_Widen ||
4222 WideningDecision == CM_Widen_Reverse ||
4223 WideningDecision == CM_Interleave);
4224 };
4225
4226 // Returns true if Ptr is the pointer operand of a memory access instruction
4227 // I, I is known to not require scalarization, and the pointer is not also
4228 // stored.
4229 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4230 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4231 return false;
4232 return getLoadStorePointerOperand(I) == Ptr &&
4233 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4234 };
4235
4236 // Holds a list of values which are known to have at least one uniform use.
4237 // Note that there may be other uses which aren't uniform. A "uniform use"
4238 // here is something which only demands lane 0 of the unrolled iterations;
4239 // it does not imply that all lanes produce the same value (e.g. this is not
4240 // the usual meaning of uniform)
4241 SetVector<Value *> HasUniformUse;
4242
4243 // Scan the loop for instructions which are either a) known to have only
4244 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4245 for (auto *BB : TheLoop->blocks())
4246 for (auto &I : *BB) {
4247 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4248 switch (II->getIntrinsicID()) {
4249 case Intrinsic::sideeffect:
4250 case Intrinsic::experimental_noalias_scope_decl:
4251 case Intrinsic::assume:
4252 case Intrinsic::lifetime_start:
4253 case Intrinsic::lifetime_end:
4255 addToWorklistIfAllowed(&I);
4256 break;
4257 default:
4258 break;
4259 }
4260 }
4261
4262 // ExtractValue instructions must be uniform, because the operands are
4263 // known to be loop-invariant.
4264 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4265 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4266 "Expected aggregate value to be loop invariant");
4267 addToWorklistIfAllowed(EVI);
4268 continue;
4269 }
4270
4271 // If there's no pointer operand, there's nothing to do.
4273 if (!Ptr)
4274 continue;
4275
4276 if (isUniformMemOpUse(&I))
4277 addToWorklistIfAllowed(&I);
4278
4279 if (isVectorizedMemAccessUse(&I, Ptr))
4280 HasUniformUse.insert(Ptr);
4281 }
4282
4283 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4284 // demanding) users. Since loops are assumed to be in LCSSA form, this
4285 // disallows uses outside the loop as well.
4286 for (auto *V : HasUniformUse) {
4287 if (isOutOfScope(V))
4288 continue;
4289 auto *I = cast<Instruction>(V);
4290 auto UsersAreMemAccesses =
4291 llvm::all_of(I->users(), [&](User *U) -> bool {
4292 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4293 });
4294 if (UsersAreMemAccesses)
4295 addToWorklistIfAllowed(I);
4296 }
4297
4298 // Expand Worklist in topological order: whenever a new instruction
4299 // is added, its users should already be inside Worklist. This ensures that
4300 // a uniform instruction will only be used by uniform instructions.
4301 unsigned idx = 0;
4302 while (idx != Worklist.size()) {
4303 Instruction *I = Worklist[idx++];
4304
4305 for (auto *OV : I->operand_values()) {
4306 // isOutOfScope operands cannot be uniform instructions.
4307 if (isOutOfScope(OV))
4308 continue;
4309 // First-order recurrence phis should typically be considered
4310 // non-uniform.
4311 auto *OP = dyn_cast<PHINode>(OV);
4313 continue;
4314 // If all the users of the operand are uniform, then add the
4315 // operand into the uniform worklist.
4316 auto *OI = cast<Instruction>(OV);
4317 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4318 auto *J = cast<Instruction>(U);
4319 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4320 }))
4321 addToWorklistIfAllowed(OI);
4322 }
4323 }
4324
4325 // For an instruction to be added into Worklist above, all its users inside
4326 // the loop should also be in Worklist. However, this condition cannot be
4327 // true for phi nodes that form a cyclic dependence. We must process phi
4328 // nodes separately. An induction variable will remain uniform if all users
4329 // of the induction variable and induction variable update remain uniform.
4330 // The code below handles both pointer and non-pointer induction variables.
4331 for (const auto &Induction : Legal->getInductionVars()) {
4332 auto *Ind = Induction.first;
4333 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4334
4335 // Determine if all users of the induction variable are uniform after
4336 // vectorization.
4337 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4338 auto *I = cast<Instruction>(U);
4339 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4340 isVectorizedMemAccessUse(I, Ind);
4341 });
4342 if (!UniformInd)
4343 continue;
4344
4345 // Determine if all users of the induction variable update instruction are
4346 // uniform after vectorization.
4347 auto UniformIndUpdate =
4348 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4349 auto *I = cast<Instruction>(U);
4350 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4351 isVectorizedMemAccessUse(I, IndUpdate);
4352 });
4353 if (!UniformIndUpdate)
4354 continue;
4355
4356 // The induction variable and its update instruction will remain uniform.
4357 addToWorklistIfAllowed(Ind);
4358 addToWorklistIfAllowed(IndUpdate);
4359 }
4360
4361 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4362}
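// Illustrative sketch (not part of LoopVectorize.cpp): a "uniform use" in the
// sense above only demands lane 0 of the unrolled iterations. In the loop
// below the invariant load *p is a uniform memory op: each vector iteration
// can load it once and splat the result, so p only has uniform uses.
static long long dotWithInvariant(const int *a, const int *p, int n) {
  long long Sum = 0;
  for (int i = 0; i < n; ++i)
    Sum += static_cast<long long>(a[i]) * *p; // *p: same address every iteration
  return Sum;
}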
4363
4365 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4366
4368 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4369 "runtime pointer checks needed. Enable vectorization of this "
4370 "loop with '#pragma clang loop vectorize(enable)' when "
4371 "compiling with -Os/-Oz",
4372 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4373 return true;
4374 }
4375
4376 if (!PSE.getPredicate().isAlwaysTrue()) {
4377 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4378 "runtime SCEV checks needed. Enable vectorization of this "
4379 "loop with '#pragma clang loop vectorize(enable)' when "
4380 "compiling with -Os/-Oz",
4381 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4382 return true;
4383 }
4384
4385 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4386 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4387 reportVectorizationFailure("Runtime stride check for small trip count",
4388 "runtime stride == 1 checks needed. Enable vectorization of "
4389 "this loop without such check by compiling with -Os/-Oz",
4390 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4391 return true;
4392 }
4393
4394 return false;
4395}
4396
4398LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4400 return ElementCount::getScalable(0);
4401
4403 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4404 "ScalableVectorizationDisabled", ORE, TheLoop);
4405 return ElementCount::getScalable(0);
4406 }
4407
4408 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4409
4410 auto MaxScalableVF = ElementCount::getScalable(
4411 std::numeric_limits<ElementCount::ScalarTy>::max());
4412
4413 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4414 // FIXME: While for scalable vectors this is currently sufficient, this should
4415 // be replaced by a more detailed mechanism that filters out specific VFs,
4416 // instead of invalidating vectorization for a whole set of VFs based on the
4417 // MaxVF.
4418
4419 // Disable scalable vectorization if the loop contains unsupported reductions.
4420 if (!canVectorizeReductions(MaxScalableVF)) {
4422 "Scalable vectorization not supported for the reduction "
4423 "operations found in this loop.",
4424 "ScalableVFUnfeasible", ORE, TheLoop);
4425 return ElementCount::getScalable(0);
4426 }
4427
4428 // Disable scalable vectorization if the loop contains any instructions
4429 // with element types not supported for scalable vectors.
4430 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4431 return !Ty->isVoidTy() &&
4433 })) {
4434 reportVectorizationInfo("Scalable vectorization is not supported "
4435 "for all element types found in this loop.",
4436 "ScalableVFUnfeasible", ORE, TheLoop);
4437 return ElementCount::getScalable(0);
4438 }
4439
4441 return MaxScalableVF;
4442
4443 // Limit MaxScalableVF by the maximum safe dependence distance.
4444 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4445 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4446 else
4447 MaxScalableVF = ElementCount::getScalable(0);
4448
4449 if (!MaxScalableVF)
4451 "Max legal vector width too small, scalable vectorization "
4452 "unfeasible.",
4453 "ScalableVFUnfeasible", ORE, TheLoop);
4454
4455 return MaxScalableVF;
4456}
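// Illustrative sketch (not part of LoopVectorize.cpp): the clamp applied
// above. With MaxSafeElements = 32 (from the maximum safe dependence distance)
// and a target whose maximum vscale is 16, the largest legal scalable VF is
// ElementCount::getScalable(32 / 16), i.e. vscale x 2.
static unsigned maxScalableMinVF(unsigned MaxSafeElements, unsigned MaxVScale) {
  return MaxSafeElements / MaxVScale; // the "N" in VF = vscale x N
}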
4457
4458FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4459 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4461 unsigned SmallestType, WidestType;
4462 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4463
4464 // Get the maximum safe dependence distance in bits computed by LAA.
4465 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4466 // the memory accesses that is most restrictive (involved in the smallest
4467 // dependence distance).
4468 unsigned MaxSafeElements =
4470
4471 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4472 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4473
4474 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4475 << ".\n");
4476 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4477 << ".\n");
4478
4479 // First analyze the UserVF, fall back if the UserVF should be ignored.
4480 if (UserVF) {
4481 auto MaxSafeUserVF =
4482 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4483
4484 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4485 // If `VF=vscale x N` is safe, then so is `VF=N`
4486 if (UserVF.isScalable())
4487 return FixedScalableVFPair(
4488 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4489 else
4490 return UserVF;
4491 }
4492
4493 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4494
4495 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4496 // is better to ignore the hint and let the compiler choose a suitable VF.
4497 if (!UserVF.isScalable()) {
4498 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4499 << " is unsafe, clamping to max safe VF="
4500 << MaxSafeFixedVF << ".\n");
4501 ORE->emit([&]() {
4502 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4504 TheLoop->getHeader())
4505 << "User-specified vectorization factor "
4506 << ore::NV("UserVectorizationFactor", UserVF)
4507 << " is unsafe, clamping to maximum safe vectorization factor "
4508 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4509 });
4510 return MaxSafeFixedVF;
4511 }
4512
4514 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4515 << " is ignored because scalable vectors are not "
4516 "available.\n");
4517 ORE->emit([&]() {
4518 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4520 TheLoop->getHeader())
4521 << "User-specified vectorization factor "
4522 << ore::NV("UserVectorizationFactor", UserVF)
4523 << " is ignored because the target does not support scalable "
4524 "vectors. The compiler will pick a more suitable value.";
4525 });
4526 } else {
4527 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4528 << " is unsafe. Ignoring scalable UserVF.\n");
4529 ORE->emit([&]() {
4530 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4532 TheLoop->getHeader())
4533 << "User-specified vectorization factor "
4534 << ore::NV("UserVectorizationFactor", UserVF)
4535 << " is unsafe. Ignoring the hint to let the compiler pick a "
4536 "more suitable value.";
4537 });
4538 }
4539 }
4540
4541 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4542 << " / " << WidestType << " bits.\n");
4543
4546 if (auto MaxVF =
4547 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4548 MaxSafeFixedVF, FoldTailByMasking))
4549 Result.FixedVF = MaxVF;
4550
4551 if (auto MaxVF =
4552 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4553 MaxSafeScalableVF, FoldTailByMasking))
4554 if (MaxVF.isScalable()) {
4555 Result.ScalableVF = MaxVF;
4556 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4557 << "\n");
4558 }
4559
4560 return Result;
4561}
4562
4566 // TODO: It may be useful to do this, since it's still likely to be
4567 // dynamically uniform if the target can skip.
4569 "Not inserting runtime ptr check for divergent target",
4570 "runtime pointer checks needed. Not enabled for divergent target",
4571 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4573 }
4574
4575 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4576 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4577 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4578 if (TC == 1) {
4579 reportVectorizationFailure("Single iteration (non) loop",
4580 "loop trip count is one, irrelevant for vectorization",
4581 "SingleIterationLoop", ORE, TheLoop);
4583 }
4584
4585 switch (ScalarEpilogueStatus) {
4587 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4589 [[fallthrough]];
4591 LLVM_DEBUG(
4592 dbgs() << "LV: vector predicate hint/switch found.\n"
4593 << "LV: Not allowing scalar epilogue, creating predicated "
4594 << "vector loop.\n");
4595 break;
4597 // fallthrough as a special case of OptForSize
4599 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4600 LLVM_DEBUG(
4601 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4602 else
4603 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4604 << "count.\n");
4605
4606 // Bail if runtime checks are required, which are not good when optimising
4607 // for size.
4610
4611 break;
4612 }
4613
4614 // The only loops we can vectorize without a scalar epilogue are loops with
4615 // a bottom-test and a single exiting block. We'd have to handle the fact
4616 // that not every instruction executes on the last iteration. This will
4617 // require a lane mask which varies through the vector loop body. (TODO)
4619 // If there was a tail-folding hint/switch, but we can't fold the tail by
4620 // masking, fall back to a vectorization with a scalar epilogue.
4621 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4622 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4623 "scalar epilogue instead.\n");
4624 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4625 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4626 }
4628 }
4629
4630 // Now try the tail folding
4631
4632 // Invalidate interleave groups that require an epilogue if we can't mask
4633 // the interleave-group.
4635 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4636 "No decisions should have been taken at this point");
4637 // Note: There is no need to invalidate any cost modeling decisions here, as
4638 // none were taken so far.
4640 }
4641
4642 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4643
4644 // Avoid tail folding if the trip count is known to be a multiple of any VF
4645 // we choose.
4646 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4647 MaxFactors.FixedVF.getFixedValue();
4648 if (MaxFactors.ScalableVF) {
4649 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4650 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4651 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4652 *MaxPowerOf2RuntimeVF,
4653 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4654 } else
4655 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4656 }
4657
4658 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4659 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4660 "MaxFixedVF must be a power of 2");
4661 unsigned MaxVFtimesIC =
4662 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4663 ScalarEvolution *SE = PSE.getSE();
4664 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4665 const SCEV *ExitCount = SE->getAddExpr(
4666 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4667 const SCEV *Rem = SE->getURemExpr(
4668 SE->applyLoopGuards(ExitCount, TheLoop),
4669 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4670 if (Rem->isZero()) {
4671 // Accept MaxFixedVF if we do not have a tail.
4672 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4673 return MaxFactors;
4674 }
4675 }
4676
4677 // If we don't know the precise trip count, or if the trip count that we
4678 // found modulo the vectorization factor is not zero, try to fold the tail
4679 // by masking.
4680 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4681 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4682 if (foldTailByMasking()) {
4684 LLVM_DEBUG(
4685 dbgs()
4686 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4687 "try to generate VP Intrinsics with scalable vector "
4688 "factors only.\n");
4689 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4690 // for now.
4691 // TODO: extend it for fixed vectors, if required.
4692 assert(MaxFactors.ScalableVF.isScalable() &&
4693 "Expected scalable vector factor.");
4694
4695 MaxFactors.FixedVF = ElementCount::getFixed(1);
4696 }
4697 return MaxFactors;
4698 }
4699
4700 // If there was a tail-folding hint/switch, but we can't fold the tail by
4701 // masking, fall back to a vectorization with a scalar epilogue.
4702 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4703 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4704 "scalar epilogue instead.\n");
4705 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4706 return MaxFactors;
4707 }
4708
4709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4710 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4712 }
4713
4714 if (TC == 0) {
4716 "Unable to calculate the loop count due to complex control flow",
4717 "unable to calculate the loop count due to complex control flow",
4718 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4720 }
4721
4723 "Cannot optimize for size and vectorize at the same time.",
4724 "cannot optimize for size and vectorize at the same time. "
4725 "Enable vectorization of this loop with '#pragma clang loop "
4726 "vectorize(enable)' when compiling with -Os/-Oz",
4727 "NoTailLoopWithOptForSize", ORE, TheLoop);
4729}
4730
4731ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4732 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4733 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4734 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4735 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4736 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4738
4739 // Convenience function to return the minimum of two ElementCounts.
4740 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4741 assert((LHS.isScalable() == RHS.isScalable()) &&
4742 "Scalable flags must match");
4743 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4744 };
4745
4746 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4747 // Note that both WidestRegister and WidestType may not be powers of 2.
4748 auto MaxVectorElementCount = ElementCount::get(
4749 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4750 ComputeScalableMaxVF);
4751 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4752 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4753 << (MaxVectorElementCount * WidestType) << " bits.\n");
4754
4755 if (!MaxVectorElementCount) {
4756 LLVM_DEBUG(dbgs() << "LV: The target has no "
4757 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4758 << " vector registers.\n");
4759 return ElementCount::getFixed(1);
4760 }
4761
4762 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4763 if (MaxVectorElementCount.isScalable() &&
4764 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4765 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4766 auto Min = Attr.getVScaleRangeMin();
4767 WidestRegisterMinEC *= Min;
4768 }
4769
4770 // When a scalar epilogue is required, at least one iteration of the scalar
4771 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4772 // max VF that results in a dead vector loop.
4773 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4774 MaxTripCount -= 1;
4775
4776 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4777 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4778 // If upper bound loop trip count (TC) is known at compile time there is no
4779 // point in choosing a VF greater than TC (as done in the loop below). Select
4780 // the maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4781 // scalable, we only fall back on a fixed VF when the TC is less than or
4782 // equal to the known number of lanes.
4783 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4784 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4785 "exceeding the constant trip count: "
4786 << ClampedUpperTripCount << "\n");
4787 return ElementCount::get(
4788 ClampedUpperTripCount,
4789 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4790 }
4791
4793 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4795 ElementCount MaxVF = MaxVectorElementCount;
4796 if (MaximizeBandwidth ||
4797 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4800 auto MaxVectorElementCountMaxBW = ElementCount::get(
4801 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4802 ComputeScalableMaxVF);
4803 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4804
4805 // Collect all viable vectorization factors larger than the default MaxVF
4806 // (i.e. MaxVectorElementCount).
4808 for (ElementCount VS = MaxVectorElementCount * 2;
4809 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4810 VFs.push_back(VS);
4811
4812 // For each VF calculate its register usage.
4813 auto RUs = calculateRegisterUsage(VFs);
4814
4815 // Select the largest VF which doesn't require more registers than existing
4816 // ones.
4817 for (int i = RUs.size() - 1; i >= 0; --i) {
4818 bool Selected = true;
4819 for (auto &pair : RUs[i].MaxLocalUsers) {
4820 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4821 if (pair.second > TargetNumRegisters)
4822 Selected = false;
4823 }
4824 if (Selected) {
4825 MaxVF = VFs[i];
4826 break;
4827 }
4828 }
4829 if (ElementCount MinVF =
4830 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4831 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4832 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4833 << ") with target's minimum: " << MinVF << '\n');
4834 MaxVF = MinVF;
4835 }
4836 }
4837
4838 // Invalidate any widening decisions we might have made, in case the loop
4839 // requires prediction (decided later), but we have already made some
4840 // load/store widening decisions.
4842 }
4843 return MaxVF;
4844}
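// Illustrative sketch (not part of the pass logic): ignoring scalable vectors,
// tail folding and the MaximizeBandwidth path, the clamping above amounts to
// two power-of-two floors. The numbers below are hypothetical.
//
//   unsigned MaxVF = llvm::bit_floor(WidestRegisterBits / WidestTypeBits);
//   if (MaxTripCount != 0 && MaxTripCount <= MaxVF)
//     MaxVF = llvm::bit_floor(MaxTripCount);
//
//   // e.g. 128-bit registers and i32 elements give bit_floor(128 / 32) = 4;
//   // with a known trip count of 3, the result is clamped to bit_floor(3) = 2.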
4845
4846/// Convenience function that returns the value of vscale_range iff
4847 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4848/// returned by the corresponding TTI method.
4849static std::optional<unsigned>
4851 const Function *Fn = L->getHeader()->getParent();
4852 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4853 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4854 auto Min = Attr.getVScaleRangeMin();
4855 auto Max = Attr.getVScaleRangeMax();
4856 if (Max && Min == Max)
4857 return Max;
4858 }
4859
4860 return TTI.getVScaleForTuning();
4861}
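// For illustration (hypothetical IR): a loop in a function declared as
//   define void @f() vscale_range(2,2) { ... }
// makes the helper above return 2 directly, while vscale_range(1,16) has
// min != max, so the result falls back to TTI.getVScaleForTuning().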
4862
4863bool LoopVectorizationPlanner::isMoreProfitable(
4864 const VectorizationFactor &A, const VectorizationFactor &B) const {
4865 InstructionCost CostA = A.Cost;
4866 InstructionCost CostB = B.Cost;
4867
4868 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4869
4870 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4871 // If the trip count is a known (possibly small) constant, the trip count
4872 // will be rounded up to an integer number of iterations under
4873 // FoldTailByMasking. The total cost in that case will be
4874 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4875 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4876 // some extra overheads, but for the purpose of comparing the costs of
4877 // different VFs we can use this to compare the total loop-body cost
4878 // expected after vectorization.
4879 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4880 InstructionCost VectorCost,
4881 InstructionCost ScalarCost) {
4882 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4883 : VectorCost * (MaxTripCount / VF) +
4884 ScalarCost * (MaxTripCount % VF);
4885 };
4886 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4887 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4888
4889 return RTCostA < RTCostB;
4890 }
4891
4892 // Improve estimate for the vector width if it is scalable.
4893 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4894 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4895 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4896 if (A.Width.isScalable())
4897 EstimatedWidthA *= *VScale;
4898 if (B.Width.isScalable())
4899 EstimatedWidthB *= *VScale;
4900 }
4901
4902 // Assume vscale may be larger than 1 (or the value being tuned for),
4903 // so that scalable vectorization is slightly favorable over fixed-width
4904 // vectorization.
4905 if (A.Width.isScalable() && !B.Width.isScalable())
4906 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4907
4908 // To avoid the need for FP division:
4909 // (CostA / A.Width) < (CostB / B.Width)
4910 // <=> (CostA * B.Width) < (CostB * A.Width)
4911 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4912}
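// Illustrative sketch (not part of the pass logic): the final comparison
// avoids a floating-point divide by cross-multiplying the per-lane costs.
// With hypothetical numbers:
//
//   // (CostA / WidthA) < (CostB / WidthB) <=> CostA * WidthB < CostB * WidthA
//   bool AIsMoreProfitable = CostA * WidthB < CostB * WidthA;
//
//   // e.g. A: cost 20 at width 8 (2.5 per lane) vs. B: cost 12 at width 4
//   // (3.0 per lane): 20 * 4 = 80 < 12 * 8 = 96, so A is preferred.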
4913
4916 Loop *TheLoop) {
4917 if (InvalidCosts.empty())
4918 return;
4919
4920 // Emit a report of VFs with invalid costs in the loop.
4921
4922 // Group the remarks per instruction, keeping the instruction order from
4923 // InvalidCosts.
4924 std::map<Instruction *, unsigned> Numbering;
4925 unsigned I = 0;
4926 for (auto &Pair : InvalidCosts)
4927 if (!Numbering.count(Pair.first))
4928 Numbering[Pair.first] = I++;
4929
4930 // Sort the list, first on instruction(number) then on VF.
4931 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4932 if (Numbering[A.first] != Numbering[B.first])
4933 return Numbering[A.first] < Numbering[B.first];
4935 return ECC(A.second, B.second);
4936 });
4937
4938 // For a list of ordered instruction-vf pairs:
4939 // [(load, vf1), (load, vf2), (store, vf1)]
4940 // Group the instructions together to emit separate remarks for:
4941 // load (vf1, vf2)
4942 // store (vf1)
4943 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4944 auto Subset = ArrayRef<InstructionVFPair>();
4945 do {
4946 if (Subset.empty())
4947 Subset = Tail.take_front(1);
4948
4949 Instruction *I = Subset.front().first;
4950
4951 // If the next instruction is different, or if there are no other pairs,
4952 // emit a remark for the collated subset. e.g.
4953 // [(load, vf1), (load, vf2)]
4954 // to emit:
4955 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4956 if (Subset == Tail || Tail[Subset.size()].first != I) {
4957 std::string OutString;
4958 raw_string_ostream OS(OutString);
4959 assert(!Subset.empty() && "Unexpected empty range");
4960 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4961 for (const auto &Pair : Subset)
4962 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4963 OS << "):";
4964 if (auto *CI = dyn_cast<CallInst>(I))
4965 OS << " call to " << CI->getCalledFunction()->getName();
4966 else
4967 OS << " " << I->getOpcodeName();
4968 OS.flush();
4969 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4970 Tail = Tail.drop_front(Subset.size());
4971 Subset = {};
4972 } else
4973 // Grow the subset by one element
4974 Subset = Tail.take_front(Subset.size() + 1);
4975 } while (!Tail.empty());
4976}
4977
4978VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4979 const ElementCountSet &VFCandidates) {
4980 InstructionCost ExpectedCost =
4982 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4983 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4984 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4985 "Expected Scalar VF to be a candidate");
4986
4987 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4988 ExpectedCost);
4989 VectorizationFactor ChosenFactor = ScalarCost;
4990
4991 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4992 if (ForceVectorization && VFCandidates.size() > 1) {
4993 // Ignore scalar width, because the user explicitly wants vectorization.
4994 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4995 // evaluation.
4996 ChosenFactor.Cost = InstructionCost::getMax();
4997 }
4998
4999 SmallVector<InstructionVFPair> InvalidCosts;
5000 for (const auto &i : VFCandidates) {
5001 // The cost for scalar VF=1 is already calculated, so ignore it.
5002 if (i.isScalar())
5003 continue;
5004
5006 CM.expectedCost(i, &InvalidCosts);
5007 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5008
5009#ifndef NDEBUG
5010 unsigned AssumedMinimumVscale =
5011 getVScaleForTuning(OrigLoop, TTI).value_or(1);
5012 unsigned Width =
5013 Candidate.Width.isScalable()
5014 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5015 : Candidate.Width.getFixedValue();
5016 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5017 << " costs: " << (Candidate.Cost / Width));
5018 if (i.isScalable())
5019 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5020 << AssumedMinimumVscale << ")");
5021 LLVM_DEBUG(dbgs() << ".\n");
5022#endif
5023
5024 if (!C.second && !ForceVectorization) {
5025 LLVM_DEBUG(
5026 dbgs() << "LV: Not considering vector loop of width " << i
5027 << " because it will not generate any vector instructions.\n");
5028 continue;
5029 }
5030
5031 // If profitable, add it to the ProfitableVFs list.
5032 if (isMoreProfitable(Candidate, ScalarCost))
5033 ProfitableVFs.push_back(Candidate);
5034
5035 if (isMoreProfitable(Candidate, ChosenFactor))
5036 ChosenFactor = Candidate;
5037 }
5038
5039 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5040
5043 "There are conditional stores.",
5044 "store that is conditionally executed prevents vectorization",
5045 "ConditionalStore", ORE, OrigLoop);
5046 ChosenFactor = ScalarCost;
5047 }
5048
5049 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5050 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5051 << "LV: Vectorization seems to be not beneficial, "
5052 << "but was forced by a user.\n");
5053 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5054 return ChosenFactor;
5055}
5056
5057bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5058 ElementCount VF) const {
5059 // Cross iteration phis such as reductions need special handling and are
5060 // currently unsupported.
5061 if (any_of(OrigLoop->getHeader()->phis(),
5062 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5063 return false;
5064
5065 // Phis with uses outside of the loop require special handling and are
5066 // currently unsupported.
5067 for (const auto &Entry : Legal->getInductionVars()) {
5068 // Look for uses of the value of the induction at the last iteration.
5069 Value *PostInc =
5070 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5071 for (User *U : PostInc->users())
5072 if (!OrigLoop->contains(cast<Instruction>(U)))
5073 return false;
5074 // Look for uses of the penultimate value of the induction.
5075 for (User *U : Entry.first->users())
5076 if (!OrigLoop->contains(cast<Instruction>(U)))
5077 return false;
5078 }
5079
5080 // Epilogue vectorization code has not been audited to ensure it handles
5081 // non-latch exits properly. It may be fine, but it needs to be audited and
5082 // tested.
5083 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5084 return false;
5085
5086 return true;
5087}
5088
5090 const ElementCount VF) const {
5091 // FIXME: We need a much better cost-model to take different parameters such
5092 // as register pressure, code size increase and cost of extra branches into
5093 // account. For now we apply a very crude heuristic and only consider loops
5094 // with vectorization factors larger than a certain value.
5095
5096 // Allow the target to opt out entirely.
5098 return false;
5099
5100 // We also consider epilogue vectorization unprofitable for targets that don't
5101 // consider interleaving beneficial (e.g., MVE).
5102 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5103 return false;
5104
5105 unsigned Multiplier = 1;
5106 if (VF.isScalable())
5107 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5108 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5109 return true;
5110 return false;
5111}
5112
5114 const ElementCount MainLoopVF, unsigned IC) {
5117 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5118 return Result;
5119 }
5120
5121 if (!CM.isScalarEpilogueAllowed()) {
5122 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5123 "epilogue is allowed.\n");
5124 return Result;
5125 }
5126
5127 // Not really a cost consideration, but check for unsupported cases here to
5128 // simplify the logic.
5129 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5130 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5131 "is not a supported candidate.\n");
5132 return Result;
5133 }
5134
5136 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5138 if (hasPlanWithVF(ForcedEC))
5139 return {ForcedEC, 0, 0};
5140 else {
5141 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5142 "viable.\n");
5143 return Result;
5144 }
5145 }
5146
5147 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5148 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5149 LLVM_DEBUG(
5150 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5151 return Result;
5152 }
5153
5154 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5155 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5156 "this loop\n");
5157 return Result;
5158 }
5159
5160 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5161 // the main loop handles 8 lanes per iteration. We could still benefit from
5162 // vectorizing the epilogue loop with VF=4.
5163 ElementCount EstimatedRuntimeVF = MainLoopVF;
5164 if (MainLoopVF.isScalable()) {
5165 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5166 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5167 EstimatedRuntimeVF *= *VScale;
5168 }
5169
5170 ScalarEvolution &SE = *PSE.getSE();
5171 Type *TCType = Legal->getWidestInductionType();
5172 const SCEV *RemainingIterations = nullptr;
5173 for (auto &NextVF : ProfitableVFs) {
5174 // Skip candidate VFs without a corresponding VPlan.
5175 if (!hasPlanWithVF(NextVF.Width))
5176 continue;
5177
5178 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5179 // vectors) or the VF of the main loop (fixed vectors).
5180 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5181 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5182 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5183 continue;
5184
5185 // If NextVF is greater than the number of remaining iterations, the
5186 // epilogue loop would be dead. Skip such factors.
5187 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5188 // TODO: extend to support scalable VFs.
5189 if (!RemainingIterations) {
5190 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5191 RemainingIterations = SE.getURemExpr(
5192 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5193 }
5194 if (SE.isKnownPredicate(
5196 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5197 RemainingIterations))
5198 continue;
5199 }
5200
5201 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5202 Result = NextVF;
5203 }
5204
5205 if (Result != VectorizationFactor::Disabled())
5206 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5207 << Result.Width << "\n");
5208 return Result;
5209}
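// Worked example (hypothetical values): with a fixed MainLoopVF = 8, IC = 1
// and a trip count of 21, the remainder 21 % 8 = 5 leaves enough iterations
// for an epilogue VF of 4, while a candidate VF of 8 is rejected (it is not
// smaller than MainLoopVF). With a scalable MainLoopVF = vscale x 2 and a
// tuning vscale of 4, EstimatedRuntimeVF = 8, so fixed epilogue VFs below 8
// remain candidates.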
5210
5211std::pair<unsigned, unsigned>
5213 unsigned MinWidth = -1U;
5214 unsigned MaxWidth = 8;
5216 // For in-loop reductions, no element types are added to ElementTypesInLoop
5217 // if there are no loads/stores in the loop. In this case, check through the
5218 // reduction variables to determine the maximum width.
5219 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5220 // Reset MaxWidth so that we can find the smallest type used by recurrences
5221 // in the loop.
5222 MaxWidth = -1U;
5223 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5224 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5225 // When finding the min width used by the recurrence we need to account
5226 // for casts on the input operands of the recurrence.
5227 MaxWidth = std::min<unsigned>(
5228 MaxWidth, std::min<unsigned>(
5231 }
5232 } else {
5233 for (Type *T : ElementTypesInLoop) {
5234 MinWidth = std::min<unsigned>(
5235 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5236 MaxWidth = std::max<unsigned>(
5237 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5238 }
5239 }
5240 return {MinWidth, MaxWidth};
5241}
5242
5244 ElementTypesInLoop.clear();
5245 // For each block.
5246 for (BasicBlock *BB : TheLoop->blocks()) {
5247 // For each instruction in the loop.
5248 for (Instruction &I : BB->instructionsWithoutDebug()) {
5249 Type *T = I.getType();
5250
5251 // Skip ignored values.
5252 if (ValuesToIgnore.count(&I))
5253 continue;
5254
5255 // Only examine Loads, Stores and PHINodes.
5256 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5257 continue;
5258
5259 // Examine PHI nodes that are reduction variables. Update the type to
5260 // account for the recurrence type.
5261 if (auto *PN = dyn_cast<PHINode>(&I)) {
5262 if (!Legal->isReductionVariable(PN))
5263 continue;
5264 const RecurrenceDescriptor &RdxDesc =
5265 Legal->getReductionVars().find(PN)->second;
5268 RdxDesc.getRecurrenceType(),
5270 continue;
5271 T = RdxDesc.getRecurrenceType();
5272 }
5273
5274 // Examine the stored values.
5275 if (auto *ST = dyn_cast<StoreInst>(&I))
5276 T = ST->getValueOperand()->getType();
5277
5278 assert(T->isSized() &&
5279 "Expected the load/store/recurrence type to be sized");
5280
5281 ElementTypesInLoop.insert(T);
5282 }
5283 }
5284}
5285
5286unsigned
5288 InstructionCost LoopCost) {
5289 // -- The interleave heuristics --
5290 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5291 // There are many micro-architectural considerations that we can't predict
5292 // at this level. For example, frontend pressure (on decode or fetch) due to
5293 // code size, or the number and capabilities of the execution ports.
5294 //
5295 // We use the following heuristics to select the interleave count:
5296 // 1. If the code has reductions, then we interleave to break the cross
5297 // iteration dependency.
5298 // 2. If the loop is really small, then we interleave to reduce the loop
5299 // overhead.
5300 // 3. We don't interleave if we think that we will spill registers to memory
5301 // due to the increased register pressure.
5302
5304 return 1;
5305
5306 // Do not interleave if EVL is preferred and no User IC is specified.
5307 if (foldTailWithEVL()) {
5308 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5309 "Unroll factor forced to be 1.\n");
5310 return 1;
5311 }
5312
5313 // We used the distance for the interleave count.
5315 return 1;
5316
5317 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5318 const bool HasReductions = !Legal->getReductionVars().empty();
5319
5320 // If we did not calculate the cost for VF (because the user selected the VF)
5321 // then we calculate the cost of VF here.
5322 if (LoopCost == 0) {
5323 LoopCost = expectedCost(VF).first;
5324 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5325
5326 // Loop body is free and there is no need for interleaving.
5327 if (LoopCost == 0)
5328 return 1;
5329 }
5330
5332 // We divide by these constants so assume that we have at least one
5333 // instruction that uses at least one register.
5334 for (auto& pair : R.MaxLocalUsers) {
5335 pair.second = std::max(pair.second, 1U);
5336 }
5337
5338 // We calculate the interleave count using the following formula.
5339 // Subtract the number of loop invariants from the number of available
5340 // registers. These registers are used by all of the interleaved instances.
5341 // Next, divide the remaining registers by the number of registers that is
5342 // required by the loop, in order to estimate how many parallel instances
5343 // fit without causing spills. All of this is rounded down if necessary to be
5344 // a power of two. We want power of two interleave count to simplify any
5345 // addressing operations or alignment considerations.
5346 // We also want power of two interleave counts to ensure that the induction
5347 // variable of the vector loop wraps to zero, when tail is folded by masking;
5348 // this currently happens when OptForSize, in which case IC is set to 1 above.
5349 unsigned IC = UINT_MAX;
5350
5351 for (auto& pair : R.MaxLocalUsers) {
5352 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5353 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5354 << " registers of "
5355 << TTI.getRegisterClassName(pair.first) << " register class\n");
5356 if (VF.isScalar()) {
5357 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5358 TargetNumRegisters = ForceTargetNumScalarRegs;
5359 } else {
5360 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5361 TargetNumRegisters = ForceTargetNumVectorRegs;
5362 }
5363 unsigned MaxLocalUsers = pair.second;
5364 unsigned LoopInvariantRegs = 0;
5365 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5366 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5367
5368 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5369 MaxLocalUsers);
5370 // Don't count the induction variable as interleaved.
5372 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5373 std::max(1U, (MaxLocalUsers - 1)));
5374 }
5375
5376 IC = std::min(IC, TmpIC);
5377 }
5378
5379 // Clamp the interleave ranges to reasonable counts.
5380 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5381
5382 // Check if the user has overridden the max.
5383 if (VF.isScalar()) {
5384 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5385 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5386 } else {
5387 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5388 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5389 }
5390
5391 unsigned EstimatedVF = VF.getKnownMinValue();
5392 if (VF.isScalable()) {
5393 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5394 EstimatedVF *= *VScale;
5395 }
5396 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5397
5398 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5399 if (KnownTC > 0) {
5400 // At least one iteration must be scalar when this constraint holds. So the
5401 // maximum number of iterations available for interleaving is one less.
5402 unsigned AvailableTC =
5403 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5404
5405 // If trip count is known we select between two prospective ICs, where
5406 // 1) the aggressive IC is capped by the trip count divided by VF
5407 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5408 // The final IC is selected in a way that the epilogue loop trip count is
5409 // minimized while maximizing the IC itself, so that we either run the
5410 // vector loop at least once if it generates a small epilogue loop, or else
5411 // we run the vector loop at least twice.
5412
5413 unsigned InterleaveCountUB = bit_floor(
5414 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5415 unsigned InterleaveCountLB = bit_floor(std::max(
5416 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5417 MaxInterleaveCount = InterleaveCountLB;
5418
5419 if (InterleaveCountUB != InterleaveCountLB) {
5420 unsigned TailTripCountUB =
5421 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5422 unsigned TailTripCountLB =
5423 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5424 // If both produce same scalar tail, maximize the IC to do the same work
5425 // in fewer vector loop iterations
5426 if (TailTripCountUB == TailTripCountLB)
5427 MaxInterleaveCount = InterleaveCountUB;
5428 }
5429 } else if (BestKnownTC && *BestKnownTC > 0) {
5430 // At least one iteration must be scalar when this constraint holds. So the
5431 // maximum number of iterations available for interleaving is one less.
5432 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5433 ? (*BestKnownTC) - 1
5434 : *BestKnownTC;
5435
5436 // If the trip count is an estimated compile-time constant, cap the IC at
5437 // the trip count divided by VF * 2, such that the vector
5438 // loop runs at least twice to make interleaving seem profitable when there
5439 // is an epilogue loop present. Since the exact trip count is not known, we
5440 // choose to be conservative in our IC estimate.
5441 MaxInterleaveCount = bit_floor(std::max(
5442 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5443 }
5444
5445 assert(MaxInterleaveCount > 0 &&
5446 "Maximum interleave count must be greater than 0");
5447
5448 // Clamp the calculated IC to be between 1 and the max interleave count
5449 // that the target and trip count allows.
5450 if (IC > MaxInterleaveCount)
5451 IC = MaxInterleaveCount;
5452 else
5453 // Make sure IC is greater than 0.
5454 IC = std::max(1u, IC);
5455
5456 assert(IC > 0 && "Interleave count must be greater than 0.");
5457
5458 // Interleave if we vectorized this loop and there is a reduction that could
5459 // benefit from interleaving.
5460 if (VF.isVector() && HasReductions) {
5461 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5462 return IC;
5463 }
5464
5465 // For any scalar loop that either requires runtime checks or predication we
5466 // are better off leaving this to the unroller. Note that if we've already
5467 // vectorized the loop we will have done the runtime check and so interleaving
5468 // won't require further checks.
5469 bool ScalarInterleavingRequiresPredication =
5470 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5471 return Legal->blockNeedsPredication(BB);
5472 }));
5473 bool ScalarInterleavingRequiresRuntimePointerCheck =
5475
5476 // We want to interleave small loops in order to reduce the loop overhead and
5477 // potentially expose ILP opportunities.
5478 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5479 << "LV: IC is " << IC << '\n'
5480 << "LV: VF is " << VF << '\n');
5481 const bool AggressivelyInterleaveReductions =
5482 TTI.enableAggressiveInterleaving(HasReductions);
5483 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5484 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5485 // We assume that the cost overhead is 1 and we use the cost model
5486 // to estimate the cost of the loop and interleave until the cost of the
5487 // loop overhead is about 5% of the cost of the loop.
5488 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5489 SmallLoopCost / *LoopCost.getValue()));
5490
5491 // Interleave until store/load ports (estimated by max interleave count) are
5492 // saturated.
5493 unsigned NumStores = Legal->getNumStores();
5494 unsigned NumLoads = Legal->getNumLoads();
5495 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5496 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5497
5498 // There is little point in interleaving for reductions containing selects
5499 // and compares when VF=1 since it may just create more overhead than it's
5500 // worth for loops with small trip counts. This is because we still have to
5501 // do the final reduction after the loop.
5502 bool HasSelectCmpReductions =
5503 HasReductions &&
5504 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5505 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5506 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5507 RdxDesc.getRecurrenceKind());
5508 });
5509 if (HasSelectCmpReductions) {
5510 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5511 return 1;
5512 }
5513
5514 // If we have a scalar reduction (vector reductions are already dealt with
5515 // by this point), we can increase the critical path length if the loop
5516 // we're interleaving is inside another loop. For tree-wise reductions
5517 // set the limit to 2, and for ordered reductions it's best to disable
5518 // interleaving entirely.
5519 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5520 bool HasOrderedReductions =
5521 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5522 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5523 return RdxDesc.isOrdered();
5524 });
5525 if (HasOrderedReductions) {
5526 LLVM_DEBUG(
5527 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5528 return 1;
5529 }
5530
5531 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5532 SmallIC = std::min(SmallIC, F);
5533 StoresIC = std::min(StoresIC, F);
5534 LoadsIC = std::min(LoadsIC, F);
5535 }
5536
5538 std::max(StoresIC, LoadsIC) > SmallIC) {
5539 LLVM_DEBUG(
5540 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5541 return std::max(StoresIC, LoadsIC);
5542 }
5543
5544 // If there are scalar reductions and TTI has enabled aggressive
5545 // interleaving for reductions, we will interleave to expose ILP.
5546 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5547 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5548 // Interleave no less than SmallIC but not as aggressively as the normal IC
5549 // to satisfy the rare situation when resources are too limited.
5550 return std::max(IC / 2, SmallIC);
5551 } else {
5552 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5553 return SmallIC;
5554 }
5555 }
5556
5557 // Interleave if this is a large loop (small loops are already dealt with by
5558 // this point) that could benefit from interleaving.
5559 if (AggressivelyInterleaveReductions) {
5560 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5561 return IC;
5562 }
5563
5564 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5565 return 1;
5566}
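// Illustrative sketch (not part of the pass logic): the register-pressure
// part of the heuristic above, with hypothetical numbers.
//
//   unsigned IC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
//                                 std::max(1u, MaxLocalUsers));
//
//   // e.g. 32 vector registers, 2 loop-invariant values and 5 values live at
//   // once give bit_floor((32 - 2) / 5) = bit_floor(6) = 4, which is then
//   // clamped by TTI.getMaxInterleaveFactor() and the trip-count bounds.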
5567
5570 // This function calculates the register usage by measuring the highest number
5571 // of values that are alive at a single location. Obviously, this is a very
5572 // rough estimation. We scan the loop in topological order and
5573 // assign a number to each instruction. We use RPO to ensure that defs are
5574 // met before their users. We assume that each instruction that has in-loop
5575 // users starts an interval. We record every time that an in-loop value is
5576 // used, so we have a list of the first and last occurrences of each
5577 // instruction. Next, we transpose this data structure into a multi map that
5578 // holds the list of intervals that *end* at a specific location. This multi
5579 // map allows us to perform a linear search. We scan the instructions linearly
5580 // and record each time that a new interval starts, by placing it in a set.
5581 // If we find this value in the multi-map then we remove it from the set.
5582 // The max register usage is the maximum size of the set.
5583 // We also search for instructions that are defined outside the loop, but are
5584 // used inside the loop. We need this number separately from the max-interval
5585 // usage number because when we unroll, loop-invariant values do not take
5586 // more registers.
5588 DFS.perform(LI);
5589
5590 RegisterUsage RU;
5591
5592 // Each 'key' in the map opens a new interval. The values
5593 // of the map are the index of the 'last seen' usage of the
5594 // instruction that is the key.
5596
5597 // Maps instruction to its index.
5599 // Marks the end of each interval.
5600 IntervalMap EndPoint;
5601 // Saves the list of instruction indices that are used in the loop.
5603 // Saves the list of values that are used in the loop but are defined outside
5604 // the loop (not including non-instruction values such as arguments and
5605 // constants).
5606 SmallSetVector<Instruction *, 8> LoopInvariants;
5607
5608 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5609 for (Instruction &I : BB->instructionsWithoutDebug()) {
5610 IdxToInstr.push_back(&I);
5611
5612 // Save the end location of each USE.
5613 for (Value *U : I.operands()) {
5614 auto *Instr = dyn_cast<Instruction>(U);
5615
5616 // Ignore non-instruction values such as arguments, constants, etc.
5617 // FIXME: Might need some motivation why these values are ignored. If,
5618 // for example, an argument is used inside the loop, it will increase the
5619 // register pressure (so shouldn't we add it to LoopInvariants?).
5620 if (!Instr)
5621 continue;
5622
5623 // If this instruction is outside the loop then record it and continue.
5624 if (!TheLoop->contains(Instr)) {
5625 LoopInvariants.insert(Instr);
5626 continue;
5627 }
5628
5629 // Overwrite previous end points.
5630 EndPoint[Instr] = IdxToInstr.size();
5631 Ends.insert(Instr);
5632 }
5633 }
5634 }
5635
5636 // Saves the list of intervals that end with the index in 'key'.
5637 using InstrList = SmallVector<Instruction *, 2>;
5638 DenseMap<unsigned, InstrList> TransposeEnds;
5639
5640 // Transpose the EndPoints to a list of values that end at each index.
5641 for (auto &Interval : EndPoint)
5642 TransposeEnds[Interval.second].push_back(Interval.first);
5643
5644 SmallPtrSet<Instruction *, 8> OpenIntervals;
5647
5648 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5649
5650 const auto &TTICapture = TTI;
5651 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5652 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5653 return 0;
5654 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5655 };
5656
5657 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5658 Instruction *I = IdxToInstr[i];
5659
5660 // Remove all of the instructions that end at this location.
5661 InstrList &List = TransposeEnds[i];
5662 for (Instruction *ToRemove : List)
5663 OpenIntervals.erase(ToRemove);
5664
5665 // Ignore instructions that are never used within the loop.
5666 if (!Ends.count(I))
5667 continue;
5668
5669 // Skip ignored values.
5670 if (ValuesToIgnore.count(I))
5671 continue;
5672
5674
5675 // For each VF find the maximum usage of registers.
5676 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5677 // Count the number of registers used, per register class, given all open
5678 // intervals.
5679 // Note that elements in this SmallMapVector will be default constructed
5680 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5681 // there is no previous entry for ClassID.
5683
5684 if (VFs[j].isScalar()) {
5685 for (auto *Inst : OpenIntervals) {
5686 unsigned ClassID =
5687 TTI.getRegisterClassForType(false, Inst->getType());
5688 // FIXME: The target might use more than one register for the type
5689 // even in the scalar case.
5690 RegUsage[ClassID] += 1;
5691 }
5692 } else {
5694 for (auto *Inst : OpenIntervals) {
5695 // Skip ignored values for VF > 1.
5696 if (VecValuesToIgnore.count(Inst))
5697 continue;
5698 if (isScalarAfterVectorization(Inst, VFs[j])) {
5699 unsigned ClassID =
5700 TTI.getRegisterClassForType(false, Inst->getType());
5701 // FIXME: The target might use more than one register for the type
5702 // even in the scalar case.
5703 RegUsage[ClassID] += 1;
5704 } else {
5705 unsigned ClassID =
5706 TTI.getRegisterClassForType(true, Inst->getType());
5707 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5708 }
5709 }
5710 }
5711
5712 for (auto& pair : RegUsage) {
5713 auto &Entry = MaxUsages[j][pair.first];
5714 Entry = std::max(Entry, pair.second);
5715 }
5716 }
5717
5718 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5719 << OpenIntervals.size() << '\n');
5720
5721 // Add the current instruction to the list of open intervals.
5722 OpenIntervals.insert(I);
5723 }
5724
5725 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5726 // Note that elements in this SmallMapVector will be default constructed
5727 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5728 // there is no previous entry for ClassID.
5730
5731 for (auto *Inst : LoopInvariants) {
5732 // FIXME: The target might use more than one register for the type
5733 // even in the scalar case.
5734 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5735 auto *I = cast<Instruction>(U);
5736 return TheLoop != LI->getLoopFor(I->getParent()) ||
5737 isScalarAfterVectorization(I, VFs[i]);
5738 });
5739
5740 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5741 unsigned ClassID =
5742 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5743 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5744 }
5745
5746 LLVM_DEBUG({
5747 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5748 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5749 << " item\n";
5750 for (const auto &pair : MaxUsages[i]) {
5751 dbgs() << "LV(REG): RegisterClass: "
5752 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5753 << " registers\n";
5754 }
5755 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5756 << " item\n";
5757 for (const auto &pair : Invariant) {
5758 dbgs() << "LV(REG): RegisterClass: "
5759 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5760 << " registers\n";
5761 }
5762 });
5763
5764 RU.LoopInvariantRegs = Invariant;
5765 RU.MaxLocalUsers = MaxUsages[i];
5766 RUs[i] = RU;
5767 }
5768
5769 return RUs;
5770}
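// Worked example (hypothetical IR): in a body such as
//   %a = load ...
//   %b = load ...
//   %c = add i32 %a, %b
//   store i32 %c, ...
// both %a and %b are still open when the add is reached, so the maximum
// number of simultaneously open intervals for that register class is 2;
// this is the per-instance usage that the interleaving heuristic divides
// the available registers by.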
5771
5772bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5773 ElementCount VF) {
5774 // TODO: Cost model for emulated masked load/store is completely
5775 // broken. This hack guides the cost model to use an artificially
5776 // high enough value to practically disable vectorization with such
5777 // operations, except where the previously deployed legality hack allowed
5778 // using very low cost values. This is to avoid regressions coming simply
5779 // from moving the "masked load/store" check from legality to cost model.
5780 // Masked Load/Gather emulation was previously never allowed.
5781 // A limited amount of Masked Store/Scatter emulation was allowed.
5783 "Expecting a scalar emulated instruction");
5784 return isa<LoadInst>(I) ||
5785 (isa<StoreInst>(I) &&
5786 NumPredStores > NumberOfStoresToPredicate);
5787}
5788
5790 // If we aren't vectorizing the loop, or if we've already collected the
5791 // instructions to scalarize, there's nothing to do. Collection may already
5792 // have occurred if we have a user-selected VF and are now computing the
5793 // expected cost for interleaving.
5794 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5795 return;
5796
5797 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5798 // not profitable to scalarize any instructions, the presence of VF in the
5799 // map will indicate that we've analyzed it already.
5800 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5801
5802 PredicatedBBsAfterVectorization[VF].clear();
5803
5804 // Find all the instructions that are scalar with predication in the loop and
5805 // determine if it would be better to not if-convert the blocks they are in.
5806 // If so, we also record the instructions to scalarize.
5807 for (BasicBlock *BB : TheLoop->blocks()) {
5809 continue;
5810 for (Instruction &I : *BB)
5811 if (isScalarWithPredication(&I, VF)) {
5812 ScalarCostsTy ScalarCosts;
5813 // Do not apply discount if scalable, because that would lead to
5814 // invalid scalarization costs.
5815 // Do not apply the discount logic if the hacked cost is needed
5816 // for emulated masked memrefs.
5817 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5818 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5819 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5820 // Remember that BB will remain after vectorization.
5821 PredicatedBBsAfterVectorization[VF].insert(BB);
5822 }
5823 }
5824}
5825
5826InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5827 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5828 assert(!isUniformAfterVectorization(PredInst, VF) &&
5829 "Instruction marked uniform-after-vectorization will be predicated");
5830
5831 // Initialize the discount to zero, meaning that the scalar version and the
5832 // vector version cost the same.
5833 InstructionCost Discount = 0;
5834
5835 // Holds instructions to analyze. The instructions we visit are mapped in
5836 // ScalarCosts. Those instructions are the ones that would be scalarized if
5837 // we find that the scalar version costs less.
5839
5840 // Returns true if the given instruction can be scalarized.
5841 auto canBeScalarized = [&](Instruction *I) -> bool {
5842 // We only attempt to scalarize instructions forming a single-use chain
5843 // from the original predicated block that would otherwise be vectorized.
5844 // Although not strictly necessary, we give up on instructions we know will
5845 // already be scalar to avoid traversing chains that are unlikely to be
5846 // beneficial.
5847 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5849 return false;
5850
5851 // If the instruction is scalar with predication, it will be analyzed
5852 // separately. We ignore it within the context of PredInst.
5853 if (isScalarWithPredication(I, VF))
5854 return false;
5855
5856 // If any of the instruction's operands are uniform after vectorization,
5857 // the instruction cannot be scalarized. This prevents, for example, a
5858 // masked load from being scalarized.
5859 //
5860 // We assume we will only emit a value for lane zero of an instruction
5861 // marked uniform after vectorization, rather than VF identical values.
5862 // Thus, if we scalarize an instruction that uses a uniform, we would
5863 // create uses of values corresponding to the lanes we aren't emitting code
5864 // for. This behavior can be changed by allowing getScalarValue to clone
5865 // the lane zero values for uniforms rather than asserting.
5866 for (Use &U : I->operands())
5867 if (auto *J = dyn_cast<Instruction>(U.get()))
5868 if (isUniformAfterVectorization(J, VF))
5869 return false;
5870
5871 // Otherwise, we can scalarize the instruction.
5872 return true;
5873 };
5874
5875 // Compute the expected cost discount from scalarizing the entire expression
5876 // feeding the predicated instruction. We currently only consider expressions
5877 // that are single-use instruction chains.
5878 Worklist.push_back(PredInst);
5879 while (!Worklist.empty()) {
5880 Instruction *I = Worklist.pop_back_val();
5881
5882 // If we've already analyzed the instruction, there's nothing to do.
5883 if (ScalarCosts.contains(I))
5884 continue;
5885
5886 // Compute the cost of the vector instruction. Note that this cost already
5887 // includes the scalarization overhead of the predicated instruction.
5888 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5889
5890 // Compute the cost of the scalarized instruction. This cost is the cost of
5891 // the instruction as if it wasn't if-converted and instead remained in the
5892 // predicated block. We will scale this cost by block probability after
5893 // computing the scalarization overhead.
5894 InstructionCost ScalarCost =
5895 VF.getFixedValue() *
5896 getInstructionCost(I, ElementCount::getFixed(1)).first;
5897
5898 // Compute the scalarization overhead of needed insertelement instructions
5899 // and phi nodes.
5901 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5902 ScalarCost += TTI.getScalarizationOverhead(
5903 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5904 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5905 /*Extract*/ false, CostKind);
5906 ScalarCost +=
5907 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5908 }
5909
5910 // Compute the scalarization overhead of needed extractelement
5911 // instructions. For each of the instruction's operands, if the operand can
5912 // be scalarized, add it to the worklist; otherwise, account for the
5913 // overhead.
5914 for (Use &U : I->operands())
5915 if (auto *J = dyn_cast<Instruction>(U.get())) {
5916 assert(VectorType::isValidElementType(J->getType()) &&
5917 "Instruction has non-scalar type");
5918 if (canBeScalarized(J))
5919 Worklist.push_back(J);
5920 else if (needsExtract(J, VF)) {
5921 ScalarCost += TTI.getScalarizationOverhead(
5922 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5923 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5924 /*Extract*/ true, CostKind);
5925 }
5926 }
5927
5928 // Scale the total scalar cost by block probability.
5929 ScalarCost /= getReciprocalPredBlockProb();
5930
5931 // Compute the discount. A non-negative discount means the vector version
5932 // of the instruction costs more, and scalarizing would be beneficial.
5933 Discount += VectorCost - ScalarCost;
5934 ScalarCosts[I] = ScalarCost;
5935 }
5936
5937 return Discount;
5938}
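// Worked example (hypothetical costs, VF = 4, reciprocal block probability
// 2): a predicated instruction costing 16 when vectorized (including its
// scalarization overhead) and 3 per scalar lane gives a scalar estimate of
// 4 * 3 = 12, scaled to 12 / 2 = 6 for the predicated block. The discount
// is then 16 - 6 = 10 >= 0, so scalarizing the chain is considered
// beneficial.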
5939
5944
5945 // For each block.
5946 for (BasicBlock *BB : TheLoop->blocks()) {
5947 VectorizationCostTy BlockCost;
5948
5949 // For each instruction in the old loop.
5950 for (Instruction &I : BB->instructionsWithoutDebug()) {
5951 // Skip ignored values.
5952 if (ValuesToIgnore.count(&I) ||
5953 (VF.isVector() && VecValuesToIgnore.count(&I)))
5954 continue;
5955
5956 VectorizationCostTy C = getInstructionCost(&I, VF);
5957
5958 // Check if we should override the cost.
5959 if (C.first.isValid() &&
5960 ForceTargetInstructionCost.getNumOccurrences() > 0)
5962
5963 // Keep a list of instructions with invalid costs.
5964 if (Invalid && !C.first.isValid())
5965 Invalid->emplace_back(&I, VF);
5966
5967 BlockCost.first += C.first;
5968 BlockCost.second |= C.second;
5969 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5970 << " for VF " << VF << " For instruction: " << I
5971 << '\n');
5972 }
5973
5974 // If we are vectorizing a predicated block, it will have been
5975 // if-converted. This means that the block's instructions (aside from
5976 // stores and instructions that may divide by zero) will now be
5977 // unconditionally executed. For the scalar case, we may not always execute
5978 // the predicated block, if it is an if-else block. Thus, scale the block's
5979 // cost by the probability of executing it. blockNeedsPredication from
5980 // Legal is used so as to not include all blocks in tail folded loops.
5981 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5982 BlockCost.first /= getReciprocalPredBlockProb();
5983
5984 Cost.first += BlockCost.first;
5985 Cost.second |= BlockCost.second;
5986 }
5987
5988 return Cost;
5989}
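// For illustration (hypothetical cost): in the scalar case, a predicated
// block whose instructions cost 8 contributes 8 / getReciprocalPredBlockProb()
// = 8 / 2 = 4 to the expected cost, reflecting that the block is assumed to
// execute on roughly half of the iterations.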
5990
5991/// Gets Address Access SCEV after verifying that the access pattern
5992 /// is loop invariant except for the induction variable dependence.
5993///
5994/// This SCEV can be sent to the Target in order to estimate the address
5995/// calculation cost.
5997 Value *Ptr,
6000 const Loop *TheLoop) {
6001
6002 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6003 if (!Gep)
6004 return nullptr;
6005
6006 // We are looking for a gep with all loop invariant indices except for one
6007 // which should be an induction variable.
6008 auto SE = PSE.getSE();
6009 unsigned NumOperands = Gep->getNumOperands();
6010 for (unsigned i = 1; i < NumOperands; ++i) {
6011 Value *Opd = Gep->getOperand(i);
6012 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6013 !Legal->isInductionVariable(Opd))
6014 return nullptr;
6015 }
6016
6017 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6018 return PSE.getSCEV(Ptr);
6019}
6020
6022LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6023 ElementCount VF) {
6024 assert(VF.isVector() &&
6025 "Scalarization cost of instruction implies vectorization.");
6026 if (VF.isScalable())
6028
6029 Type *ValTy = getLoadStoreType(I);
6030 auto SE = PSE.getSE();
6031
6032 unsigned AS = getLoadStoreAddressSpace(I);
6034 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6035 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6036 // that it is being called from this specific place.
6037
6038 // Figure out whether the access is strided and get the stride value
6039 // if it's known at compile time.
6040 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6041
6042 // Get the cost of the scalar memory instruction and address computation.
6044 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6045
6046 // Don't pass *I here, since it is scalar but will actually be part of a
6047 // vectorized loop where the user of it is a vectorized instruction.
6049 const Align Alignment = getLoadStoreAlignment(I);
6050 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6051 ValTy->getScalarType(),
6052 Alignment, AS, CostKind);
6053
6054 // Get the overhead of the extractelement and insertelement instructions
6055 // we might create due to scalarization.
6056 Cost += getScalarizationOverhead(I, VF, CostKind);
6057
6058 // If we have a predicated load/store, it will need extra i1 extracts and
6059 // conditional branches, but may not be executed for each vector lane. Scale
6060 // the cost by the probability of executing the predicated block.
6061 if (isPredicatedInst(I)) {
6062 Cost /= getReciprocalPredBlockProb();
6063
6064 // Add the cost of an i1 extract and a branch
6065 auto *Vec_i1Ty =
6068 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6069 /*Insert=*/false, /*Extract=*/true, CostKind);
6070 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6071
6072 if (useEmulatedMaskMemRefHack(I, VF))
6073 // Artificially setting to a high enough value to practically disable
6074 // vectorization with such operations.
6075 Cost = 3000000;
6076 }
6077
6078 return Cost;
6079}
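// As a rough sketch at a fixed VF of 4, the scalarized cost above is four
// address computations plus four scalar memory ops plus the insert/extract
// overhead; for a predicated access the running total is scaled by the block
// execution probability before the i1 extract and branch costs are added,
// unless the emulated-mask hack pins the cost at 3000000.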
6080
6081InstructionCost
6082LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6083 ElementCount VF) {
6084 Type *ValTy = getLoadStoreType(I);
6085 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6087 unsigned AS = getLoadStoreAddressSpace(I);
6088 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6090
6091 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6092 "Stride should be 1 or -1 for consecutive memory access");
6093 const Align Alignment = getLoadStoreAlignment(I);
6095 if (Legal->isMaskRequired(I)) {
6096 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6097 CostKind);
6098 } else {
6099 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6100 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6101 CostKind, OpInfo, I);
6102 }
6103
6104 bool Reverse = ConsecutiveStride < 0;
6105 if (Reverse)
6106 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6107 std::nullopt, CostKind, 0);
6108 return Cost;
6109}
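// E.g. a consecutive load that walks an array backwards (stride -1) is priced
// above as the widened (or masked) memory op plus one reverse shuffle per
// vector value produced.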
6110
6111InstructionCost
6112LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6113 ElementCount VF) {
6114 assert(Legal->isUniformMemOp(*I, VF));
6115
6116 Type *ValTy = getLoadStoreType(I);
6117 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6118 const Align Alignment = getLoadStoreAlignment(I);
6119 unsigned AS = getLoadStoreAddressSpace(I);
6121 if (isa<LoadInst>(I)) {
6122 return TTI.getAddressComputationCost(ValTy) +
6123 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6124 CostKind) +
6126 }
6127 StoreInst *SI = cast<StoreInst>(I);
6128
6129 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6130 return TTI.getAddressComputationCost(ValTy) +
6131 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6132 CostKind) +
6133 (isLoopInvariantStoreValue
6134 ? 0
6135 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6136 CostKind, VF.getKnownMinValue() - 1));
6137}
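// For example, a uniform load at VF=4 is modelled above as one scalar load
// plus a broadcast of the loaded value, while a uniform store of a
// loop-varying value only needs its last lane, hence the extract of element
// VF-1.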
6138
6139InstructionCost
6140LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6141 ElementCount VF) {
6142 Type *ValTy = getLoadStoreType(I);
6143 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6144 const Align Alignment = getLoadStoreAlignment(I);
6146
6147 return TTI.getAddressComputationCost(VectorTy) +
6149 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6151}
6152
6153InstructionCost
6154LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6155 ElementCount VF) {
6156 Type *ValTy = getLoadStoreType(I);
6157 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6158 unsigned AS = getLoadStoreAddressSpace(I);
6160
6161 auto Group = getInterleavedAccessGroup(I);
6162 assert(Group && "Fail to get an interleaved access group.");
6163
6164 unsigned InterleaveFactor = Group->getFactor();
6165 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6166
6167 // Holds the indices of existing members in the interleaved group.
6169 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6170 if (Group->getMember(IF))
6171 Indices.push_back(IF);
6172
6173 // Calculate the cost of the whole interleaved group.
6174 bool UseMaskForGaps =
6175 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6176 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6178 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6179 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6180
6181 if (Group->isReverse()) {
6182 // TODO: Add support for reversed masked interleaved access.
6184 "Reverse masked interleaved access not supported.");
6185 Cost += Group->getNumMembers() *
6187 std::nullopt, CostKind, 0);
6188 }
6189 return Cost;
6190}
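// As an illustration, loads of the x and y fields of a packed { float x, y; }
// stream form a group with factor 2, so the wide vector costed above has
// VF * 2 elements; missing members show up as gaps in Indices and, for
// stores, may require masking via UseMaskForGaps.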
6191
6192std::optional<InstructionCost>
6193LoopVectorizationCostModel::getReductionPatternCost(
6194 Instruction *I, ElementCount VF, Type *Ty,
6196 using namespace llvm::PatternMatch;
6197 // Early exit for no inloop reductions
6198 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6199 return std::nullopt;
6200 auto *VectorTy = cast<VectorType>(Ty);
6201
6202 // We are looking for one of the following patterns, and for its minimal acceptable cost:
6203 // reduce(mul(ext(A), ext(B))) or
6204 // reduce(mul(A, B)) or
6205 // reduce(ext(A)) or
6206 // reduce(A).
6207 // The basic idea is that we walk down the tree to do that, finding the root
6208 // reduction instruction in InLoopReductionImmediateChains. From there we find
6209 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6210 // of the components. If the reduction cost is lower, then we return it for the
6211 // reduction instruction and 0 for the other instructions in the pattern. If
6212 // it is not, we return an invalid cost specifying that the original cost method
6213 // should be used.
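// For instance, at VF=16 the first pattern could look roughly like:
//   %a.ext = sext <16 x i8> %a to <16 x i32>
//   %b.ext = sext <16 x i8> %b to <16 x i32>
//   %mul = mul nsw <16 x i32> %a.ext, %b.ext
//   %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
// On targets with dot-product style instructions the fused cost returned by
// TTI can be much lower than the sum of the individual ext/mul/reduce costs.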
6214 Instruction *RetI = I;
6215 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6216 if (!RetI->hasOneUser())
6217 return std::nullopt;
6218 RetI = RetI->user_back();
6219 }
6220
6221 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6222 RetI->user_back()->getOpcode() == Instruction::Add) {
6223 RetI = RetI->user_back();
6224 }
6225
6226 // Test if the found instruction is a reduction, and if not return an invalid
6227 // cost specifying the parent to use the original cost modelling.
6228 if (!InLoopReductionImmediateChains.count(RetI))
6229 return std::nullopt;
6230
6231 // Find the reduction this chain is a part of and calculate the basic cost of
6232 // the reduction on its own.
6233 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6234 Instruction *ReductionPhi = LastChain;
6235 while (!isa<PHINode>(ReductionPhi))
6236 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6237
6238 const RecurrenceDescriptor &RdxDesc =
6239 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6240
6242 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6243
6244 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6245 // normal fmul instruction to the cost of the fadd reduction.
6246 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6247 BaseCost +=
6248 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6249
6250 // If we're using ordered reductions then we can just return the base cost
6251 // here, since getArithmeticReductionCost calculates the full ordered
6252 // reduction cost when FP reassociation is not allowed.
6253 if (useOrderedReductions(RdxDesc))
6254 return BaseCost;
6255
6256 // Get the operand that was not the reduction chain and match it to one of the
6257 // patterns, returning the better cost if it is found.
6258 Instruction *RedOp = RetI->getOperand(1) == LastChain
6259 ? dyn_cast<Instruction>(RetI->getOperand(0))
6260 : dyn_cast<Instruction>(RetI->getOperand(1));
6261
6262 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6263
6264 Instruction *Op0, *Op1;
6265 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6266 match(RedOp,
6268 match(Op0, m_ZExtOrSExt(m_Value())) &&
6269 Op0->getOpcode() == Op1->getOpcode() &&
6270 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6272 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6273
6274 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6275 // Note that the extend opcodes need to all match, or if A==B they will have
6276 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6277 // which is equally fine.
6278 bool IsUnsigned = isa<ZExtInst>(Op0);
6279 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6280 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6281
6282 InstructionCost ExtCost =
6283 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6285 InstructionCost MulCost =
6286 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6287 InstructionCost Ext2Cost =
6288 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6290
6292 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6293
6294 if (RedCost.isValid() &&
6295 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6296 return I == RetI ? RedCost : 0;
6297 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6298 !TheLoop->isLoopInvariant(RedOp)) {
6299 // Matched reduce(ext(A))
6300 bool IsUnsigned = isa<ZExtInst>(RedOp);
6301 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6303 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6304 RdxDesc.getFastMathFlags(), CostKind);
6305
6306 InstructionCost ExtCost =
6307 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6309 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6310 return I == RetI ? RedCost : 0;
6311 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6312 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6313 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6314 Op0->getOpcode() == Op1->getOpcode() &&
6316 bool IsUnsigned = isa<ZExtInst>(Op0);
6317 Type *Op0Ty = Op0->getOperand(0)->getType();
6318 Type *Op1Ty = Op1->getOperand(0)->getType();
6319 Type *LargestOpTy =
6320 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6321 : Op0Ty;
6322 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6323
6324 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6325 // different sizes. We take the largest type as the ext to reduce, and add
6326 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6328 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6331 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6333 InstructionCost MulCost =
6334 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6335
6337 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6338 InstructionCost ExtraExtCost = 0;
6339 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6340 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6341 ExtraExtCost = TTI.getCastInstrCost(
6342 ExtraExtOp->getOpcode(), ExtType,
6343 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6345 }
6346
6347 if (RedCost.isValid() &&
6348 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6349 return I == RetI ? RedCost : 0;
6350 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6351 // Matched reduce.add(mul())
6352 InstructionCost MulCost =
6353 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6354
6356 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6357
6358 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6359 return I == RetI ? RedCost : 0;
6360 }
6361 }
6362
6363 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6364}
6365
6366InstructionCost
6367LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6368 ElementCount VF) {
6369 // Calculate scalar cost only. Vectorization cost should be ready at this
6370 // moment.
6371 if (VF.isScalar()) {
6372 Type *ValTy = getLoadStoreType(I);
6373 const Align Alignment = getLoadStoreAlignment(I);
6374 unsigned AS = getLoadStoreAddressSpace(I);
6375
6376 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6377 return TTI.getAddressComputationCost(ValTy) +
6378 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6379 TTI::TCK_RecipThroughput, OpInfo, I);
6380 }
6381 return getWideningCost(I, VF);
6382}
6383
6384LoopVectorizationCostModel::VectorizationCostTy
6385LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6386 ElementCount VF) {
6387 // If we know that this instruction will remain uniform, check the cost of
6388 // the scalar version.
6390 VF = ElementCount::getFixed(1);
6391
6392 if (VF.isVector() && isProfitableToScalarize(I, VF))
6393 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6394
6395 // Forced scalars do not have any scalarization overhead.
6396 auto ForcedScalar = ForcedScalars.find(VF);
6397 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6398 auto InstSet = ForcedScalar->second;
6399 if (InstSet.count(I))
6400 return VectorizationCostTy(
6401 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6402 VF.getKnownMinValue()),
6403 false);
6404 }
6405
6406 Type *VectorTy;
6407 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6408
6409 bool TypeNotScalarized = false;
6410 if (VF.isVector() && VectorTy->isVectorTy()) {
6411 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6412 if (VF.isScalable())
6413 // <vscale x 1 x iN> is assumed to be profitable over iN because
6414 // scalable registers are a distinct register class from scalar ones.
6415 // If we ever find a target which wants to lower scalable vectors
6416 // back to scalars, we'll need to update this code to explicitly
6417 // ask TTI about the register class uses for each part.
6418 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6419 else
6420 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6421 } else
6423 }
6424 return VectorizationCostTy(C, TypeNotScalarized);
6425}
6426
6427 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6428 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6429
6430 // There is no mechanism yet to create a scalable scalarization loop,
6431 // so this is currently Invalid.
6432 if (VF.isScalable())
6433 return InstructionCost::getInvalid();
6434
6435 if (VF.isScalar())
6436 return 0;
6437
6439 Type *RetTy = ToVectorTy(I->getType(), VF);
6440 if (!RetTy->isVoidTy() &&
6441 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6443 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6444 /*Insert*/ true,
6445 /*Extract*/ false, CostKind);
6446
6447 // Some targets keep addresses scalar.
6448 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6449 return Cost;
6450
6451 // Some targets support efficient element stores.
6452 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6453 return Cost;
6454
6455 // Collect operands to consider.
6456 CallInst *CI = dyn_cast<CallInst>(I);
6457 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6458
6459 // Skip operands that do not require extraction/scalarization and do not incur
6460 // any overhead.
6462 for (auto *V : filterExtractingOperands(Ops, VF))
6463 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6465 filterExtractingOperands(Ops, VF), Tys, CostKind);
6466}
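// A sketch of what the overhead above covers: scalarizing a call with two
// vector operands at VF=4 needs up to eight extracts for the arguments and
// four inserts to rebuild the result, minus whatever the target reports it
// can do for free via the hooks checked earlier.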
6467
6468void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6469 if (VF.isScalar())
6470 return;
6471 NumPredStores = 0;
6472 for (BasicBlock *BB : TheLoop->blocks()) {
6473 // For each instruction in the old loop.
6474 for (Instruction &I : *BB) {
6476 if (!Ptr)
6477 continue;
6478
6479 // TODO: We should generate better code and update the cost model for
6480 // predicated uniform stores. Today they are treated as any other
6481 // predicated store (see added test cases in
6482 // invariant-store-vectorization.ll).
6483 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6484 NumPredStores++;
6485
6486 if (Legal->isUniformMemOp(I, VF)) {
6487 auto isLegalToScalarize = [&]() {
6488 if (!VF.isScalable())
6489 // Scalarization of fixed length vectors "just works".
6490 return true;
6491
6492 // We have dedicated lowering for unpredicated uniform loads and
6493 // stores. Note that even with tail folding we know that at least
6494 // one lane is active (i.e. generalized predication is not possible
6495 // here), and the logic below depends on this fact.
6496 if (!foldTailByMasking())
6497 return true;
6498
6499 // For scalable vectors, a uniform memop load is always
6500 // uniform-by-parts and we know how to scalarize that.
6501 if (isa<LoadInst>(I))
6502 return true;
6503
6504 // A uniform store isn't necessarily uniform-by-parts
6505 // and we can't assume scalarization.
6506 auto &SI = cast<StoreInst>(I);
6507 return TheLoop->isLoopInvariant(SI.getValueOperand());
6508 };
6509
6510 const InstructionCost GatherScatterCost =
6512 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6513
6514 // Load: Scalar load + broadcast
6515 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6516 // FIXME: This cost is a significant under-estimate for tail folded
6517 // memory ops.
6518 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6519 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6520
6521 // Choose the better solution for the current VF. Note that Invalid
6522 // costs compare as maximally large. If both are invalid, we get a
6523 // scalable invalid cost, which signals a failure and a vectorization abort.
6524 if (GatherScatterCost < ScalarizationCost)
6525 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6526 else
6527 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6528 continue;
6529 }
6530
6531 // We assume that widening is the best solution when possible.
6532 if (memoryInstructionCanBeWidened(&I, VF)) {
6533 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6534 int ConsecutiveStride = Legal->isConsecutivePtr(
6536 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6537 "Expected consecutive stride.");
6538 InstWidening Decision =
6539 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6540 setWideningDecision(&I, VF, Decision, Cost);
6541 continue;
6542 }
6543
6544 // Choose between Interleaving, Gather/Scatter or Scalarization.
6546 unsigned NumAccesses = 1;
6547 if (isAccessInterleaved(&I)) {
6548 auto Group = getInterleavedAccessGroup(&I);
6549 assert(Group && "Fail to get an interleaved access group.");
6550
6551 // Make one decision for the whole group.
6552 if (getWideningDecision(&I, VF) != CM_Unknown)
6553 continue;
6554
6555 NumAccesses = Group->getNumMembers();
6557 InterleaveCost = getInterleaveGroupCost(&I, VF);
6558 }
6559
6560 InstructionCost GatherScatterCost =
6562 ? getGatherScatterCost(&I, VF) * NumAccesses
6564
6565 InstructionCost ScalarizationCost =
6566 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6567
6568 // Choose the better solution for the current VF,
6569 // write down this decision and use it during vectorization.
6571 InstWidening Decision;
6572 if (InterleaveCost <= GatherScatterCost &&
6573 InterleaveCost < ScalarizationCost) {
6574 Decision = CM_Interleave;
6575 Cost = InterleaveCost;
6576 } else if (GatherScatterCost < ScalarizationCost) {
6577 Decision = CM_GatherScatter;
6578 Cost = GatherScatterCost;
6579 } else {
6580 Decision = CM_Scalarize;
6581 Cost = ScalarizationCost;
6582 }
6583 // If the instruction belongs to an interleave group, the whole group
6584 // receives the same decision. The whole group receives the cost, but
6585 // the cost will actually be assigned to one instruction.
6586 if (auto Group = getInterleavedAccessGroup(&I))
6587 setWideningDecision(Group, VF, Decision, Cost);
6588 else
6589 setWideningDecision(&I, VF, Decision, Cost);
6590 }
6591 }
6592
6593 // Make sure that any load of an address and any other address computation
6594 // remains scalar unless there is gather/scatter support. This avoids
6595 // inevitable extracts into address registers, and also has the benefit of
6596 // activating LSR more, since that pass can't optimize vectorized
6597 // addresses.
6599 return;
6600
6601 // Start with all scalar pointer uses.
6603 for (BasicBlock *BB : TheLoop->blocks())
6604 for (Instruction &I : *BB) {
6605 Instruction *PtrDef =
6606 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6607 if (PtrDef && TheLoop->contains(PtrDef) &&
6609 AddrDefs.insert(PtrDef);
6610 }
6611
6612 // Add all instructions used to generate the addresses.
6614 append_range(Worklist, AddrDefs);
6615 while (!Worklist.empty()) {
6616 Instruction *I = Worklist.pop_back_val();
6617 for (auto &Op : I->operands())
6618 if (auto *InstOp = dyn_cast<Instruction>(Op))
6619 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6620 AddrDefs.insert(InstOp).second)
6621 Worklist.push_back(InstOp);
6622 }
6623
6624 for (auto *I : AddrDefs) {
6625 if (isa<LoadInst>(I)) {
6626 // Setting the desired widening decision should ideally be handled by
6627 // cost functions, but since this involves the task of finding out
6628 // if the loaded register is involved in an address computation, it is
6629 // instead changed here when we know this is the case.
6630 InstWidening Decision = getWideningDecision(I, VF);
6631 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6632 // Scalarize a widened load of address.
6634 I, VF, CM_Scalarize,
6635 (VF.getKnownMinValue() *
6636 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6637 else if (auto Group = getInterleavedAccessGroup(I)) {
6638 // Scalarize an interleave group of address loads.
6639 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6640 if (Instruction *Member = Group->getMember(I))
6642 Member, VF, CM_Scalarize,
6643 (VF.getKnownMinValue() *
6644 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6645 }
6646 }
6647 } else
6648 // Make sure I gets scalarized and a cost estimate without
6649 // scalarization overhead.
6650 ForcedScalars[VF].insert(I);
6651 }
6652}
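// Concretely, in a loop computing something like b[a[i]] += 1 on a target
// without gather/scatter support, the load of a[i] feeds an address, so the
// logic above forces it to CM_Scalarize (costed as VF scalar loads) rather
// than widening it and extracting every lane for the address.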
6653
6654void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6655 assert(!VF.isScalar() &&
6656 "Trying to set a vectorization decision for a scalar VF");
6657
6658 for (BasicBlock *BB : TheLoop->blocks()) {
6659 // For each instruction in the old loop.
6660 for (Instruction &I : *BB) {
6661 CallInst *CI = dyn_cast<CallInst>(&I);
6662
6663 if (!CI)
6664 continue;
6665
6666 InstructionCost ScalarCost = InstructionCost::getInvalid();
6667 InstructionCost VectorCost = InstructionCost::getInvalid();
6668 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6670
6671 Function *ScalarFunc = CI->getCalledFunction();
6672 Type *ScalarRetTy = CI->getType();
6673 SmallVector<Type *, 4> Tys, ScalarTys;
6674 bool MaskRequired = Legal->isMaskRequired(CI);
6675 for (auto &ArgOp : CI->args())
6676 ScalarTys.push_back(ArgOp->getType());
6677
6678 // Compute corresponding vector type for return value and arguments.
6679 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6680 for (Type *ScalarTy : ScalarTys)
6681 Tys.push_back(ToVectorTy(ScalarTy, VF));
6682
6683 // An in-loop reduction using an fmuladd intrinsic is a special case;
6684 // we don't want the normal cost for that intrinsic.
6686 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6689 std::nullopt, *RedCost);
6690 continue;
6691 }
6692
6693 // Estimate cost of scalarized vector call. The source operands are
6694 // assumed to be vectors, so we need to extract individual elements from
6695 // there, execute VF scalar calls, and then gather the result into the
6696 // vector return value.
6697 InstructionCost ScalarCallCost =
6698 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6699
6700 // Compute costs of unpacking argument values for the scalar calls and
6701 // packing the return values to a vector.
6702 InstructionCost ScalarizationCost =
6703 getScalarizationOverhead(CI, VF, CostKind);
6704
6705 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6706
6707 // Find the cost of vectorizing the call, if we can find a suitable
6708 // vector variant of the function.
6709 bool UsesMask = false;
6710 VFInfo FuncInfo;
6711 Function *VecFunc = nullptr;
6712 // Search through any available variants for one we can use at this VF.
6713 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6714 // Must match requested VF.
6715 if (Info.Shape.VF != VF)
6716 continue;
6717
6718 // Must take a mask argument if one is required
6719 if (MaskRequired && !Info.isMasked())
6720 continue;
6721
6722 // Check that all parameter kinds are supported
6723 bool ParamsOk = true;
6724 for (VFParameter Param : Info.Shape.Parameters) {
6725 switch (Param.ParamKind) {
6727 break;
6729 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6730 // Make sure the scalar parameter in the loop is invariant.
6731 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6732 TheLoop))
6733 ParamsOk = false;
6734 break;
6735 }
6737 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6738 // Find the stride for the scalar parameter in this loop and see if
6739 // it matches the stride for the variant.
6740 // TODO: do we need to figure out the cost of an extract to get the
6741 // first lane? Or do we hope that it will be folded away?
6742 ScalarEvolution *SE = PSE.getSE();
6743 const auto *SAR =
6744 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6745
6746 if (!SAR || SAR->getLoop() != TheLoop) {
6747 ParamsOk = false;
6748 break;
6749 }
6750
6751 const SCEVConstant *Step =
6752 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6753
6754 if (!Step ||
6755 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6756 ParamsOk = false;
6757
6758 break;
6759 }
6761 UsesMask = true;
6762 break;
6763 default:
6764 ParamsOk = false;
6765 break;
6766 }
6767 }
6768
6769 if (!ParamsOk)
6770 continue;
6771
6772 // Found a suitable candidate, stop here.
6773 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6774 FuncInfo = Info;
6775 break;
6776 }
6777
6778 // Add in the cost of synthesizing a mask if one wasn't required.
6779 InstructionCost MaskCost = 0;
6780 if (VecFunc && UsesMask && !MaskRequired)
6781 MaskCost = TTI.getShuffleCost(
6784 VecFunc->getFunctionType()->getContext()),
6785 VF));
6786
6787 if (TLI && VecFunc && !CI->isNoBuiltin())
6788 VectorCost =
6789 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6790
6791 // Find the cost of an intrinsic; some targets may have instructions that
6792 // perform the operation without needing an actual call.
6794 if (IID != Intrinsic::not_intrinsic)
6795 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6796
6797 InstructionCost Cost = ScalarCost;
6798 InstWidening Decision = CM_Scalarize;
6799
6800 if (VectorCost <= Cost) {
6801 Cost = VectorCost;
6802 Decision = CM_VectorCall;
6803 }
6804
6805 if (IntrinsicCost <= Cost) {
6806 Cost = IntrinsicCost;
6807 Decision = CM_IntrinsicCall;
6808 }
6809
6810 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6812 }
6813 }
6814}
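// For example, given a hypothetical variant
//   declare <4 x double> @vec_foo(<4 x double>)
// advertised for @foo via a vector-function-abi-variant mapping, the loop
// above takes the CM_VectorCall decision at VF=4 when the variant's call cost
// (plus any synthesized all-true mask) beats both the scalarized and the
// intrinsic alternatives.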
6815
6816InstructionCost
6817LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6818 Type *&VectorTy) {
6819 Type *RetTy = I->getType();
6821 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6822 auto SE = PSE.getSE();
6824
6825 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6826 ElementCount VF) -> bool {
6827 if (VF.isScalar())
6828 return true;
6829
6830 auto Scalarized = InstsToScalarize.find(VF);
6831 assert(Scalarized != InstsToScalarize.end() &&
6832 "VF not yet analyzed for scalarization profitability");
6833 return !Scalarized->second.count(I) &&
6834 llvm::all_of(I->users(), [&](User *U) {
6835 auto *UI = cast<Instruction>(U);
6836 return !Scalarized->second.count(UI);
6837 });
6838 };
6839 (void) hasSingleCopyAfterVectorization;
6840
6841 if (isScalarAfterVectorization(I, VF)) {
6842 // With the exception of GEPs and PHIs, after scalarization there should
6843 // only be one copy of the instruction generated in the loop. This is
6844 // because the VF is either 1, or any instructions that need scalarizing
6845 // have already been dealt with by the time we get here. As a result,
6846 // we don't have to multiply the instruction cost by VF.
6847 assert(I->getOpcode() == Instruction::GetElementPtr ||
6848 I->getOpcode() == Instruction::PHI ||
6849 (I->getOpcode() == Instruction::BitCast &&
6850 I->getType()->isPointerTy()) ||
6851 hasSingleCopyAfterVectorization(I, VF));
6852 VectorTy = RetTy;
6853 } else
6854 VectorTy = ToVectorTy(RetTy, VF);
6855
6856 // TODO: We need to estimate the cost of intrinsic calls.
6857 switch (I->getOpcode()) {
6858 case Instruction::GetElementPtr:
6859 // We mark this instruction as zero-cost because the cost of GEPs in
6860 // vectorized code depends on whether the corresponding memory instruction
6861 // is scalarized or not. Therefore, we handle GEPs with the memory
6862 // instruction cost.
6863 return 0;
6864 case Instruction::Br: {
6865 // In cases of scalarized and predicated instructions, there will be VF
6866 // predicated blocks in the vectorized loop. Each branch around these
6867 // blocks also requires an extract of its vector compare i1 element.
6868 bool ScalarPredicatedBB = false;
6869 BranchInst *BI = cast<BranchInst>(I);
6870 if (VF.isVector() && BI->isConditional() &&
6871 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6872 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6873 ScalarPredicatedBB = true;
6874
6875 if (ScalarPredicatedBB) {
6876 // Not possible to scalarize a scalable vector with predicated instructions.
6877 if (VF.isScalable())
6878 return InstructionCost::getInvalid();
6879 // Return cost for branches around scalarized and predicated blocks.
6880 auto *Vec_i1Ty =
6881 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6882 return (
6884 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6885 /*Insert*/ false, /*Extract*/ true, CostKind) +
6886 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6887 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6888 // The back-edge branch will remain, as will all scalar branches.
6889 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6890 else
6891 // This branch will be eliminated by if-conversion.
6892 return 0;
6893 // Note: We currently assume zero cost for an unconditional branch inside
6894 // a predicated block since it will become a fall-through, although we
6895 // may decide in the future to call TTI for all branches.
6896 }
6897 case Instruction::PHI: {
6898 auto *Phi = cast<PHINode>(I);
6899
6900 // First-order recurrences are replaced by vector shuffles inside the loop.
6901 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6903 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6905 cast<VectorType>(VectorTy), Mask, CostKind,
6906 VF.getKnownMinValue() - 1);
6907 }
6908
6909 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6910 // converted into select instructions. We require N - 1 selects per phi
6911 // node, where N is the number of incoming values.
6912 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6913 return (Phi->getNumIncomingValues() - 1) *
6915 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6916 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6918
6919 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6920 }
6921 case Instruction::UDiv:
6922 case Instruction::SDiv:
6923 case Instruction::URem:
6924 case Instruction::SRem:
6925 if (VF.isVector() && isPredicatedInst(I)) {
6926 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6927 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6928 ScalarCost : SafeDivisorCost;
6929 }
6930 // We've proven all lanes safe to speculate, fall through.
6931 [[fallthrough]];
6932 case Instruction::Add:
6933 case Instruction::FAdd:
6934 case Instruction::Sub:
6935 case Instruction::FSub:
6936 case Instruction::Mul:
6937 case Instruction::FMul:
6938 case Instruction::FDiv:
6939 case Instruction::FRem:
6940 case Instruction::Shl:
6941 case Instruction::LShr:
6942 case Instruction::AShr:
6943 case Instruction::And:
6944 case Instruction::Or:
6945 case Instruction::Xor: {
6946 // If we're speculating on the stride being 1, the multiplication may
6947 // fold away. We can generalize this for all operations using the notion
6948 // of neutral elements. (TODO)
6949 if (I->getOpcode() == Instruction::Mul &&
6950 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6951 PSE.getSCEV(I->getOperand(1))->isOne()))
6952 return 0;
6953
6954 // Detect reduction patterns
6955 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6956 return *RedCost;
6957
6958 // Certain instructions can be cheaper to vectorize if they have a constant
6959 // second vector operand. One example of this is shifts on x86.
6960 Value *Op2 = I->getOperand(1);
6961 auto Op2Info = TTI.getOperandInfo(Op2);
6962 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6963 Legal->isInvariant(Op2))
6965
6966 SmallVector<const Value *, 4> Operands(I->operand_values());
6968 I->getOpcode(), VectorTy, CostKind,
6969 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6970 Op2Info, Operands, I, TLI);
6971 }
6972 case Instruction::FNeg: {
6974 I->getOpcode(), VectorTy, CostKind,
6975 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6976 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6977 I->getOperand(0), I);
6978 }
6979 case Instruction::Select: {
6980 SelectInst *SI = cast<SelectInst>(I);
6981 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6982 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6983
6984 const Value *Op0, *Op1;
6985 using namespace llvm::PatternMatch;
6986 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6987 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6988 // select x, y, false --> x & y
6989 // select x, true, y --> x | y
6990 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6991 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6992 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6993 Op1->getType()->getScalarSizeInBits() == 1);
6994
6997 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6998 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6999 }
7000
7001 Type *CondTy = SI->getCondition()->getType();
7002 if (!ScalarCond)
7003 CondTy = VectorType::get(CondTy, VF);
7004
7006 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7007 Pred = Cmp->getPredicate();
7008 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7009 CostKind, I);
7010 }
7011 case Instruction::ICmp:
7012 case Instruction::FCmp: {
7013 Type *ValTy = I->getOperand(0)->getType();
7014 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7015 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7016 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7017 VectorTy = ToVectorTy(ValTy, VF);
7018 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7019 cast<CmpInst>(I)->getPredicate(), CostKind,
7020 I);
7021 }
7022 case Instruction::Store:
7023 case Instruction::Load: {
7024 ElementCount Width = VF;
7025 if (Width.isVector()) {
7026 InstWidening Decision = getWideningDecision(I, Width);
7027 assert(Decision != CM_Unknown &&
7028 "CM decision should be taken at this point");
7031 if (Decision == CM_Scalarize)
7032 Width = ElementCount::getFixed(1);
7033 }
7034 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7035 return getMemoryInstructionCost(I, VF);
7036 }
7037 case Instruction::BitCast:
7038 if (I->getType()->isPointerTy())
7039 return 0;
7040 [[fallthrough]];
7041 case Instruction::ZExt:
7042 case Instruction::SExt:
7043 case Instruction::FPToUI:
7044 case Instruction::FPToSI:
7045 case Instruction::FPExt:
7046 case Instruction::PtrToInt:
7047 case Instruction::IntToPtr:
7048 case Instruction::SIToFP:
7049 case Instruction::UIToFP:
7050 case Instruction::Trunc:
7051 case Instruction::FPTrunc: {
7052 // Computes the CastContextHint from a Load/Store instruction.
7053 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7054 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7055 "Expected a load or a store!");
7056
7057 if (VF.isScalar() || !TheLoop->contains(I))
7059
7060 switch (getWideningDecision(I, VF)) {
7072 llvm_unreachable("Instr did not go through cost modelling?");
7075 llvm_unreachable_internal("Instr has invalid widening decision");
7076 }
7077
7078 llvm_unreachable("Unhandled case!");
7079 };
7080
7081 unsigned Opcode = I->getOpcode();
7083 // For Trunc, the context is the only user, which must be a StoreInst.
7084 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7085 if (I->hasOneUse())
7086 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7087 CCH = ComputeCCH(Store);
7088 }
7089 // For Z/Sext, the context is the operand, which must be a LoadInst.
7090 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7091 Opcode == Instruction::FPExt) {
7092 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7093 CCH = ComputeCCH(Load);
7094 }
7095
7096 // We optimize the truncation of induction variables having constant
7097 // integer steps. The cost of these truncations is the same as the scalar
7098 // operation.
7099 if (isOptimizableIVTruncate(I, VF)) {
7100 auto *Trunc = cast<TruncInst>(I);
7101 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7102 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7103 }
7104
7105 // Detect reduction patterns
7106 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7107 return *RedCost;
7108
7109 Type *SrcScalarTy = I->getOperand(0)->getType();
7110 Type *SrcVecTy =
7111 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7113 // This cast is going to be shrunk. This may remove the cast or it might
7114 // turn it into a slightly different cast. For example, if MinBW == 16,
7115 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7116 //
7117 // Calculate the modified src and dest types.
7118 Type *MinVecTy = VectorTy;
7119 if (Opcode == Instruction::Trunc) {
7120 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7121 VectorTy =
7122 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7123 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7124 // Leave SrcVecTy unchanged - we only shrink the destination element
7125 // type.
7126 VectorTy =
7127 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7128 }
7129 }
7130
7131 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7132 }
7133 case Instruction::Call:
7134 return getVectorCallCost(cast<CallInst>(I), VF);
7135 case Instruction::ExtractValue:
7137 case Instruction::Alloca:
7138 // We cannot easily widen alloca to a scalable alloca, as
7139 // the result would need to be a vector of pointers.
7140 if (VF.isScalable())
7141 return InstructionCost::getInvalid();
7142 [[fallthrough]];
7143 default:
7144 // This opcode is unknown. Assume that it is the same as 'mul'.
7145 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7146 } // end of switch.
7147}
7148
7150 // Ignore ephemeral values.
7152
7153 // Find all stores to invariant variables. Since they are going to sink
7154 // outside the loop we do not need to calculate their cost.
7155 for (BasicBlock *BB : TheLoop->blocks())
7156 for (Instruction &I : *BB) {
7157 StoreInst *SI;
7158 if ((SI = dyn_cast<StoreInst>(&I)) &&
7159 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7160 ValuesToIgnore.insert(&I);
7161 }
7162
7163 // Ignore type-promoting instructions we identified during reduction
7164 // detection.
7165 for (const auto &Reduction : Legal->getReductionVars()) {
7166 const RecurrenceDescriptor &RedDes = Reduction.second;
7167 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7168 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7169 }
7170 // Ignore type-casting instructions we identified during induction
7171 // detection.
7172 for (const auto &Induction : Legal->getInductionVars()) {
7173 const InductionDescriptor &IndDes = Induction.second;
7174 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7175 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7176 }
7177}
7178
7180 for (const auto &Reduction : Legal->getReductionVars()) {
7181 PHINode *Phi = Reduction.first;
7182 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7183
7184 // We don't collect reductions that are type promoted (yet).
7185 if (RdxDesc.getRecurrenceType() != Phi->getType())
7186 continue;
7187
7188 // If the target would prefer this reduction to happen "in-loop", then we
7189 // want to record it as such.
7190 unsigned Opcode = RdxDesc.getOpcode();
7191 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7192 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7194 continue;
7195
7196 // Check that we can correctly put the reductions into the loop, by
7197 // finding the chain of operations that leads from the phi to the loop
7198 // exit value.
7199 SmallVector<Instruction *, 4> ReductionOperations =
7200 RdxDesc.getReductionOpChain(Phi, TheLoop);
7201 bool InLoop = !ReductionOperations.empty();
7202
7203 if (InLoop) {
7204 InLoopReductions.insert(Phi);
7205 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7206 Instruction *LastChain = Phi;
7207 for (auto *I : ReductionOperations) {
7208 InLoopReductionImmediateChains[I] = LastChain;
7209 LastChain = I;
7210 }
7211 }
7212 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7213 << " reduction for phi: " << *Phi << "\n");
7214 }
7215}
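// For a simple integer sum such as
//   %sum = phi i32 [ 0, %preheader ], [ %add, %loop ]
//   %add = add i32 %sum, %val
// the chain recorded above maps %add back to %sum, so getReductionPatternCost
// can later recognise %add as the root of an in-loop reduction.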
7216
7218 DebugLoc DL, const Twine &Name) {
7220 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7221 return tryInsertInstruction(
7222 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7223}
7224
7225// This function will select a scalable VF if the target supports scalable
7226// vectors and a fixed one otherwise.
7227// TODO: we could return a pair of values that specify the max VF and
7228// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7229// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7230// doesn't have a cost model that can choose which plan to execute if
7231// more than one is generated.
7234 unsigned WidestType;
7235 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7236
7241
7243 unsigned N = RegSize.getKnownMinValue() / WidestType;
7244 return ElementCount::get(N, RegSize.isScalable());
7245}
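// E.g. with 128-bit vector registers and a widest loop type of i32 the
// computation above yields a VF of 4, returned as a scalable count when the
// register size itself is scalable.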
7246
7249 ElementCount VF = UserVF;
7250 // Outer loop handling: They may require CFG and instruction level
7251 // transformations before even evaluating whether vectorization is profitable.
7252 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7253 // the vectorization pipeline.
7254 if (!OrigLoop->isInnermost()) {
7255 // If the user doesn't provide a vectorization factor, determine a
7256 // reasonable one.
7257 if (UserVF.isZero()) {
7258 VF = determineVPlanVF(TTI, CM);
7259 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7260
7261 // Make sure we have a VF > 1 for stress testing.
7262 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7263 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7264 << "overriding computed VF.\n");
7265 VF = ElementCount::getFixed(4);
7266 }
7267 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7269 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7270 << "not supported by the target.\n");
7272 "Scalable vectorization requested but not supported by the target",
7273 "the scalable user-specified vectorization width for outer-loop "
7274 "vectorization cannot be used because the target does not support "
7275 "scalable vectors.",
7276 "ScalableVFUnfeasible", ORE, OrigLoop);
7278 }
7279 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7281 "VF needs to be a power of two");
7282 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7283 << "VF " << VF << " to build VPlans.\n");
7284 buildVPlans(VF, VF);
7285
7286 // For VPlan build stress testing, we bail out after VPlan construction.
7289
7290 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7291 }
7292
7293 LLVM_DEBUG(
7294 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7295 "VPlan-native path.\n");
7297}
7298
7299std::optional<VectorizationFactor>
7301 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7304
7305 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7306 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7307 return std::nullopt;
7308
7309 // Invalidate interleave groups if all blocks of loop will be predicated.
7310 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7312 LLVM_DEBUG(
7313 dbgs()
7314 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7315 "which requires masked-interleaved support.\n");
7317 // Invalidating interleave groups also requires invalidating all decisions
7318 // based on them, which includes widening decisions and uniform and scalar
7319 // values.
7321 }
7322
7323 ElementCount MaxUserVF =
7324 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7325 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7326 if (!UserVF.isZero() && UserVFIsLegal) {
7328 "VF needs to be a power of two");
7329 // Collect the instructions (and their associated costs) that will be more
7330 // profitable to scalarize.
7332 if (CM.selectUserVectorizationFactor(UserVF)) {
7333 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7334 buildVPlansWithVPRecipes(UserVF, UserVF);
7335 if (!hasPlanWithVF(UserVF)) {
7336 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7337 << ".\n");
7338 return std::nullopt;
7339 }
7340
7342 return {{UserVF, 0, 0}};
7343 } else
7344 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7345 "InvalidCost", ORE, OrigLoop);
7346 }
7347
7348 // Populate the set of Vectorization Factor Candidates.
7349 ElementCountSet VFCandidates;
7350 for (auto VF = ElementCount::getFixed(1);
7351 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7352 VFCandidates.insert(VF);
7353 for (auto VF = ElementCount::getScalable(1);
7354 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7355 VFCandidates.insert(VF);
7356
7358 for (const auto &VF : VFCandidates) {
7359 // Collect Uniform and Scalar instructions after vectorization with VF.
7361
7362 // Collect the instructions (and their associated costs) that will be more
7363 // profitable to scalarize.
7364 if (VF.isVector())
7366 }
7367
7368 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7369 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7370
7372 if (!MaxFactors.hasVector())
7374
7375 // Select the optimal vectorization factor.
7376 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7377 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7378 if (!hasPlanWithVF(VF.Width)) {
7379 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7380 << ".\n");
7381 return std::nullopt;
7382 }
7383 return VF;
7384}
7385
7387 assert(count_if(VPlans,
7388 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7389 1 &&
7390 "Best VF has not a single VPlan.");
7391
7392 for (const VPlanPtr &Plan : VPlans) {
7393 if (Plan->hasVF(VF))
7394 return *Plan.get();
7395 }
7396 llvm_unreachable("No plan found!");
7397}
7398
7401 // Reserve first location for self reference to the LoopID metadata node.
7402 MDs.push_back(nullptr);
7403 bool IsUnrollMetadata = false;
7404 MDNode *LoopID = L->getLoopID();
7405 if (LoopID) {
7406 // First find existing loop unrolling disable metadata.
7407 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7408 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7409 if (MD) {
7410 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7411 IsUnrollMetadata =
7412 S && S->getString().starts_with("llvm.loop.unroll.disable");
7413 }
7414 MDs.push_back(LoopID->getOperand(i));
7415 }
7416 }
7417
7418 if (!IsUnrollMetadata) {
7419 // Add runtime unroll disable metadata.
7420 LLVMContext &Context = L->getHeader()->getContext();
7421 SmallVector<Metadata *, 1> DisableOperands;
7422 DisableOperands.push_back(
7423 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7424 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7425 MDs.push_back(DisableNode);
7426 MDNode *NewLoopID = MDNode::get(Context, MDs);
7427 // Set operand 0 to refer to the loop id itself.
7428 NewLoopID->replaceOperandWith(0, NewLoopID);
7429 L->setLoopID(NewLoopID);
7430 }
7431}
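// The resulting loop metadata is roughly:
//   !llvm.loop !0
//   !0 = distinct !{!0, <existing operands...>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}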
7432
7433// Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7434// create a merge phi node for it and add it to \p ReductionResumeValues.
7436 VPInstruction *RedResult,
7438 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7439 if (!RedResult ||
7441 return;
7442
7443 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7444 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7445
7446 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7447 Value *FinalValue =
7448 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7449 auto *ResumePhi =
7450 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7451
7452 // TODO: bc.merge.rdx should not be created here, instead it should be
7453 // modeled in VPlan.
7454 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7455 // Create a phi node that merges control-flow from the backedge-taken check
7456 // block and the middle block.
7457 auto *BCBlockPhi =
7458 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7459 LoopScalarPreHeader->getTerminator()->getIterator());
7460
7461 // If we are fixing reductions in the epilogue loop then we should already
7462 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7463 // we carry over the incoming values correctly.
7464 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7465 if (Incoming == LoopMiddleBlock)
7466 BCBlockPhi->addIncoming(FinalValue, Incoming);
7467 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7468 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7469 Incoming);
7470 else
7471 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7472 }
7473
7474 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7475 // TODO: This fixup should instead be modeled in VPlan.
7476 // Fix the scalar loop reduction variable with the incoming reduction sum
7477 // from the vector body and from the backedge value.
7478 int IncomingEdgeBlockIdx =
7479 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7480 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7481 // Pick the other block.
7482 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7483 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7484 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7485 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7486
7487 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7488}
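// The merge phi built above looks roughly like
//   %bc.merge.rdx = phi i32 [ %rdx.start, %iter.check ],
//                           [ %final.rdx, %middle.block ]
// in the scalar preheader, and becomes the incoming value of the scalar
// loop's reduction phi on its non-latch edge.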
7489
7490std::pair<DenseMap<const SCEV *, Value *>,
7493 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7494 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7495 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7496 assert(BestVPlan.hasVF(BestVF) &&
7497 "Trying to execute plan with unsupported VF");
7498 assert(BestVPlan.hasUF(BestUF) &&
7499 "Trying to execute plan with unsupported UF");
7500 assert(
7501 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7502 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7503
7504 if (!IsEpilogueVectorization)
7505 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7506
7507 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7508 << ", UF=" << BestUF << '\n');
7509 BestVPlan.setName("Final VPlan");
7510 LLVM_DEBUG(BestVPlan.dump());
7511
7512 // Perform the actual loop transformation.
7513 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7514 OrigLoop->getHeader()->getContext());
7515
7516 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7517 // before making any changes to the CFG.
7518 if (!BestVPlan.getPreheader()->empty()) {
7519 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7521 BestVPlan.getPreheader()->execute(&State);
7522 }
7523 if (!ILV.getTripCount())
7524 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7525 else
7526 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7527 "count during epilogue vectorization");
7528
7529 // 1. Set up the skeleton for vectorization, including vector pre-header and
7530 // middle block. The vector loop is created during VPlan execution.
7531 Value *CanonicalIVStartValue;
7532 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7533 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7534 : State.ExpandedSCEVs);
7535
7536 // Only use noalias metadata when using memory checks guaranteeing no overlap
7537 // across all iterations.
7538 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7539 std::unique_ptr<LoopVersioning> LVer = nullptr;
7540 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7542
7543 // We currently don't use LoopVersioning for the actual loop cloning but we
7544 // still use it to add the noalias metadata.
7545 // TODO: Find a better way to re-use LoopVersioning functionality to add
7546 // metadata.
7547 LVer = std::make_unique<LoopVersioning>(
7548 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7549 PSE.getSE());
7550 State.LVer = &*LVer;
7552 }
7553
7555
7556 //===------------------------------------------------===//
7557 //
7558 // Notice: any optimization or new instruction that goes
7559 // into the code below should also be implemented in
7560 // the cost-model.
7561 //
7562 //===------------------------------------------------===//
7563
7564 // 2. Copy and widen instructions from the old loop into the new loop.
7565 BestVPlan.prepareToExecute(ILV.getTripCount(),
7566 ILV.getOrCreateVectorTripCount(nullptr),
7567 CanonicalIVStartValue, State);
7568
7569 BestVPlan.execute(&State);
7570
7571 // 2.5 Collect reduction resume values.
7573 auto *ExitVPBB =
7574 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7575 for (VPRecipeBase &R : *ExitVPBB) {
7576 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7577 ReductionResumeValues, State, OrigLoop,
7578 State.CFG.VPBB2IRBB[ExitVPBB]);
7579 }
7580
7581 // 2.6. Maintain Loop Hints
7582 // Keep all loop hints from the original loop on the vector loop (we'll
7583 // replace the vectorizer-specific hints below).
7584 MDNode *OrigLoopID = OrigLoop->getLoopID();
7585
7586 std::optional<MDNode *> VectorizedLoopID =
7589
7590 VPBasicBlock *HeaderVPBB =
7592 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7593 if (VectorizedLoopID)
7594 L->setLoopID(*VectorizedLoopID);
7595 else {
7596 // Keep all loop hints from the original loop on the vector loop (we'll
7597 // replace the vectorizer-specific hints below).
7598 if (MDNode *LID = OrigLoop->getLoopID())
7599 L->setLoopID(LID);
7600
7601 LoopVectorizeHints Hints(L, true, *ORE);
7602 Hints.setAlreadyVectorized();
7603 }
7605 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7606 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7608
7609 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7610 // predication, updating analyses.
7611 ILV.fixVectorizedLoop(State, BestVPlan);
7612
7614
7615 return {State.ExpandedSCEVs, ReductionResumeValues};
7616}
7617
7618#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7620 for (const auto &Plan : VPlans)
7622 Plan->printDOT(O);
7623 else
7624 Plan->print(O);
7625}
7626#endif
7627
7628//===--------------------------------------------------------------------===//
7629// EpilogueVectorizerMainLoop
7630//===--------------------------------------------------------------------===//
7631
7632/// This function is partially responsible for generating the control flow
7633/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7634std::pair<BasicBlock *, Value *>
7636 const SCEV2ValueTy &ExpandedSCEVs) {
7638
7639 // Generate the code to check the minimum iteration count of the vector
7640 // epilogue (see below).
7644
7645 // Generate the code to check any assumptions that we've made for SCEV
7646 // expressions.
7648
7649 // Generate the code that checks at runtime if arrays overlap. We put the
7650 // checks into a separate block to make the more common case of few elements
7651 // faster.
7653
7654 // Generate the iteration count check for the main loop, *after* the check
7655 // for the epilogue loop, so that the path-length is shorter for the case
7656 // that goes directly through the vector epilogue. The longer path length for
7657 // the main loop is compensated for by the gain from vectorizing the larger
7658 // trip count. Note: the branch will get updated later on when we vectorize
7659 // the epilogue.
7662
7663 // Generate the induction variable.
7665
7666 // Skip creating induction resume values here because they will be created in
7667 // the second pass for the scalar loop. The induction resume values for the
7668 // inductions in the epilogue loop are created before executing the plan for
7669 // the epilogue loop.
7670
7671 return {completeLoopSkeleton(), nullptr};
7672}
7673
7674void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7675 LLVM_DEBUG({
7676 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7677 << "Main Loop VF:" << EPI.MainLoopVF
7678 << ", Main Loop UF:" << EPI.MainLoopUF
7679 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7680 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7681 });
7682}
7683
7684void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7685 LLVM_DEBUG({
7686 dbgs() << "intermediate fn:\n"
7687 << *OrigLoop->getHeader()->getParent() << "\n";
7688 });
7689}
7690
7691BasicBlock *
7692EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7693 bool ForEpilogue) {
7694 assert(Bypass && "Expected valid bypass basic block.");
7695 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7696 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7697 Value *Count = getTripCount();
7698 // Reuse existing vector loop preheader for TC checks.
7699 // Note that new preheader block is generated for vector loop.
7700 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7701 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7702
7703 // Generate code to check if the loop's trip count is less than VF * UF of the
7704 // main vector loop.
7705 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7706 : VF.isVector())
7707 ? ICmpInst::ICMP_ULE
7708 : ICmpInst::ICMP_ULT;
7709
7710 Value *CheckMinIters = Builder.CreateICmp(
7711 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7712 "min.iters.check");
7713
7714 if (!ForEpilogue)
7715 TCCheckBlock->setName("vector.main.loop.iter.check");
7716
7717 // Create new preheader for vector loop.
7718 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7719 DT, LI, nullptr, "vector.ph");
7720
7721 if (ForEpilogue) {
7722 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7723 DT->getNode(Bypass)->getIDom()) &&
7724 "TC check is expected to dominate Bypass");
7725
7726 // Update dominator for Bypass & LoopExit.
7727 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7728 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7729 // For loops with multiple exits, there's no edge from the middle block
7730 // to exit blocks (as the epilogue must run) and thus no need to update
7731 // the immediate dominator of the exit blocks.
7732 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7733
7734 LoopBypassBlocks.push_back(TCCheckBlock);
7735
7736 // Save the trip count so we don't have to regenerate it in the
7737 // vec.epilog.iter.check. This is safe to do because the trip count
7738 // generated here dominates the vector epilog iter check.
7739 EPI.TripCount = Count;
7740 }
7741
7742 BranchInst &BI =
7743 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7746 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7747
7748 return TCCheckBlock;
7749}
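// For illustration only (assuming a fixed-width target with main loop VF=8
// and UF=2, ForEpilogue=false): the check built above lowers to IR roughly of
// this shape, taking the bypass edge when fewer than VF * UF = 16 iterations
// are available:
//
//   vector.main.loop.iter.check:
//     %min.iters.check = icmp ult i64 %trip.count, 16
//     br i1 %min.iters.check, label %bypass, label %vector.ph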
7750
7751//===--------------------------------------------------------------------===//
7752// EpilogueVectorizerEpilogueLoop
7753//===--------------------------------------------------------------------===//
7754
7755/// This function is partially responsible for generating the control flow
7756/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7757std::pair<BasicBlock *, Value *>
7759 const SCEV2ValueTy &ExpandedSCEVs) {
7760 createVectorLoopSkeleton("vec.epilog.");
7761
7762 // Now, compare the remaining count and if there aren't enough iterations to
7763 // execute the vectorized epilogue skip to the scalar part.
7764 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7765 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7768 LI, nullptr, "vec.epilog.ph");
7770 VecEpilogueIterationCountCheck);
7771
7772 // Adjust the control flow taking the state info from the main loop
7773 // vectorization into account.
7775 "expected this to be saved from the previous pass.");
7777 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7778
7781
7783 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7784
7785 if (EPI.SCEVSafetyCheck)
7787 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7788 if (EPI.MemSafetyCheck)
7790 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7791
7793 VecEpilogueIterationCountCheck,
7794 VecEpilogueIterationCountCheck->getSinglePredecessor());
7795
7798 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7799 // If there is an epilogue which must run, there's no edge from the
7800 // middle block to exit blocks and thus no need to update the immediate
7801 // dominator of the exit blocks.
7804
7805 // Keep track of bypass blocks, as they feed start values to the induction and
7806 // reduction phis in the scalar loop preheader.
7807 if (EPI.SCEVSafetyCheck)
7809 if (EPI.MemSafetyCheck)
7812
7813 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7814 // reductions which merge control-flow from the latch block and the middle
7815 // block. Update the incoming values here and move the Phi into the preheader.
7816 SmallVector<PHINode *, 4> PhisInBlock;
7817 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7818 PhisInBlock.push_back(&Phi);
7819
7820 for (PHINode *Phi : PhisInBlock) {
7821 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7822 Phi->replaceIncomingBlockWith(
7823 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7824 VecEpilogueIterationCountCheck);
7825
7826 // If the phi doesn't have an incoming value from the
7827 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7828 // value and also those from other check blocks. This is needed for
7829 // reduction phis only.
7830 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7831 return EPI.EpilogueIterationCountCheck == IncB;
7832 }))
7833 continue;
7834 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7835 if (EPI.SCEVSafetyCheck)
7836 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7837 if (EPI.MemSafetyCheck)
7838 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7839 }
7840
7841 // Generate a resume induction for the vector epilogue and put it in the
7842 // vector epilogue preheader
7843 Type *IdxTy = Legal->getWidestInductionType();
7844 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7846 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7847 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7849
7850 // Generate induction resume values. These variables save the new starting
7851 // indexes for the scalar loop. They are used to test if there are any tail
7852 // iterations left once the vector loop has completed.
7853 // Note that when the vectorized epilogue is skipped due to the iteration
7854 // count check, the resume value for the induction variable comes from
7855 // the trip count of the main vector loop, hence passing the AdditionalBypass
7856 // argument.
7857 createInductionResumeValues(ExpandedSCEVs,
7858 {VecEpilogueIterationCountCheck,
7859 EPI.VectorTripCount} /* AdditionalBypass */);
7860
7861 return {completeLoopSkeleton(), EPResumeVal};
7862}
7863
7864BasicBlock *
7866 BasicBlock *Bypass, BasicBlock *Insert) {
7867
7869 "Expected trip count to have been saved in the first pass.");
7870 assert(
7871 (!isa<Instruction>(EPI.TripCount) ||
7872 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7873 "saved trip count does not dominate insertion point.");
7874 Value *TC = EPI.TripCount;
7875 IRBuilder<> Builder(Insert->getTerminator());
7876 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7877
7878 // Generate code to check if the loop's trip count is less than VF * UF of the
7879 // vector epilogue loop.
7880 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7883
7884 Value *CheckMinIters =
7885 Builder.CreateICmp(P, Count,
7888 "min.epilog.iters.check");
7889
7890 BranchInst &BI =
7891 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7893 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7894 unsigned EpilogueLoopStep =
7896 // We assume the remaining `Count` is uniformly distributed in
7897 // [0, MainLoopStep), so the probability of `Count < EpilogueLoopStep`
7898 // should be:
7899 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7900 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7901 const uint32_t Weights[] = {EstimatedSkipCount,
7902 MainLoopStep - EstimatedSkipCount};
7903 setBranchWeights(BI, Weights);
7904 }
7905 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7906
7907 LoopBypassBlocks.push_back(Insert);
7908 return Insert;
7909}
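// Worked example of the branch-weight estimate above (illustrative values):
// MainLoopStep = 16 * 2 = 32 and EpilogueLoopStep = 8 * 1 = 8 give
// EstimatedSkipCount = min(32, 8) = 8, so the weights are {8, 32 - 8} =
// {8, 24}, i.e. roughly a 25% estimated probability that the remaining
// iteration count is too small for the vector epilogue.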
7910
7912 LLVM_DEBUG({
7913 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7914 << "Epilogue Loop VF:" << EPI.EpilogueVF
7915 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7916 });
7917}
7918
7921 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7922 });
7923}
7924
7926 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7927 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7928 bool PredicateAtRangeStart = Predicate(Range.Start);
7929
7930 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7931 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7932 Range.End = TmpVF;
7933 break;
7934 }
7935
7936 return PredicateAtRangeStart;
7937}
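// For illustration, assume a predicate that holds only for VF <= 8 and an
// incoming Range of [4, 32): the predicate is true at the range start (VF=4),
// still true at VF=8, and flips at VF=16, so Range.End is clamped to 16 and
// true is returned. All VFs in the clamped range [4, 16) then share the same
// decision.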
7938
7939/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7940/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7941/// of VF's starting at a given VF and extending it as much as possible. Each
7942/// vectorization decision can potentially shorten this sub-range during
7943/// buildVPlan().
7945 ElementCount MaxVF) {
7946 auto MaxVFTimes2 = MaxVF * 2;
7947 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7948 VFRange SubRange = {VF, MaxVFTimes2};
7949 VPlans.push_back(buildVPlan(SubRange));
7950 VF = SubRange.End;
7951 }
7952}
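// Example (illustrative): with MinVF=4 and MaxVF=16 the loop above starts
// from the sub-range [4, 32). Each buildVPlan call may clamp the sub-range's
// End, so the result could be separate VPlans for [4, 8), [8, 16) and
// [16, 32), or a single VPlan for [4, 32) if no decision differs across VFs.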
7953
7954iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7956 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7957 if (auto *I = dyn_cast<Instruction>(Op)) {
7958 if (auto *R = Ingredient2Recipe.lookup(I))
7959 return R->getVPSingleValue();
7960 }
7961 return Plan.getOrAddLiveIn(Op);
7962 };
7963 return map_range(Operands, Fn);
7964}
7965
7967 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7968
7969 // Look for cached value.
7970 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7971 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7972 if (ECEntryIt != EdgeMaskCache.end())
7973 return ECEntryIt->second;
7974
7975 VPValue *SrcMask = getBlockInMask(Src);
7976
7977 // The terminator has to be a branch inst!
7978 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7979 assert(BI && "Unexpected terminator found");
7980
7981 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7982 return EdgeMaskCache[Edge] = SrcMask;
7983
7984 // If source is an exiting block, we know the exit edge is dynamically dead
7985 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7986 // adding uses of an otherwise potentially dead instruction.
7987 if (OrigLoop->isLoopExiting(Src))
7988 return EdgeMaskCache[Edge] = SrcMask;
7989
7990 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7991 assert(EdgeMask && "No Edge Mask found for condition");
7992
7993 if (BI->getSuccessor(0) != Dst)
7994 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7995
7996 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7997 // The condition is 'SrcMask && EdgeMask', which is equivalent to
7998 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7999 // The select version does not introduce new UB if SrcMask is false and
8000 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8001 VPValue *False = Plan.getOrAddLiveIn(
8003 EdgeMask =
8004 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8005 }
8006
8007 return EdgeMaskCache[Edge] = EdgeMask;
8008}
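// Minimal sketch of the mask produced for a conditional edge Src -> Dst,
// assuming tail folding so Src has a non-null block-in mask %src.mask:
//
//   %edge.cond = widened condition of Src's terminator
//                (negated first if Dst is the false successor)
//   %edge.mask = select <VF x i1> %src.mask, %edge.cond, zeroinitializer
//
// i.e. the select form of 'SrcMask && EdgeMask' described above.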
8009
8011 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8012
8013 // Look for cached value.
8014 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8015 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8016 assert(ECEntryIt != EdgeMaskCache.end() &&
8017 "looking up mask for edge which has not been created");
8018 return ECEntryIt->second;
8019}
8020
8022 BasicBlock *Header = OrigLoop->getHeader();
8023
8024 // When not folding the tail, use nullptr to model all-true mask.
8025 if (!CM.foldTailByMasking()) {
8026 BlockMaskCache[Header] = nullptr;
8027 return;
8028 }
8029
8030 // Introduce the early-exit compare IV <= BTC to form header block mask.
8031 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8032 // constructing the desired canonical IV in the header block as its first
8033 // non-phi instructions.
8034
8035 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8036 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8037 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8038 HeaderVPBB->insert(IV, NewInsertionPoint);
8039
8040 VPBuilder::InsertPointGuard Guard(Builder);
8041 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8042 VPValue *BlockMask = nullptr;
8043 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8044 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8045 BlockMaskCache[Header] = BlockMask;
8046}
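// For illustration, with tail folding and VF=4 the header mask compares each
// lane's index against the backedge-taken count:
//
//   %iv.vec = <iv, iv+1, iv+2, iv+3>
//   %mask   = icmp ule <4 x i64> %iv.vec, splat(%btc)
//
// so lanes past the trip count are masked off; as noted above, ULE against
// BTC is used rather than ULT against TC because TC may wrap.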
8047
8049 // Return the cached value.
8050 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8051 assert(BCEntryIt != BlockMaskCache.end() &&
8052 "Trying to access mask for block without one.");
8053 return BCEntryIt->second;
8054}
8055
8057 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8058 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8059 assert(OrigLoop->getHeader() != BB &&
8060 "Loop header must have cached block mask");
8061
8062 // All-one mask is modelled as no-mask following the convention for masked
8063 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8064 VPValue *BlockMask = nullptr;
8065 // This is the block mask. We OR all incoming edges.
8066 for (auto *Predecessor : predecessors(BB)) {
8067 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8068 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8069 BlockMaskCache[BB] = EdgeMask;
8070 return;
8071 }
8072
8073 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8074 BlockMask = EdgeMask;
8075 continue;
8076 }
8077
8078 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8079 }
8080
8081 BlockMaskCache[BB] = BlockMask;
8082}
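// Sketch of the result for a block BB with two predecessors P1 and P2
// (illustrative names):
//
//   BlockMask(BB) = EdgeMask(P1, BB) | EdgeMask(P2, BB)
//
// where a nullptr edge mask means "all ones", in which case the whole block
// mask collapses to nullptr and no masking is needed.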
8083
8085VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8086 VFRange &Range) {
8087 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8088 "Must be called with either a load or store");
8089
8090 auto willWiden = [&](ElementCount VF) -> bool {
8092 CM.getWideningDecision(I, VF);
8094 "CM decision should be taken at this point.");
8096 return true;
8097 if (CM.isScalarAfterVectorization(I, VF) ||
8098 CM.isProfitableToScalarize(I, VF))
8099 return false;
8101 };
8102
8104 return nullptr;
8105
8106 VPValue *Mask = nullptr;
8107 if (Legal->isMaskRequired(I))
8108 Mask = getBlockInMask(I->getParent());
8109
8110 // Determine if the pointer operand of the access is either consecutive or
8111 // reverse consecutive.
8113 CM.getWideningDecision(I, Range.Start);
8115 bool Consecutive =
8117
8118 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8119 if (Consecutive) {
8120 auto *GEP = dyn_cast<GetElementPtrInst>(
8121 Ptr->getUnderlyingValue()->stripPointerCasts());
8122 auto *VectorPtr = new VPVectorPointerRecipe(
8123 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8124 I->getDebugLoc());
8125 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8126 Ptr = VectorPtr;
8127 }
8128 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8129 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8130 I->getDebugLoc());
8131
8132 StoreInst *Store = cast<StoreInst>(I);
8133 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8134 Reverse, I->getDebugLoc());
8135}
8136
8137/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8138/// insert a recipe to expand the step for the induction recipe.
8141 VPValue *Start, const InductionDescriptor &IndDesc,
8142 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8143 VFRange &Range) {
8144 assert(IndDesc.getStartValue() ==
8145 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8146 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8147 "step must be loop invariant");
8148
8149 VPValue *Step =
8151 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8152 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8153 }
8154 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8155 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8156}
8157
8158VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8160
8161 // Check if this is an integer or fp induction. If so, build the recipe that
8162 // produces its scalar and vector values.
8163 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8164 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8165 *PSE.getSE(), *OrigLoop, Range);
8166
8167 // Check if this is pointer induction. If so, build the recipe for it.
8168 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8169 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8170 *PSE.getSE());
8172 Phi, Operands[0], Step, *II,
8174 [&](ElementCount VF) {
8175 return CM.isScalarAfterVectorization(Phi, VF);
8176 },
8177 Range));
8178 }
8179 return nullptr;
8180}
8181
8182VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8184 // Optimize the special case where the source is a constant integer
8185 // induction variable. Notice that we can only optimize the 'trunc' case
8186 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8187 // (c) other casts depend on pointer size.
8188
8189 // Determine whether \p K is a truncation based on an induction variable that
8190 // can be optimized.
8191 auto isOptimizableIVTruncate =
8192 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8193 return [=](ElementCount VF) -> bool {
8194 return CM.isOptimizableIVTruncate(K, VF);
8195 };
8196 };
8197
8199 isOptimizableIVTruncate(I), Range)) {
8200
8201 auto *Phi = cast<PHINode>(I->getOperand(0));
8202 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8203 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8204 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8205 *OrigLoop, Range);
8206 }
8207 return nullptr;
8208}
8209
8210VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8212 unsigned NumIncoming = Phi->getNumIncomingValues();
8213
8214 // We know that all PHIs in non-header blocks are converted into selects, so
8215 // we don't have to worry about the insertion order and we can just use the
8216 // builder. At this point we generate the predication tree. There may be
8217 // duplications since this is a simple recursive scan, but future
8218 // optimizations will clean it up.
8219 // TODO: At the moment the first mask is always skipped, but it would be
8220 // better to skip the most expensive mask.
8221 SmallVector<VPValue *, 2> OperandsWithMask;
8222
8223 for (unsigned In = 0; In < NumIncoming; In++) {
8224 OperandsWithMask.push_back(Operands[In]);
8225 VPValue *EdgeMask =
8226 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8227 if (!EdgeMask) {
8228 assert(In == 0 && "Both null and non-null edge masks found");
8229 assert(all_equal(Operands) &&
8230 "Distinct incoming values with one having a full mask");
8231 break;
8232 }
8233 if (In == 0)
8234 continue;
8235 OperandsWithMask.push_back(EdgeMask);
8236 }
8237 return new VPBlendRecipe(Phi, OperandsWithMask);
8238}
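// Illustrative operand layout of the resulting VPBlendRecipe for a phi with
// three incoming values: {V0, V1, M1, V2, M2} -- the first incoming value is
// kept unmasked (its mask is the one skipped above), and every later value is
// paired with the mask of its incoming edge.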
8239
8240VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8242 VFRange &Range) {
8244 [this, CI](ElementCount VF) {
8245 return CM.isScalarWithPredication(CI, VF);
8246 },
8247 Range);
8248
8249 if (IsPredicated)
8250 return nullptr;
8251
8253 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8254 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8255 ID == Intrinsic::pseudoprobe ||
8256 ID == Intrinsic::experimental_noalias_scope_decl))
8257 return nullptr;
8258
8259 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8260
8261 // Is it beneficial to perform intrinsic call compared to lib call?
8262 bool ShouldUseVectorIntrinsic =
8264 [&](ElementCount VF) -> bool {
8265 return CM.getCallWideningDecision(CI, VF).Kind ==
8267 },
8268 Range);
8269 if (ShouldUseVectorIntrinsic)
8270 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8271 CI->getDebugLoc());
8272
8273 Function *Variant = nullptr;
8274 std::optional<unsigned> MaskPos;
8275 // Is it better to call a vectorized version of the function than to
8276 // scalarize the call?
8277 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8278 [&](ElementCount VF) -> bool {
8279 // The following case may be scalarized depending on the VF.
8280 // The flag indicates whether we can use a plain call for the vectorized
8281 // version of the instruction.
8282
8283 // If we've found a variant at a previous VF, then stop looking. A
8284 // vectorized variant of a function expects input in a certain shape
8285 // -- basically the number of input registers, the number of lanes
8286 // per register, and whether there's a mask required.
8287 // We store a pointer to the variant in the VPWidenCallRecipe, so
8288 // once we have an appropriate variant it's only valid for that VF.
8289 // This will force a different vplan to be generated for each VF that
8290 // finds a valid variant.
8291 if (Variant)
8292 return false;
8294 CM.getCallWideningDecision(CI, VF);
8296 Variant = Decision.Variant;
8297 MaskPos = Decision.MaskPos;
8298 return true;
8299 }
8300
8301 return false;
8302 },
8303 Range);
8304 if (ShouldUseVectorCall) {
8305 if (MaskPos.has_value()) {
8306 // We have 2 cases that would require a mask:
8307 // 1) The block needs to be predicated, either due to a conditional
8308 // in the scalar loop or use of an active lane mask with
8309 // tail-folding, and we use the appropriate mask for the block.
8310 // 2) No mask is required for the block, but the only available
8311 // vector variant at this VF requires a mask, so we synthesize an
8312 // all-true mask.
8313 VPValue *Mask = nullptr;
8314 if (Legal->isMaskRequired(CI))
8315 Mask = getBlockInMask(CI->getParent());
8316 else
8318 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8319
8320 Ops.insert(Ops.begin() + *MaskPos, Mask);
8321 }
8322
8323 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8325 Variant);
8326 }
8327
8328 return nullptr;
8329}
8330
8331bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8332 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8333 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8334 // Instruction should be widened, unless it is scalar after vectorization,
8335 // scalarization is profitable or it is predicated.
8336 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8337 return CM.isScalarAfterVectorization(I, VF) ||
8338 CM.isProfitableToScalarize(I, VF) ||
8339 CM.isScalarWithPredication(I, VF);
8340 };
8342 Range);
8343}
8344
8345VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8347 VPBasicBlock *VPBB) {
8348 switch (I->getOpcode()) {
8349 default:
8350 return nullptr;
8351 case Instruction::SDiv:
8352 case Instruction::UDiv:
8353 case Instruction::SRem:
8354 case Instruction::URem: {
8355 // If not provably safe, use a select to form a safe divisor before widening the
8356 // div/rem operation itself. Otherwise fall through to general handling below.
8357 if (CM.isPredicatedInst(I)) {
8358 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8359 VPValue *Mask = getBlockInMask(I->getParent());
8360 VPValue *One =
8361 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8362 auto *SafeRHS =
8363 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8364 I->getDebugLoc());
8365 VPBB->appendRecipe(SafeRHS);
8366 Ops[1] = SafeRHS;
8367 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8368 }
8369 [[fallthrough]];
8370 }
8371 case Instruction::Add:
8372 case Instruction::And:
8373 case Instruction::AShr:
8374 case Instruction::FAdd:
8375 case Instruction::FCmp:
8376 case Instruction::FDiv:
8377 case Instruction::FMul:
8378 case Instruction::FNeg:
8379 case Instruction::FRem:
8380 case Instruction::FSub:
8381 case Instruction::ICmp:
8382 case Instruction::LShr:
8383 case Instruction::Mul:
8384 case Instruction::Or:
8385 case Instruction::Select:
8386 case Instruction::Shl:
8387 case Instruction::Sub:
8388 case Instruction::Xor:
8389 case Instruction::Freeze:
8390 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8391 };
8392}
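// Sketch of the safe-divisor transform above for a predicated udiv, assuming
// the block-in mask is %m:
//
//   %safe.rhs = select <VF x i1> %m, %rhs, splat(1)
//   %div      = udiv %lhs, %safe.rhs
//
// Masked-off lanes divide by 1, so no immediate UB is introduced for lanes
// the scalar loop would never have executed.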
8393
8395 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8396 for (VPHeaderPHIRecipe *R : PhisToFix) {
8397 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8398 VPRecipeBase *IncR =
8399 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8400 R->addOperand(IncR->getVPSingleValue());
8401 }
8402}
8403
8405 VFRange &Range) {
8407 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8408 Range);
8409
8410 bool IsPredicated = CM.isPredicatedInst(I);
8411
8412 // Even if the instruction is not marked as uniform, there are certain
8413 // intrinsic calls that can be effectively treated as such, so we check for
8414 // them here. Conservatively, we only do this for scalable vectors, since
8415 // for fixed-width VFs we can always fall back on full scalarization.
8416 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8417 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8418 case Intrinsic::assume:
8419 case Intrinsic::lifetime_start:
8420 case Intrinsic::lifetime_end:
8421 // For scalable vectors, if one of the operands is variant then we still
8422 // want to mark it as uniform, which will generate one instruction for just
8423 // the first lane of the vector. We can't scalarize the call in the same
8424 // way as for fixed-width vectors because we don't know how many lanes
8425 // there are.
8426 //
8427 // The reasons for doing it this way for scalable vectors are:
8428 // 1. For the assume intrinsic generating the instruction for the first
8429 // lane is still better than not generating any at all. For
8430 // example, the input may be a splat across all lanes.
8431 // 2. For the lifetime start/end intrinsics the pointer operand only
8432 // does anything useful when the input comes from a stack object,
8433 // which suggests it should always be uniform. For non-stack objects
8434 // the effect is to poison the object, which still allows us to
8435 // remove the call.
8436 IsUniform = true;
8437 break;
8438 default:
8439 break;
8440 }
8441 }
8442 VPValue *BlockInMask = nullptr;
8443 if (!IsPredicated) {
8444 // Finalize the recipe for Instr, first if it is not predicated.
8445 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8446 } else {
8447 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8448 // Instructions marked for predication are replicated and a mask operand is
8449 // added initially. Masked replicate recipes will later be placed under an
8450 // if-then construct to prevent side-effects. Generate recipes to compute
8451 // the block mask for this region.
8452 BlockInMask = getBlockInMask(I->getParent());
8453 }
8454
8455 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8456 IsUniform, BlockInMask);
8457 return Recipe;
8458}
8459
8463 VFRange &Range, VPBasicBlock *VPBB) {
8464 // First, check for specific widening recipes that deal with inductions, Phi
8465 // nodes, calls and memory operations.
8466 VPRecipeBase *Recipe;
8467 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8468 if (Phi->getParent() != OrigLoop->getHeader())
8469 return tryToBlend(Phi, Operands);
8470
8471 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8472 return Recipe;
8473
8474 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8475 assert((Legal->isReductionVariable(Phi) ||
8476 Legal->isFixedOrderRecurrence(Phi)) &&
8477 "can only widen reductions and fixed-order recurrences here");
8478 VPValue *StartV = Operands[0];
8479 if (Legal->isReductionVariable(Phi)) {
8480 const RecurrenceDescriptor &RdxDesc =
8481 Legal->getReductionVars().find(Phi)->second;
8482 assert(RdxDesc.getRecurrenceStartValue() ==
8483 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8484 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8485 CM.isInLoopReduction(Phi),
8486 CM.useOrderedReductions(RdxDesc));
8487 } else {
8488 // TODO: Currently fixed-order recurrences are modeled as chains of
8489 // first-order recurrences. If there are no users of the intermediate
8490 // recurrences in the chain, the fixed order recurrence should be modeled
8491 // directly, enabling more efficient codegen.
8492 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8493 }
8494
8495 PhisToFix.push_back(PhiRecipe);
8496 return PhiRecipe;
8497 }
8498
8499 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8500 cast<TruncInst>(Instr), Operands, Range)))
8501 return Recipe;
8502
8503 // All widen recipes below deal only with VF > 1.
8505 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8506 return nullptr;
8507
8508 if (auto *CI = dyn_cast<CallInst>(Instr))
8509 return tryToWidenCall(CI, Operands, Range);
8510
8511 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8512 return tryToWidenMemory(Instr, Operands, Range);
8513
8514 if (!shouldWiden(Instr, Range))
8515 return nullptr;
8516
8517 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8518 return new VPWidenGEPRecipe(GEP,
8519 make_range(Operands.begin(), Operands.end()));
8520
8521 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8522 return new VPWidenSelectRecipe(
8523 *SI, make_range(Operands.begin(), Operands.end()));
8524 }
8525
8526 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8527 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8528 *CI);
8529 }
8530
8531 return tryToWiden(Instr, Operands, VPBB);
8532}
8533
8534void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8535 ElementCount MaxVF) {
8536 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8537
8538 auto MaxVFTimes2 = MaxVF * 2;
8539 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8540 VFRange SubRange = {VF, MaxVFTimes2};
8541 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8542 // Now optimize the initial VPlan.
8543 if (!Plan->hasVF(ElementCount::getFixed(1)))
8545 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8546 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8547 // TODO: try to put it close to addActiveLaneMask().
8548 if (CM.foldTailWithEVL())
8550 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8551 VPlans.push_back(std::move(Plan));
8552 }
8553 VF = SubRange.End;
8554 }
8555}
8556
8557// Add the necessary canonical IV and branch recipes required to control the
8558// loop.
8559static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8560 DebugLoc DL) {
8561 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8562 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8563
8564 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8565 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8566 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8567 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8568 Header->insert(CanonicalIVPHI, Header->begin());
8569
8570 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8571 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8572 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8573 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8574 "index.next");
8575 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8576
8577 // Add the BranchOnCount VPInstruction to the latch.
8579 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8580}
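// The recipes added above correspond roughly to the following VPlan fragment
// (printed form is illustrative):
//
//   vector.body:
//     EMIT vp<%iv> = CANONICAL-INDUCTION ir<0>, vp<%iv.next>
//     ...
//   vector.latch:
//     EMIT vp<%iv.next> = add nuw vp<%iv>, vp<%VFxUF>
//     EMIT branch-on-count vp<%iv.next>, vp<%vector.trip.count>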
8581
8582// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8583// original exit block.
8584static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8585 VPRecipeBuilder &Builder, VPlan &Plan) {
8586 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8587 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8588 // Only handle single-exit loops with unique exit blocks for now.
8589 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8590 return;
8591
8592 // Introduce VPUsers modeling the exit values.
8593 for (PHINode &ExitPhi : ExitBB->phis()) {
8594 Value *IncomingValue =
8595 ExitPhi.getIncomingValueForBlock(ExitingBB);
8596 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8597 Plan.addLiveOut(&ExitPhi, V);
8598 }
8599}
8600
8602LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8603
8605
8606 // ---------------------------------------------------------------------------
8607 // Build initial VPlan: Scan the body of the loop in a topological order to
8608 // visit each basic block after having visited its predecessor basic blocks.
8609 // ---------------------------------------------------------------------------
8610
8611 // Create initial VPlan skeleton, having a basic block for the pre-header
8612 // which contains SCEV expansions that need to happen before the CFG is
8613 // modified; a basic block for the vector pre-header, followed by a region for
8614 // the vector loop, followed by the middle basic block. The skeleton vector
8615 // loop region contains a header and latch basic blocks.
8617 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8618 *PSE.getSE());
8619 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8620 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8621 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8622 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8623 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8624
8625 // Don't use getDecisionAndClampRange here, because we don't know the UF
8626 // so this function is better to be conservative, rather than to split
8627 // it up into different VPlans.
8628 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8629 bool IVUpdateMayOverflow = false;
8630 for (ElementCount VF : Range)
8631 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8632
8634 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8635 // When not folding the tail, we know that the induction increment will not
8636 // overflow.
8637 bool HasNUW = Style == TailFoldingStyle::None;
8638 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8639
8640 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8641
8642 // ---------------------------------------------------------------------------
8643 // Pre-construction: record ingredients whose recipes we'll need to further
8644 // process after constructing the initial VPlan.
8645 // ---------------------------------------------------------------------------
8646
8647 // For each interleave group which is relevant for this (possibly trimmed)
8648 // Range, add it to the set of groups to be later applied to the VPlan and add
8649 // placeholders for its members' Recipes which we'll be replacing with a
8650 // single VPInterleaveRecipe.
8652 auto applyIG = [IG, this](ElementCount VF) -> bool {
8653 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8654 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8656 // For scalable vectors, the only interleave factor currently supported
8657 // is 2 since we require the (de)interleave2 intrinsics instead of
8658 // shufflevectors.
8659 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8660 "Unsupported interleave factor for scalable vectors");
8661 return Result;
8662 };
8663 if (!getDecisionAndClampRange(applyIG, Range))
8664 continue;
8665 InterleaveGroups.insert(IG);
8666 };
8667
8668 // ---------------------------------------------------------------------------
8669 // Construct recipes for the instructions in the loop
8670 // ---------------------------------------------------------------------------
8671
8672 // Scan the body of the loop in a topological order to visit each basic block
8673 // after having visited its predecessor basic blocks.
8674 LoopBlocksDFS DFS(OrigLoop);
8675 DFS.perform(LI);
8676
8677 VPBasicBlock *VPBB = HeaderVPBB;
8678 BasicBlock *HeaderBB = OrigLoop->getHeader();
8679 bool NeedsMasks =
8680 CM.foldTailByMasking() ||
8681 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8682 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8683 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8684 });
8685 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8686 // Relevant instructions from basic block BB will be grouped into VPRecipe
8687 // ingredients and fill a new VPBasicBlock.
8688 if (VPBB != HeaderVPBB)
8689 VPBB->setName(BB->getName());
8690 Builder.setInsertPoint(VPBB);
8691
8692 if (VPBB == HeaderVPBB)
8693 RecipeBuilder.createHeaderMask();
8694 else if (NeedsMasks)
8695 RecipeBuilder.createBlockInMask(BB);
8696
8697 // Introduce each ingredient into VPlan.
8698 // TODO: Model and preserve debug intrinsics in VPlan.
8699 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8700 Instruction *Instr = &I;
8702 auto *Phi = dyn_cast<PHINode>(Instr);
8703 if (Phi && Phi->getParent() == HeaderBB) {
8704 Operands.push_back(Plan->getOrAddLiveIn(
8705 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8706 } else {
8707 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8708 Operands = {OpRange.begin(), OpRange.end()};
8709 }
8710
8711 // Invariant stores inside the loop will be deleted, and a single store
8712 // with the final reduction value will be added to the exit block.
8713 StoreInst *SI;
8714 if ((SI = dyn_cast<StoreInst>(&I)) &&
8715 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8716 continue;
8717
8718 VPRecipeBase *Recipe =
8719 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8720 if (!Recipe)
8721 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8722
8723 RecipeBuilder.setRecipe(Instr, Recipe);
8724 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8725 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8726 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8727 // recipes and need to be moved to the phi section of HeaderVPBB:
8728 // * tail-folding (non-phi recipes computing the header mask are
8729 // introduced earlier than regular header phi recipes, and should appear
8730 // after them)
8731 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8732
8733 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8734 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8735 "unexpected recipe needs moving");
8736 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8737 } else
8738 VPBB->appendRecipe(Recipe);
8739 }
8740
8742 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8743 }
8744
8745 // After here, VPBB should not be used.
8746 VPBB = nullptr;
8747
8748 if (CM.requiresScalarEpilogue(Range)) {
8749 // No edge from the middle block to the unique exit block has been inserted
8750 // and there is nothing to fix from the vector loop; phis should have
8751 // incoming values from the scalar loop only.
8752 } else
8753 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8754
8755 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8756 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8757 "entry block must be set to a VPRegionBlock having a non-empty entry "
8758 "VPBasicBlock");
8759 RecipeBuilder.fixHeaderPhis();
8760
8761 // ---------------------------------------------------------------------------
8762 // Transform initial VPlan: Apply previously taken decisions, in order, to
8763 // bring the VPlan to its final state.
8764 // ---------------------------------------------------------------------------
8765
8766 // Adjust the recipes for any inloop reductions.
8767 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8768
8769 // Interleave memory: for each Interleave Group we marked earlier as relevant
8770 // for this VPlan, replace the Recipes widening its memory instructions with a
8771 // single VPInterleaveRecipe at its insertion point.
8772 for (const auto *IG : InterleaveGroups) {
8773 auto *Recipe =
8774 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8775 SmallVector<VPValue *, 4> StoredValues;
8776 for (unsigned i = 0; i < IG->getFactor(); ++i)
8777 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8778 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8779 StoredValues.push_back(StoreR->getStoredValue());
8780 }
8781
8782 bool NeedsMaskForGaps =
8783 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8784 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8785 Recipe->getMask(), NeedsMaskForGaps);
8786 VPIG->insertBefore(Recipe);
8787 unsigned J = 0;
8788 for (unsigned i = 0; i < IG->getFactor(); ++i)
8789 if (Instruction *Member = IG->getMember(i)) {
8790 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8791 if (!Member->getType()->isVoidTy()) {
8792 VPValue *OriginalV = MemberR->getVPSingleValue();
8793 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8794 J++;
8795 }
8796 MemberR->eraseFromParent();
8797 }
8798 }
8799
8800 for (ElementCount VF : Range)
8801 Plan->addVF(VF);
8802 Plan->setName("Initial VPlan");
8803
8804 // Replace VPValues for known constant strides guaranteed by predicate scalar
8805 // evolution.
8806 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8807 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8808 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8809 // Only handle constant strides for now.
8810 if (!ScevStride)
8811 continue;
8812 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8813
8814 auto *ConstVPV = Plan->getOrAddLiveIn(CI);
8815 // The versioned value may not be used in the loop directly, so just add a
8816 // new live-in in those cases.
8817 Plan->getOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8818 }
8819
8820 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
8821 return Legal->blockNeedsPredication(BB);
8822 });
8823
8824 // Sink users of fixed-order recurrence past the recipe defining the previous
8825 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8826 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8827 return nullptr;
8828
8829 if (useActiveLaneMask(Style)) {
8830 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8831 // TailFoldingStyle is visible there.
8832 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8833 bool WithoutRuntimeCheck =
8835 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8836 WithoutRuntimeCheck);
8837 }
8838 return Plan;
8839}
8840
8841VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8842 // Outer loop handling: They may require CFG and instruction level
8843 // transformations before even evaluating whether vectorization is profitable.
8844 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8845 // the vectorization pipeline.
8846 assert(!OrigLoop->isInnermost());
8847 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8848
8849 // Create new empty VPlan
8850 auto Plan = VPlan::createInitialVPlan(
8851 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8852 *PSE.getSE());
8853
8854 // Build hierarchical CFG
8855 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8856 HCFGBuilder.buildHierarchicalCFG();
8857
8858 for (ElementCount VF : Range)
8859 Plan->addVF(VF);
8860
8862 Plan,
8863 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8864 *PSE.getSE(), *TLI);
8865
8866 // Remove the existing terminator of the exiting block of the top-most region.
8867 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8868 auto *Term =
8869 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8870 Term->eraseFromParent();
8871
8872 // Tail folding is not supported for outer loops, so the induction increment
8873 // is guaranteed to not wrap.
8874 bool HasNUW = true;
8875 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8876 DebugLoc());
8877 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8878 return Plan;
8879}
8880
8881// Adjust the recipes for reductions. For in-loop reductions the chain of
8882// instructions leading from the loop exit instr to the phi need to be converted
8883// to reductions, with one operand being vector and the other being the scalar
8884// reduction chain. For other reductions, a select is introduced between the phi
8885// and live-out recipes when folding the tail.
8886//
8887// A ComputeReductionResult recipe is added to the middle block, also for
8888// in-loop reductions which compute their result in-loop, because generating
8889// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8890void LoopVectorizationPlanner::adjustRecipesForReductions(
8891 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8892 ElementCount MinVF) {
8893 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8894 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8895 // Gather all VPReductionPHIRecipes and sort them so that intermediate
8896 // stores sunk outside of the loop keep the same order as they had in the
8897 // original loop.
8898 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8899 for (VPRecipeBase &R : Header->phis()) {
8900 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8901 ReductionPHIList.emplace_back(ReductionPhi);
8902 }
8903 bool HasIntermediateStore = false;
8904 stable_sort(ReductionPHIList,
8905 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8906 const VPReductionPHIRecipe *R2) {
8907 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8908 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8909 HasIntermediateStore |= IS1 || IS2;
8910
8911 // If neither of the recipes has an intermediate store, keep the
8912 // order the same.
8913 if (!IS1 && !IS2)
8914 return false;
8915
8916 // If only one of the recipes has an intermediate store, then
8917 // move it towards the beginning of the list.
8918 if (IS1 && !IS2)
8919 return true;
8920
8921 if (!IS1 && IS2)
8922 return false;
8923
8924 // If both recipes have an intermediate store, then the recipe
8925 // with the later store should be processed earlier. So it
8926 // should go to the beginning of the list.
8927 return DT->dominates(IS2, IS1);
8928 });
8929
8930 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8931 for (VPRecipeBase *R : ReductionPHIList)
8932 R->moveBefore(*Header, Header->getFirstNonPhi());
8933
8934 for (VPRecipeBase &R : Header->phis()) {
8935 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8936 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8937 continue;
8938
8939 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8940 RecurKind Kind = RdxDesc.getRecurrenceKind();
8942 "AnyOf reductions are not allowed for in-loop reductions");
8943
8944 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8946 Worklist.insert(PhiR);
8947 for (unsigned I = 0; I != Worklist.size(); ++I) {
8948 VPSingleDefRecipe *Cur = Worklist[I];
8949 for (VPUser *U : Cur->users()) {
8950 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8951 if (!UserRecipe) {
8952 assert(isa<VPLiveOut>(U) &&
8953 "U must either be a VPSingleDef or VPLiveOut");
8954 continue;
8955 }
8956 Worklist.insert(UserRecipe);
8957 }
8958 }
8959
8960 // Visit operation "Links" along the reduction chain top-down starting from
8961 // the phi until LoopExitValue. We keep track of the previous item
8962 // (PreviousLink) to tell which of the two operands of a Link will remain
8963 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8964 // the select instructions. Blend recipes of in-loop reduction phi's will
8965 // get folded to their non-phi operand, as the reduction recipe handles the
8966 // condition directly.
8967 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8968 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8969 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8970
8971 // Index of the first operand which holds a non-mask vector operand.
8972 unsigned IndexOfFirstOperand;
8973 // Recognize a call to the llvm.fmuladd intrinsic.
8974 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8975 VPValue *VecOp;
8976 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8977 if (IsFMulAdd) {
8978 assert(
8980 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8981 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8982 isa<VPWidenCallRecipe>(CurrentLink)) &&
8983 CurrentLink->getOperand(2) == PreviousLink &&
8984 "expected a call where the previous link is the added operand");
8985
8986 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8987 // need to create an fmul recipe (multiplying the first two operands of
8988 // the fmuladd together) to use as the vector operand for the fadd
8989 // reduction.
8990 VPInstruction *FMulRecipe = new VPInstruction(
8991 Instruction::FMul,
8992 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8993 CurrentLinkI->getFastMathFlags());
8994 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8995 VecOp = FMulRecipe;
8996 } else {
8997 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8998 if (PhiR->isInLoop() && Blend) {
8999 assert(Blend->getNumIncomingValues() == 2 &&
9000 "Blend must have 2 incoming values");
9001 if (Blend->getIncomingValue(0) == PhiR)
9002 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9003 else {
9004 assert(Blend->getIncomingValue(1) == PhiR &&
9005 "PhiR must be an operand of the blend");
9006 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9007 }
9008 continue;
9009 }
9010
9012 if (isa<VPWidenRecipe>(CurrentLink)) {
9013 assert(isa<CmpInst>(CurrentLinkI) &&
9014 "need to have the compare of the select");
9015 continue;
9016 }
9017 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9018 "must be a select recipe");
9019 IndexOfFirstOperand = 1;
9020 } else {
9021 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9022 "Expected to replace a VPWidenSC");
9023 IndexOfFirstOperand = 0;
9024 }
9025 // Note that for non-commutable operands (cmp-selects), the semantics of
9026 // the cmp-select are captured in the recurrence kind.
9027 unsigned VecOpId =
9028 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9029 ? IndexOfFirstOperand + 1
9030 : IndexOfFirstOperand;
9031 VecOp = CurrentLink->getOperand(VecOpId);
9032 assert(VecOp != PreviousLink &&
9033 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9034 (VecOpId - IndexOfFirstOperand)) ==
9035 PreviousLink &&
9036 "PreviousLink must be the operand other than VecOp");
9037 }
9038
9039 BasicBlock *BB = CurrentLinkI->getParent();
9040 VPValue *CondOp = nullptr;
9042 CondOp = RecipeBuilder.getBlockInMask(BB);
9043
9044 VPReductionRecipe *RedRecipe =
9045 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9046 CondOp, CM.useOrderedReductions(RdxDesc));
9047 // Append the recipe to the end of the VPBasicBlock because we need to
9048 // ensure that it comes after all of its inputs, including CondOp.
9049 // Note that this transformation may leave over dead recipes (including
9050 // CurrentLink), which will be cleaned by a later VPlan transform.
9051 LinkVPBB->appendRecipe(RedRecipe);
9052 CurrentLink->replaceAllUsesWith(RedRecipe);
9053 PreviousLink = RedRecipe;
9054 }
9055 }
9056 Builder.setInsertPoint(&*LatchVPBB->begin());
9057 for (VPRecipeBase &R :
9058 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9059 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9060 if (!PhiR)
9061 continue;
9062
9063 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9064 // If tail is folded by masking, introduce selects between the phi
9065 // and the live-out instruction of each reduction, at the beginning of the
9066 // dedicated latch block.
9067 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9068 auto *NewExitingVPV = PhiR->getBackedgeValue();
9069 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9070 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9071 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9072 "reduction recipe must be defined before latch");
9073 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9074 std::optional<FastMathFlags> FMFs =
9075 PhiTy->isFloatingPointTy()
9076 ? std::make_optional(RdxDesc.getFastMathFlags())
9077 : std::nullopt;
9078 NewExitingVPV =
9079 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9080 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9081 return isa<VPInstruction>(&U) &&
9082 cast<VPInstruction>(&U)->getOpcode() ==
9084 });
9087 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9089 PhiR->setOperand(1, NewExitingVPV);
9090 }
9091
9092 // If the vector reduction can be performed in a smaller type, we truncate
9093 // then extend the loop exit value to enable InstCombine to evaluate the
9094 // entire expression in the smaller type.
9095 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9096 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9097 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9098 Type *RdxTy = RdxDesc.getRecurrenceType();
9099 auto *Trunc =
9100 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9101 auto *Extnd =
9102 RdxDesc.isSigned()
9103 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9104 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9105
9106 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9107 Extnd->insertAfter(Trunc);
9108 if (PhiR->getOperand(1) == NewExitingVPV)
9109 PhiR->setOperand(1, Extnd->getVPSingleValue());
9110 NewExitingVPV = Extnd;
9111 }
9112
9113 // We want code in the middle block to appear to execute on the location of
9114 // the scalar loop's latch terminator because: (a) it is all compiler
9115 // generated, (b) these instructions are always executed after evaluating
9116 // the latch conditional branch, and (c) other passes may add new
9117 // predecessors which terminate on this line. This is the easiest way to
9118 // ensure we don't accidentally cause an extra step back into the loop while
9119 // debugging.
9120 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9121
9122 // TODO: At the moment ComputeReductionResult also drives creation of the
9123 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9124 // even for in-loop reductions, until the reduction resume value handling is
9125 // also modeled in VPlan.
9126 auto *FinalReductionResult = new VPInstruction(
9127 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9128 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9129 ->appendRecipe(FinalReductionResult);
9130 OrigExitingVPV->replaceUsesWithIf(
9131 FinalReductionResult,
9132 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9133 }
9134
9136}
9137
9138#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9140 VPSlotTracker &SlotTracker) const {
9141 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9142 IG->getInsertPos()->printAsOperand(O, false);
9143 O << ", ";
9144 getAddr()->printAsOperand(O, SlotTracker);
9145 VPValue *Mask = getMask();
9146 if (Mask) {
9147 O << ", ";
9148 Mask->printAsOperand(O, SlotTracker);
9149 }
9150
9151 unsigned OpIdx = 0;
9152 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9153 if (!IG->getMember(i))
9154 continue;
9155 if (getNumStoreOperands() > 0) {
9156 O << "\n" << Indent << " store ";
9157 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9158 O << " to index " << i;
9159 } else {
9160 O << "\n" << Indent << " ";
9162 O << " = load from index " << i;
9163 }
9164 ++OpIdx;
9165 }
9166}
9167#endif
9168
9171 "Not a pointer induction according to InductionDescriptor!");
9172 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9173 "Unexpected type.");
9175 "Recipe should have been replaced");
9176
9177 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9178 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9179 Type *PhiType = IndDesc.getStep()->getType();
9180
9181 // Build a pointer phi
9182 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9183 Type *ScStValueType = ScalarStartValue->getType();
9184 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9185 CanonicalIV->getIterator());
9186
9187 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9188 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9189
9190 // A pointer induction, performed by using a gep
9191 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9192
9193 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9194 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9195 Value *NumUnrolledElems =
9196 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9197 Value *InductionGEP = GetElementPtrInst::Create(
9198 State.Builder.getInt8Ty(), NewPointerPhi,
9199 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9200 InductionLoc);
9201 // Add induction update using an incorrect block temporarily. The phi node
9202 // will be fixed after VPlan execution. Note that at this point the latch
9203 // block cannot be used, as it does not exist yet.
9204 // TODO: Model increment value in VPlan, by turning the recipe into a
9205 // multi-def and a subclass of VPHeaderPHIRecipe.
9206 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9207
9208 // Create UF many actual address geps that use the pointer
9209 // phi as base and a vectorized version of the step value
9210 // (<step*0, ..., step*N>) as offset.
9211 for (unsigned Part = 0; Part < State.UF; ++Part) {
9212 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9213 Value *StartOffsetScalar =
9214 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9215 Value *StartOffset =
9216 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9217 // Create a vector of consecutive numbers from zero to VF.
9218 StartOffset = State.Builder.CreateAdd(
9219 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9220
9221 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9222 "scalar step must be the same across all parts");
9223 Value *GEP = State.Builder.CreateGEP(
9224 State.Builder.getInt8Ty(), NewPointerPhi,
9225 State.Builder.CreateMul(
9226 StartOffset,
9227 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9228 "vector.gep"));
9229 State.set(this, GEP, Part);
9230 }
9231}
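// Illustrative sketch (standalone C++, not LLVM API): the byte offsets fed to
// the per-part GEPs built above, spelled out for fixed example values of Step,
// VF and UF. Lane L of part P addresses
//   PointerPhi + (VF * P + L) * Step   bytes past the pointer phi,
// and the back-edge increment advances the phi by Step * VF * UF.
static void pointerInductionOffsets(long long Step, unsigned VF, unsigned UF,
                                    long long *Offsets /* VF * UF entries */) {
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Offsets[Part * VF + Lane] = (long long)(Part * VF + Lane) * Step;
  // e.g. Step = 4, VF = 4, UF = 2  ->  0, 4, 8, 12, 16, 20, 24, 28
}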
9232
9233 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9234 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9235
9236 // Fast-math-flags propagate from the original induction instruction.
9237 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9238 if (FPBinOp)
9239 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9240
9241 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9242 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9243 Value *DerivedIV = emitTransformedIndex(
9244 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9245 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9246 DerivedIV->setName("offset.idx");
9247 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9248
9249 State.set(this, DerivedIV, VPIteration(0, 0));
9250}
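// Illustrative sketch (standalone C++, not LLVM API): the transform applied by
// emitTransformedIndex for a plain integer induction, i.e. the derived value
// "offset.idx" is Start + CanonicalIV * Step. Floating-point and pointer
// inductions apply the analogous operation with their own operators; the
// integer case shown here is an assumption chosen for simplicity.
static long long derivedInductionValue(long long CanonicalIV, long long Start,
                                       long long Step) {
  return Start + CanonicalIV * Step;
}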
9251
9252 void VPInterleaveRecipe::execute(VPTransformState &State) {
9253 assert(!State.Instance && "Interleave group being replicated.");
9254 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9255 getStoredValues(), getMask(),
9256 NeedsMaskForGaps);
9257}
9258
9259 void VPReplicateRecipe::execute(VPTransformState &State) {
9260 Instruction *UI = getUnderlyingInstr();
9261 if (State.Instance) { // Generate a single instance.
9262 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9263 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9264 // Insert scalar instance packing it into a vector.
9265 if (State.VF.isVector() && shouldPack()) {
9266 // If we're constructing lane 0, initialize to start from poison.
9267 if (State.Instance->Lane.isFirstLane()) {
9268 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9269 Value *Poison = PoisonValue::get(
9270 VectorType::get(UI->getType(), State.VF));
9271 State.set(this, Poison, State.Instance->Part);
9272 }
9273 State.packScalarIntoVectorValue(this, *State.Instance);
9274 }
9275 return;
9276 }
9277
9278 if (IsUniform) {
9279 // If the recipe is uniform across all parts (instead of just per VF), only
9280 // generate a single instance.
9281 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9282 all_of(operands(), [](VPValue *Op) {
9283 return Op->isDefinedOutsideVectorRegions();
9284 })) {
9285 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9286 if (user_begin() != user_end()) {
9287 for (unsigned Part = 1; Part < State.UF; ++Part)
9288 State.set(this, State.get(this, VPIteration(0, 0)),
9289 VPIteration(Part, 0));
9290 }
9291 return;
9292 }
9293
9294 // Uniform within VL means we need to generate lane 0 only for each
9295 // unrolled copy.
9296 for (unsigned Part = 0; Part < State.UF; ++Part)
9297 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9298 return;
9299 }
9300
9301 // A store of a loop varying value to a uniform address only needs the last
9302 // copy of the store.
9303 if (isa<StoreInst>(UI) &&
9304 vputils::isUniformAfterVectorization(getOperand(1))) {
9305 auto Lane = VPLane::getLastLaneForVF(State.VF);
9306 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9307 State);
9308 return;
9309 }
9310
9311 // Generate scalar instances for all VF lanes of all UF parts.
9312 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9313 const unsigned EndLane = State.VF.getKnownMinValue();
9314 for (unsigned Part = 0; Part < State.UF; ++Part)
9315 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9316 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9317}
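// Illustrative sketch (standalone C++, not LLVM API): the (Part, Lane)
// iteration space walked by the full-scalarization case above, and the scalar
// loop iteration each clone stands for relative to the start of the current
// wide iteration. VF and UF are example parameters.
static void mapScalarClonesToIterations(unsigned VF, unsigned UF,
                                        unsigned long long WideIVStart,
                                        unsigned long long *IterOf /* UF*VF */) {
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      // Clone (Part, Lane) recomputes original iteration WideIVStart + Part*VF + Lane.
      IterOf[Part * VF + Lane] = WideIVStart + Part * VF + Lane;
}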
9318
9319/// Creates either vp_store or vp_scatter intrinsics calls to represent
9320/// predicated store/scatter.
9321static Instruction *
9322 lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
9323 Value *StoredVal, bool IsScatter, Value *Mask,
9324 Value *EVL, const Align &Alignment) {
9325 CallInst *Call;
9326 if (IsScatter) {
9327 Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9328 Intrinsic::vp_scatter,
9329 {StoredVal, Addr, Mask, EVL});
9330 } else {
9331 VectorBuilder VBuilder(Builder);
9332 VBuilder.setEVL(EVL).setMask(Mask);
9333 Call = cast<CallInst>(VBuilder.createVectorInstruction(
9334 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9335 {StoredVal, Addr}));
9336 }
9337 Call->addParamAttr(
9338 1, Attribute::getWithAlignment(Call->getContext(), Alignment));
9339 return Call;
9340}
9341
9342/// Creates either vp_load or vp_gather intrinsics calls to represent
9343/// predicated load/gather.
9344 static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
9345 VectorType *DataTy,
9346 Value *Addr, bool IsGather,
9347 Value *Mask, Value *EVL,
9348 const Align &Alignment) {
9349 CallInst *Call;
9350 if (IsGather) {
9351 Call =
9352 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9353 nullptr, "wide.masked.gather");
9354 } else {
9355 VectorBuilder VBuilder(Builder);
9356 VBuilder.setEVL(EVL).setMask(Mask);
9357 Call = cast<CallInst>(VBuilder.createVectorInstruction(
9358 Instruction::Load, DataTy, Addr, "vp.op.load"));
9359 }
9360 Call->addParamAttr(
9361 0, Attribute::getWithAlignment(Call->getContext(), Alignment));
9362 return Call;
9363}
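// Illustrative sketch (standalone C++, not LLVM API): scalar emulation of the
// predication model the two helpers above target. A lane participates only if
// its index is below EVL and its mask bit is set; for the load, lanes that do
// not participate are left unwritten here (they are poison in the IR model).
// The int element type and unit-stride addressing are assumptions.
static void vpStoreEmulation(int *Memory, const int *Val, const bool *Mask,
                             unsigned VF, unsigned EVL) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane])
      Memory[Lane] = Val[Lane]; // lanes outside EVL/mask leave memory untouched
}

static void vpLoadEmulation(const int *Memory, const bool *Mask, unsigned VF,
                            unsigned EVL, int *Result) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane])
      Result[Lane] = Memory[Lane];
}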
9364
9366 auto *LI = cast<LoadInst>(&Ingredient);
9367
9368 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9369 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9370 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9371 bool CreateGather = !isConsecutive();
9372
9373 auto &Builder = State.Builder;
9375 for (unsigned Part = 0; Part < State.UF; ++Part) {
9376 Value *NewLI;
9377 Value *Mask = nullptr;
9378 if (auto *VPMask = getMask()) {
9379 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9380 // of a null all-one mask is a null mask.
9381 Mask = State.get(VPMask, Part);
9382 if (isReverse())
9383 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9384 }
9385
9386 // TODO: split this into several classes for better design.
9387 if (State.EVL) {
9388 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9389 "explicit vector length.");
9390 assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
9392 "EVL must be VPInstruction::ExplicitVectorLength.");
9393 Value *EVL = State.get(State.EVL, VPIteration(0, 0));
9394 // If EVL is not nullptr, then EVL must be a valid value set during plan
9395 // creation, possibly default value = whole vector register length. EVL
9396 // is created only if TTI prefers predicated vectorization, thus if EVL
9397 // is not nullptr it also implies preference for predicated
9398 // vectorization.
9399 // FIXME: Support reverse loading after vp_reverse is added.
9400 NewLI = lowerLoadUsingVectorIntrinsics(
9401 Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
9402 CreateGather, Mask, EVL, Alignment);
9403 } else if (CreateGather) {
9404 Value *VectorGep = State.get(getAddr(), Part);
9405 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask,
9406 nullptr, "wide.masked.gather");
9407 State.addMetadata(NewLI, LI);
9408 } else {
9409 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9410 if (Mask)
9411 NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask,
9412 PoisonValue::get(DataTy),
9413 "wide.masked.load");
9414 else
9415 NewLI =
9416 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9417
9418 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9419 State.addMetadata(NewLI, LI);
9420 if (Reverse)
9421 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9422 }
9423
9424 State.set(this, NewLI, Part);
9425 }
9426}
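// Illustrative sketch (standalone C++, not LLVM API): why both the mask and
// the loaded data are reversed for a reverse (negative-stride) consecutive
// access. The wide load reads VF contiguous elements in ascending address
// order, while the scalar loop visits them in descending order, so program
// lane L corresponds to memory element VF-1-L. VF and types are assumptions.
static void reverseMaskedLoad(const int *MemoryOrder, const bool *ProgramMask,
                              unsigned VF, int *ProgramOrderResult) {
  for (unsigned I = 0; I < VF; ++I) {
    unsigned Lane = VF - 1 - I;                  // program lane served by element I
    if (ProgramMask[Lane])                       // mask reversed into memory order
      ProgramOrderResult[Lane] = MemoryOrder[I]; // data reversed into program order
  }
}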
9427
9429 auto *SI = cast<StoreInst>(&Ingredient);
9430
9431 VPValue *StoredVPValue = getStoredValue();
9432 bool CreateScatter = !isConsecutive();
9433 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9434
9435 auto &Builder = State.Builder;
9437
9438 for (unsigned Part = 0; Part < State.UF; ++Part) {
9439 Instruction *NewSI = nullptr;
9440 Value *Mask = nullptr;
9441 if (auto *VPMask = getMask()) {
9442 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9443 // of a null all-one mask is a null mask.
9444 Mask = State.get(VPMask, Part);
9445 if (isReverse())
9446 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9447 }
9448
9449 Value *StoredVal = State.get(StoredVPValue, Part);
9450 if (isReverse()) {
9451 assert(!State.EVL && "reversing not yet implemented with EVL");
9452 // If we store to reverse consecutive memory locations, then we need
9453 // to reverse the order of elements in the stored value.
9454 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9455 // We don't want to update the value in the map as it might be used in
9456 // another expression. So don't call resetVectorValue(StoredVal).
9457 }
9458 // TODO: split this into several classes for better design.
9459 if (State.EVL) {
9460 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9461 "explicit vector length.");
9462 assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
9464 "EVL must be VPInstruction::ExplicitVectorLength.");
9465 Value *EVL = State.get(State.EVL, VPIteration(0, 0));
9466 // If EVL is not nullptr, then EVL must be a valid value set during plan
9467 // creation, possibly default value = whole vector register length. EVL
9468 // is created only if TTI prefers predicated vectorization, thus if EVL
9469 // is not nullptr it also implies preference for predicated
9470 // vectorization.
9471 // FIXME: Support reverse store after vp_reverse is added.
9472 NewSI = lowerStoreUsingVectorIntrinsics(
9473 Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
9474 CreateScatter, Mask, EVL, Alignment);
9475 } else if (CreateScatter) {
9476 Value *VectorGep = State.get(getAddr(), Part);
9477 NewSI =
9478 Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask);
9479 } else {
9480 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9481 if (Mask)
9482 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
9483 else
9484 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9485 }
9486 State.addMetadata(NewSI, SI);
9487 }
9488}
9489
9490// Determine how to lower the scalar epilogue, which depends on 1) optimising
9491// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9492// predication, and 4) a TTI hook that analyses whether the loop is suitable
9493// for predication.
9494 static ScalarEpilogueLowering getScalarEpilogueLowering(
9495 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9496 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9497 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9498 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9499 // don't look at hints or options, and don't request a scalar epilogue.
9500 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9501 // LoopAccessInfo (due to code dependency and not being able to reliably get
9502 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9503 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9504 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9505 // back to the old way and vectorize with versioning when forced. See D81345.)
9506 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9507 PGSOQueryType::IRPass) &&
9508 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9509 return CM_ScalarEpilogueNotAllowedOptSize;
9510
9511 // 2) If set, obey the directives
9512 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9513 switch (PreferPredicateOverEpilogue) {
9514 case PreferPredicateTy::ScalarEpilogue:
9515 return CM_ScalarEpilogueAllowed;
9516 case PreferPredicateTy::PredicateElseScalarEpilogue:
9517 return CM_ScalarEpilogueNotNeededUsePredicate;
9518 case PreferPredicateTy::PredicateOrDontVectorize:
9519 return CM_ScalarEpilogueNotAllowedUsePredicate;
9520 };
9521 }
9522
9523 // 3) If set, obey the hints
9524 switch (Hints.getPredicate()) {
9525 case LoopVectorizeHints::FK_Enabled:
9526 return CM_ScalarEpilogueNotNeededUsePredicate;
9527 case LoopVectorizeHints::FK_Disabled:
9528 return CM_ScalarEpilogueAllowed;
9529 };
9530
9531 // 4) if the TTI hook indicates this is profitable, request predication.
9532 TailFoldingInfo TFI(TLI, &LVL, IAI);
9533 if (TTI->preferPredicateOverEpilogue(&TFI))
9534 return CM_ScalarEpilogueNotNeededUsePredicate;
9535
9536 return CM_ScalarEpilogueAllowed;
9537}
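// Illustrative sketch (standalone C++, not LLVM API): the precedence the
// function above implements, condensed. The enum and the boolean/query
// parameters are hypothetical stand-ins for the real option, hint and TTI
// plumbing; only the ordering of the four checks is the point.
enum class EpilogueDecision { Allowed, NotAllowedOptSize, UsePredicate };

static EpilogueDecision chooseScalarEpilogueLowering(
    bool OptimizingForSize,              // 1) -Os/-Oz or PGSO
    const bool *CmdLinePrefersPredicate, // 2) command-line option, if given
    const bool *HintPrefersPredicate,    // 3) loop hint, if given
    bool TargetPrefersPredicate) {       // 4) TTI heuristic
  if (OptimizingForSize)
    return EpilogueDecision::NotAllowedOptSize;
  if (CmdLinePrefersPredicate)
    return *CmdLinePrefersPredicate ? EpilogueDecision::UsePredicate
                                    : EpilogueDecision::Allowed;
  if (HintPrefersPredicate)
    return *HintPrefersPredicate ? EpilogueDecision::UsePredicate
                                 : EpilogueDecision::Allowed;
  return TargetPrefersPredicate ? EpilogueDecision::UsePredicate
                                : EpilogueDecision::Allowed;
}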
9538
9539// Process the loop in the VPlan-native vectorization path. This path builds
9540// VPlan upfront in the vectorization pipeline, which allows to apply
9541// VPlan-to-VPlan transformations from the very beginning without modifying the
9542// input LLVM IR.
9543 static bool processLoopInVPlanNativePath(
9544 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9545 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9546 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9547 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9548 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9549 LoopVectorizationRequirements &Requirements) {
9550
9551 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9552 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9553 return false;
9554 }
9555 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9556 Function *F = L->getHeader()->getParent();
9557 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9558
9559 ScalarEpilogueLowering SEL =
9560 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9561
9562 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9563 &Hints, IAI);
9564 // Use the planner for outer loop vectorization.
9565 // TODO: CM is not used at this point inside the planner. Turn CM into an
9566 // optional argument if we don't need it in the future.
9567 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9568 ORE);
9569
9570 // Get user vectorization factor.
9571 ElementCount UserVF = Hints.getWidth();
9572
9574
9575 // Plan how to best vectorize, return the best VF and its cost.
9576 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9577
9578 // If we are stress testing VPlan builds, do not attempt to generate vector
9579 // code. Masked vector code generation support will follow soon.
9580 // Also, do not attempt to vectorize if no vector code will be produced.
9581 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9582 return false;
9583
9584 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9585
9586 {
9587 bool AddBranchWeights =
9588 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9589 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9590 F->getParent()->getDataLayout(), AddBranchWeights);
9591 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9592 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9593 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9594 << L->getHeader()->getParent()->getName() << "\"\n");
9595 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9596 }
9597
9598 reportVectorization(ORE, L, VF, 1);
9599
9600 // Mark the loop as already vectorized to avoid vectorizing again.
9601 Hints.setAlreadyVectorized();
9602 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9603 return true;
9604}
9605
9606// Emit a remark if there are stores to floats that required a floating point
9607// extension. If the vectorized loop was generated with floating point there
9608// will be a performance penalty from the conversion overhead and the change in
9609// the vector width.
9610 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9611 SmallVector<Instruction *, 4> Worklist;
9612 for (BasicBlock *BB : L->getBlocks()) {
9613 for (Instruction &Inst : *BB) {
9614 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9615 if (S->getValueOperand()->getType()->isFloatTy())
9616 Worklist.push_back(S);
9617 }
9618 }
9619 }
9620
9621 // Traverse the floating point stores upwards, searching for floating point
9622 // conversions.
9623 SmallPtrSet<const Instruction *, 4> Visited;
9624 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9625 while (!Worklist.empty()) {
9626 auto *I = Worklist.pop_back_val();
9627 if (!L->contains(I))
9628 continue;
9629 if (!Visited.insert(I).second)
9630 continue;
9631
9632 // Emit a remark if the floating point store required a floating
9633 // point conversion.
9634 // TODO: More work could be done to identify the root cause such as a
9635 // constant or a function return type and point the user to it.
9636 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9637 ORE->emit([&]() {
9638 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9639 I->getDebugLoc(), L->getHeader())
9640 << "floating point conversion changes vector width. "
9641 << "Mixed floating point precision requires an up/down "
9642 << "cast that will negatively impact performance.";
9643 });
9644
9645 for (Use &Op : I->operands())
9646 if (auto *OpI = dyn_cast<Instruction>(Op))
9647 Worklist.push_back(OpI);
9648 }
9649}
9650
9651static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9653 std::optional<unsigned> VScale, Loop *L,
9654 ScalarEvolution &SE,
9655 ScalarEpilogueLowering SEL) {
9656 InstructionCost CheckCost = Checks.getCost();
9657 if (!CheckCost.isValid())
9658 return false;
9659
9660 // When interleaving only scalar and vector cost will be equal, which in turn
9661 // would lead to a divide by 0. Fall back to hard threshold.
9662 if (VF.Width.isScalar()) {
9663 if (CheckCost > VectorizeMemoryCheckThreshold) {
9664 LLVM_DEBUG(
9665 dbgs()
9666 << "LV: Interleaving only is not profitable due to runtime checks\n");
9667 return false;
9668 }
9669 return true;
9670 }
9671
9672 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9673 uint64_t ScalarC = *VF.ScalarCost.getValue();
9674 if (ScalarC == 0)
9675 return true;
9676
9677 // First, compute the minimum iteration count required so that the vector
9678 // loop outperforms the scalar loop.
9679 // The total cost of the scalar loop is
9680 // ScalarC * TC
9681 // where
9682 // * TC is the actual trip count of the loop.
9683 // * ScalarC is the cost of a single scalar iteration.
9684 //
9685 // The total cost of the vector loop is
9686 // RtC + VecC * (TC / VF) + EpiC
9687 // where
9688 // * RtC is the cost of the generated runtime checks
9689 // * VecC is the cost of a single vector iteration.
9690 // * TC is the actual trip count of the loop
9691 // * VF is the vectorization factor
9692 // * EpiC is the cost of the generated epilogue, including the cost
9693 // of the remaining scalar operations.
9694 //
9695 // Vectorization is profitable once the total vector cost is less than the
9696 // total scalar cost:
9697 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9698 //
9699 // Now we can compute the minimum required trip count TC as
9700 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9701 //
9702 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9703 // the computations are performed with integer arithmetic and the divisions
9704 // are rounded up, hence we get an upper estimate of the TC.
9705 unsigned IntVF = VF.Width.getKnownMinValue();
9706 if (VF.Width.isScalable()) {
9707 unsigned AssumedMinimumVscale = 1;
9708 if (VScale)
9709 AssumedMinimumVscale = *VScale;
9710 IntVF *= AssumedMinimumVscale;
9711 }
9712 uint64_t RtC = *CheckCost.getValue();
9713 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9714 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9715
9716 // Second, compute a minimum iteration count so that the cost of the
9717 // runtime checks is only a fraction of the total scalar loop cost. This
9718 // adds a loop-dependent bound on the overhead incurred if the runtime
9719 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9720 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9721 // cost, compute
9722 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9723 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9724
9725 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9726 // epilogue is allowed, choose the next closest multiple of VF. This should
9727 // partly compensate for ignoring the epilogue cost.
9728 uint64_t MinTC = std::max(MinTC1, MinTC2);
9729 if (SEL == CM_ScalarEpilogueAllowed)
9730 MinTC = alignTo(MinTC, IntVF);
9731 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9732
9733 LLVM_DEBUG(
9734 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9735 << VF.MinProfitableTripCount << "\n");
9736
9737 // Skip vectorization if the expected trip count is less than the minimum
9738 // required trip count.
9739 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9740 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9741 VF.MinProfitableTripCount)) {
9742 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9743 "trip count < minimum profitable VF ("
9744 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9745 << ")\n");
9746
9747 return false;
9748 }
9749 }
9750 return true;
9751}
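// Illustrative sketch (standalone C++, not LLVM API): the two trip-count
// bounds derived in the comments above, computed with plain integers. divCeil
// mirrors llvm::divideCeil and the final rounding mirrors llvm::alignTo; all
// cost inputs are example values supplied by the caller. Assumes ScalarC > 0
// and VecC <= ScalarC * VF, matching the early-outs in the function above.
static unsigned long long divCeil(unsigned long long N, unsigned long long D) {
  return (N + D - 1) / D;
}

static unsigned long long minProfitableTripCount(
    unsigned long long RtC,     // cost of the runtime checks
    unsigned long long VecC,    // cost of one vector iteration
    unsigned long long ScalarC, // cost of one scalar iteration
    unsigned long long VF, bool ScalarEpilogueAllowed) {
  // Bound 1: RtC + VecC * (TC / VF) < ScalarC * TC, with EpiC assumed 0.
  unsigned long long Div = ScalarC * VF - VecC;
  unsigned long long MinTC1 = Div == 0 ? 0 : divCeil(RtC * VF, Div);
  // Bound 2: keep the check overhead below roughly 1/10 of the scalar cost.
  unsigned long long MinTC2 = divCeil(RtC * 10, ScalarC);
  unsigned long long MinTC = MinTC1 > MinTC2 ? MinTC1 : MinTC2;
  // Round up to a multiple of VF when a scalar epilogue will run anyway.
  return ScalarEpilogueAllowed ? divCeil(MinTC, VF) * VF : MinTC;
}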
9752
9753 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9754 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9755 !EnableLoopInterleaving),
9756 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9757 !EnableLoopVectorization) {}
9758
9759 bool LoopVectorizePass::processLoop(Loop *L) {
9760 assert((EnableVPlanNativePath || L->isInnermost()) &&
9761 "VPlan-native path is not enabled. Only process inner loops.");
9762
9763#ifndef NDEBUG
9764 const std::string DebugLocStr = getDebugLocString(L);
9765#endif /* NDEBUG */
9766
9767 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9768 << L->getHeader()->getParent()->getName() << "' from "
9769 << DebugLocStr << "\n");
9770
9771 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9772
9773 LLVM_DEBUG(
9774 dbgs() << "LV: Loop hints:"
9775 << " force="
9777 ? "disabled"
9779 ? "enabled"
9780 : "?"))
9781 << " width=" << Hints.getWidth()
9782 << " interleave=" << Hints.getInterleave() << "\n");
9783
9784 // Function containing loop
9785 Function *F = L->getHeader()->getParent();
9786
9787 // Looking at the diagnostic output is the only way to determine if a loop
9788 // was vectorized (other than looking at the IR or machine code), so it
9789 // is important to generate an optimization remark for each loop. Most of
9790 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9791 // generated as OptimizationRemark and OptimizationRemarkMissed are
9792 // less verbose reporting vectorized loops and unvectorized loops that may
9793 // benefit from vectorization, respectively.
9794
9795 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9796 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9797 return false;
9798 }
9799
9800 PredicatedScalarEvolution PSE(*SE, *L);
9801
9802 // Check if it is legal to vectorize the loop.
9803 LoopVectorizationRequirements Requirements;
9804 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9805 &Requirements, &Hints, DB, AC, BFI, PSI);
9806 if (!LVL.canVectorize(EnableVPlanNativePath)) {
9807 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9808 Hints.emitRemarkWithHints();
9809 return false;
9810 }
9811
9812 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9813 // here. They may require CFG and instruction level transformations before
9814 // even evaluating whether vectorization is profitable. Since we cannot modify
9815 // the incoming IR, we need to build VPlan upfront in the vectorization
9816 // pipeline.
9817 if (!L->isInnermost())
9818 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9819 ORE, BFI, PSI, Hints, Requirements);
9820
9821 assert(L->isInnermost() && "Inner loop expected.");
9822
9823 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9824 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9825
9826 // If an override option has been passed in for interleaved accesses, use it.
9827 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9828 UseInterleaved = EnableInterleavedMemAccesses;
9829
9830 // Analyze interleaved memory accesses.
9831 if (UseInterleaved)
9832 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9833
9834 // Check the function attributes and profiles to find out if this function
9835 // should be optimized for size.
9836 ScalarEpilogueLowering SEL =
9837 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9838
9839 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9840 // count by optimizing for size, to minimize overheads.
9841 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9842 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9843 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9844 << "This loop is worth vectorizing only if no scalar "
9845 << "iteration overheads are incurred.");
9846 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9847 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9848 else {
9849 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9850 LLVM_DEBUG(dbgs() << "\n");
9851 // Predicate tail-folded loops are efficient even when the loop
9852 // iteration count is low. However, setting the epilogue policy to
9853 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9854 // with runtime checks. It's more effective to let
9855 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9856 // for the loop.
9857 if (SEL == CM_ScalarEpilogueNotAllowedOptSize)
9858 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9859 } else {
9860 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9861 "small to consider vectorizing.\n");
9863 "The trip count is below the minial threshold value.",
9864 "loop trip count is too low, avoiding vectorization",
9865 "LowTripCount", ORE, L);
9866 Hints.emitRemarkWithHints();
9867 return false;
9868 }
9869 }
9870 }
9871
9872 // Check the function attributes to see if implicit floats or vectors are
9873 // allowed.
9874 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9876 "Can't vectorize when the NoImplicitFloat attribute is used",
9877 "loop not vectorized due to NoImplicitFloat attribute",
9878 "NoImplicitFloat", ORE, L);
9879 Hints.emitRemarkWithHints();
9880 return false;
9881 }
9882
9883 // Check if the target supports potentially unsafe FP vectorization.
9884 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9885 // for the target we're vectorizing for, to make sure none of the
9886 // additional fp-math flags can help.
9887 if (Hints.isPotentiallyUnsafe() &&
9890 "Potentially unsafe FP op prevents vectorization",
9891 "loop not vectorized due to unsafe FP support.",
9892 "UnsafeFP", ORE, L);
9893 Hints.emitRemarkWithHints();
9894 return false;
9895 }
9896
9897 bool AllowOrderedReductions;
9898 // If the flag is set, use that instead and override the TTI behaviour.
9899 if (ForceOrderedReductions.getNumOccurrences() > 0)
9900 AllowOrderedReductions = ForceOrderedReductions;
9901 else
9902 AllowOrderedReductions = TTI->enableOrderedReductions();
9903 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9904 ORE->emit([&]() {
9905 auto *ExactFPMathInst = Requirements.getExactFPInst();
9906 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9907 ExactFPMathInst->getDebugLoc(),
9908 ExactFPMathInst->getParent())
9909 << "loop not vectorized: cannot prove it is safe to reorder "
9910 "floating-point operations";
9911 });
9912 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9913 "reorder floating-point operations\n");
9914 Hints.emitRemarkWithHints();
9915 return false;
9916 }
9917
9918 // Use the cost model.
9919 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9920 F, &Hints, IAI);
9921 // Use the planner for vectorization.
9922 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9923 ORE);
9924
9925 // Get user vectorization factor and interleave count.
9926 ElementCount UserVF = Hints.getWidth();
9927 unsigned UserIC = Hints.getInterleave();
9928
9929 // Plan how to best vectorize, return the best VF and its cost.
9930 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9931
9933 unsigned IC = 1;
9934
9935 bool AddBranchWeights =
9936 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9937 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9938 F->getParent()->getDataLayout(), AddBranchWeights);
9939 if (MaybeVF) {
9940 VF = *MaybeVF;
9941 // Select the interleave count.
9942 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9943
9944 unsigned SelectedIC = std::max(IC, UserIC);
9945 // Optimistically generate runtime checks if they are needed. Drop them if
9946 // they turn out to not be profitable.
9947 if (VF.Width.isVector() || SelectedIC > 1)
9948 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9949
9950 // Check if it is profitable to vectorize with runtime checks.
9951 bool ForceVectorization =
9952 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9953 if (!ForceVectorization &&
9954 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9955 *PSE.getSE(), SEL)) {
9956 ORE->emit([&]() {
9957 return OptimizationRemarkAnalysisAliasing(
9958 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9959 L->getHeader())
9960 << "loop not vectorized: cannot prove it is safe to reorder "
9961 "memory operations";
9962 });
9963 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9964 Hints.emitRemarkWithHints();
9965 return false;
9966 }
9967 }
9968
9969 // Identify the diagnostic messages that should be produced.
9970 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9971 bool VectorizeLoop = true, InterleaveLoop = true;
9972 if (VF.Width.isScalar()) {
9973 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9974 VecDiagMsg = std::make_pair(
9975 "VectorizationNotBeneficial",
9976 "the cost-model indicates that vectorization is not beneficial");
9977 VectorizeLoop = false;
9978 }
9979
9980 if (!MaybeVF && UserIC > 1) {
9981 // Tell the user interleaving was avoided up-front, despite being explicitly
9982 // requested.
9983 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9984 "interleaving should be avoided up front\n");
9985 IntDiagMsg = std::make_pair(
9986 "InterleavingAvoided",
9987 "Ignoring UserIC, because interleaving was avoided up front");
9988 InterleaveLoop = false;
9989 } else if (IC == 1 && UserIC <= 1) {
9990 // Tell the user interleaving is not beneficial.
9991 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9992 IntDiagMsg = std::make_pair(
9993 "InterleavingNotBeneficial",
9994 "the cost-model indicates that interleaving is not beneficial");
9995 InterleaveLoop = false;
9996 if (UserIC == 1) {
9997 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9998 IntDiagMsg.second +=
9999 " and is explicitly disabled or interleave count is set to 1";
10000 }
10001 } else if (IC > 1 && UserIC == 1) {
10002 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10003 LLVM_DEBUG(
10004 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10005 IntDiagMsg = std::make_pair(
10006 "InterleavingBeneficialButDisabled",
10007 "the cost-model indicates that interleaving is beneficial "
10008 "but is explicitly disabled or interleave count is set to 1");
10009 InterleaveLoop = false;
10010 }
10011
10012 // Override IC if user provided an interleave count.
10013 IC = UserIC > 0 ? UserIC : IC;
10014
10015 // Emit diagnostic messages, if any.
10016 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10017 if (!VectorizeLoop && !InterleaveLoop) {
10018 // Do not vectorize or interleave the loop.
10019 ORE->emit([&]() {
10020 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10021 L->getStartLoc(), L->getHeader())
10022 << VecDiagMsg.second;
10023 });
10024 ORE->emit([&]() {
10025 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10026 L->getStartLoc(), L->getHeader())
10027 << IntDiagMsg.second;
10028 });
10029 return false;
10030 } else if (!VectorizeLoop && InterleaveLoop) {
10031 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10032 ORE->emit([&]() {
10033 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10034 L->getStartLoc(), L->getHeader())
10035 << VecDiagMsg.second;
10036 });
10037 } else if (VectorizeLoop && !InterleaveLoop) {
10038 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10039 << ") in " << DebugLocStr << '\n');
10040 ORE->emit([&]() {
10041 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10042 L->getStartLoc(), L->getHeader())
10043 << IntDiagMsg.second;
10044 });
10045 } else if (VectorizeLoop && InterleaveLoop) {
10046 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10047 << ") in " << DebugLocStr << '\n');
10048 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10049 }
10050
10051 bool DisableRuntimeUnroll = false;
10052 MDNode *OrigLoopID = L->getLoopID();
10053 {
10054 using namespace ore;
10055 if (!VectorizeLoop) {
10056 assert(IC > 1 && "interleave count should not be 1 or 0");
10057 // If we decided that it is not legal to vectorize the loop, then
10058 // interleave it.
10059 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10060 &CM, BFI, PSI, Checks);
10061
10062 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10063 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10064
10065 ORE->emit([&]() {
10066 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10067 L->getHeader())
10068 << "interleaved loop (interleaved count: "
10069 << NV("InterleaveCount", IC) << ")";
10070 });
10071 } else {
10072 // If we decided that it is *legal* to vectorize the loop, then do it.
10073
10074 // Consider vectorizing the epilogue too if it's profitable.
10075 VectorizationFactor EpilogueVF =
10077 if (EpilogueVF.Width.isVector()) {
10078
10079 // The first pass vectorizes the main loop and creates a scalar epilogue
10080 // to be vectorized by executing the plan (potentially with a different
10081 // factor) again shortly afterwards.
10082 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10083 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10084 EPI, &LVL, &CM, BFI, PSI, Checks);
10085
10086 std::unique_ptr<VPlan> BestMainPlan(
10087 LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
10088 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10089 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10090 ++LoopsVectorized;
10091
10092 // Second pass vectorizes the epilogue and adjusts the control flow
10093 // edges from the first pass.
10094 EPI.MainLoopVF = EPI.EpilogueVF;
10095 EPI.MainLoopUF = EPI.EpilogueUF;
10096 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10097 ORE, EPI, &LVL, &CM, BFI, PSI,
10098 Checks);
10099
10100 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10101 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10102 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10103 Header->setName("vec.epilog.vector.body");
10104
10105 // Re-use the trip count and steps expanded for the main loop, as
10106 // skeleton creation needs it as a value that dominates both the scalar
10107 // and vector epilogue loops
10108 // TODO: This is a workaround needed for epilogue vectorization and it
10109 // should be removed once induction resume value creation is done
10110 // directly in VPlan.
10111 EpilogILV.setTripCount(MainILV.getTripCount());
10112 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10113 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10114 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10115 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10116 ExpandR->replaceAllUsesWith(ExpandedVal);
10117 if (BestEpiPlan.getTripCount() == ExpandR)
10118 BestEpiPlan.resetTripCount(ExpandedVal);
10119 ExpandR->eraseFromParent();
10120 }
10121
10122 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10123 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10124 // before vectorizing the epilogue loop.
10125 for (VPRecipeBase &R : Header->phis()) {
10126 if (isa<VPCanonicalIVPHIRecipe>(&R))
10127 continue;
10128
10129 Value *ResumeV = nullptr;
10130 // TODO: Move setting of resume values to prepareToExecute.
10131 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10132 ResumeV = ReductionResumeValues
10133 .find(&ReductionPhi->getRecurrenceDescriptor())
10134 ->second;
10135 } else {
10136 // Create induction resume values for both widened pointer and
10137 // integer/fp inductions and update the start value of the induction
10138 // recipes to use the resume value.
10139 PHINode *IndPhi = nullptr;
10140 const InductionDescriptor *ID;
10141 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10142 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10143 ID = &Ind->getInductionDescriptor();
10144 } else {
10145 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10146 IndPhi = WidenInd->getPHINode();
10147 ID = &WidenInd->getInductionDescriptor();
10148 }
10149
10150 ResumeV = MainILV.createInductionResumeValue(
10151 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10152 {EPI.MainLoopIterationCountCheck});
10153 }
10154 assert(ResumeV && "Must have a resume value");
10155 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10156 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10157 }
10158
10159 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10160 DT, true, &ExpandedSCEVs);
10161 ++LoopsEpilogueVectorized;
10162
10163 if (!MainILV.areSafetyChecksAdded())
10164 DisableRuntimeUnroll = true;
10165 } else {
10166 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10167 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10168 PSI, Checks);
10169
10170 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10171 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10172 ++LoopsVectorized;
10173
10174 // Add metadata to disable runtime unrolling a scalar loop when there
10175 // are no runtime checks about strides and memory. A scalar loop that is
10176 // rarely used is not worth unrolling.
10177 if (!LB.areSafetyChecksAdded())
10178 DisableRuntimeUnroll = true;
10179 }
10180 // Report the vectorization decision.
10181 reportVectorization(ORE, L, VF, IC);
10182 }
10183
10184 if (ORE->allowExtraAnalysis(LV_NAME))
10185 checkMixedPrecision(L, ORE);
10186 }
10187
10188 std::optional<MDNode *> RemainderLoopID =
10189 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10190 LLVMLoopVectorizeFollowupEpilogue});
10191 if (RemainderLoopID) {
10192 L->setLoopID(*RemainderLoopID);
10193 } else {
10194 if (DisableRuntimeUnroll)
10195 AddRuntimeUnrollDisableMetaData(L);
10196
10197 // Mark the loop as already vectorized to avoid vectorizing again.
10198 Hints.setAlreadyVectorized();
10199 }
10200
10201 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10202 return true;
10203}
10204
10205 LoopVectorizeResult LoopVectorizePass::runImpl(
10206 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10207 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10208 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10209 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10210 SE = &SE_;
10211 LI = &LI_;
10212 TTI = &TTI_;
10213 DT = &DT_;
10214 BFI = BFI_;
10215 TLI = TLI_;
10216 AC = &AC_;
10217 LAIs = &LAIs_;
10218 DB = &DB_;
10219 ORE = &ORE_;
10220 PSI = PSI_;
10221
10222 // Don't attempt if
10223 // 1. the target claims to have no vector registers, and
10224 // 2. interleaving won't help ILP.
10225 //
10226 // The second condition is necessary because, even if the target has no
10227 // vector registers, loop vectorization may still enable scalar
10228 // interleaving.
10229 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10230 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10231 return LoopVectorizeResult(false, false);
10232
10233 bool Changed = false, CFGChanged = false;
10234
10235 // The vectorizer requires loops to be in simplified form.
10236 // Since simplification may add new inner loops, it has to run before the
10237 // legality and profitability checks. This means running the loop vectorizer
10238 // will simplify all loops, regardless of whether anything ends up being
10239 // vectorized.
10240 for (const auto &L : *LI)
10241 Changed |= CFGChanged |=
10242 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10243
10244 // Build up a worklist of inner-loops to vectorize. This is necessary as
10245 // the act of vectorizing or partially unrolling a loop creates new loops
10246 // and can invalidate iterators across the loops.
10247 SmallVector<Loop *, 8> Worklist;
10248
10249 for (Loop *L : *LI)
10250 collectSupportedLoops(*L, LI, ORE, Worklist);
10251
10252 LoopsAnalyzed += Worklist.size();
10253
10254 // Now walk the identified inner loops.
10255 while (!Worklist.empty()) {
10256 Loop *L = Worklist.pop_back_val();
10257
10258 // For the inner loops we actually process, form LCSSA to simplify the
10259 // transform.
10260 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10261
10262 Changed |= CFGChanged |= processLoop(L);
10263
10264 if (Changed) {
10265 LAIs->clear();
10266
10267#ifndef NDEBUG
10268 if (VerifySCEV)
10269 SE->verify();
10270#endif
10271 }
10272 }
10273
10274 // Process each loop nest in the function.
10275 return LoopVectorizeResult(Changed, CFGChanged);
10276}
10277
10278 PreservedAnalyses LoopVectorizePass::run(Function &F,
10279 FunctionAnalysisManager &AM) {
10280 auto &LI = AM.getResult<LoopAnalysis>(F);
10281 // There are no loops in the function. Return before computing other expensive
10282 // analyses.
10283 if (LI.empty())
10284 return PreservedAnalyses::all();
10285 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10286 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10287 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10288 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10289 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10290 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10291 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10292
10293 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10294 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10295 ProfileSummaryInfo *PSI =
10296 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10297 BlockFrequencyInfo *BFI = nullptr;
10298 if (PSI && PSI->hasProfileSummary())
10299 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10300 LoopVectorizeResult Result =
10301 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10302 if (!Result.MadeAnyChange)
10303 return PreservedAnalyses::all();
10304 PreservedAnalyses PA;
10305
10306 if (isAssignmentTrackingEnabled(*F.getParent())) {
10307 for (auto &BB : F)
10308 RemoveRedundantDbgInstrs(&BB);
10309 }
10310
10311 // We currently do not preserve loopinfo/dominator analyses with outer loop
10312 // vectorization. Until this is addressed, mark these analyses as preserved
10313 // only for non-VPlan-native path.
10314 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10315 if (!EnableVPlanNativePath) {
10316 PA.preserve<LoopAnalysis>();
10317 PA.preserve<DominatorTreeAnalysis>();
10318 PA.preserve<ScalarEvolutionAnalysis>();
10319 }
10320
10321 if (Result.MadeCFGChange) {
10322 // Making CFG changes likely means a loop got vectorized. Indicate that
10323 // extra simplification passes should be run.
10324 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10325 // be run if runtime checks have been added.
10328 } else {
10329 PA.preserveSet<CFGAnalyses>();
10330 }
10331 return PA;
10332}
10333
10335 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10336 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10337 OS, MapClassName2PassName);
10338
10339 OS << '<';
10340 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10341 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10342 OS << '>';
10343}
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock)
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static Instruction * lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, VectorType *DataTy, Value *Addr, bool IsGather, Value *Mask, Value *EVL, const Align &Alignment)
Creates either vp_load or vp_gather intrinsics calls to represent predicated load/gather.
static Instruction * lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, Value *StoredVal, bool IsScatter, Value *Mask, Value *EVL, const Align &Alignment)
Creates either vp_store or vp_scatter intrinsics calls to represent predicated store/scatter.
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::string getDebugLocString(const Loop *L)
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file contains the declarations for profiling metadata utility functions.
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
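A minimal usage sketch of the two APInt members above (the 8-bit width and the function name are arbitrary):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintSketch() {
  // An 8-bit value with every bit set, i.e. 0xFF.
  APInt Mask = APInt::getAllOnes(8);
  // Reinterpreted as a signed integer, an all-ones pattern is -1.
  int64_t Signed = Mask.getSExtValue();
  (void)Signed;
}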
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:411
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:499
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
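A short sketch tying the BasicBlock accessors above together (the block is assumed to come from elsewhere; the function name is illustrative):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void inspectBlock(BasicBlock *BB) {
  // Visit only the PHI nodes at the top of the block.
  for (PHINode &Phi : BB->phis())
    (void)Phi;
  // First non-PHI instruction and the block terminator.
  const Instruction *FirstNonPhi = BB->getFirstNonPHI();
  const Instruction *Term = BB->getTerminator();
  // Non-null only when the block has exactly one predecessor.
  const BasicBlock *SinglePred = BB->getSinglePredecessor();
  (void)FirstNonPhi; (void)Term; (void)SinglePred;
}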
BinaryOps getOpcode() const
Definition: InstrTypes.h:513
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
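An illustrative sketch of the BranchInst accessors above applied to a block terminator (NewCond is an assumed i1 value):

#include "llvm/IR/Instructions.h"
using namespace llvm;

void retargetCondition(BasicBlock *BB, Value *NewCond) {
  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
  if (!BI || !BI->isConditional())
    return;
  BasicBlock *TrueDest = BI->getSuccessor(0);
  BasicBlock *FalseDest = BI->getSuccessor(1);
  Value *OldCond = BI->getCondition();
  // Swap in a different condition while keeping both successors.
  BI->setCondition(NewCond);
  (void)TrueDest; (void)FalseDest; (void)OldCond;
}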
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2222
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
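A small sketch contrasting the DenseMap lookup members listed above (the map contents are arbitrary):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapSketch() {
  DenseMap<int, int> Widths;
  Widths.insert({1, 32});
  int A = Widths.lookup(2);        // missing key: default-constructed 0
  int B = Widths.at(1);            // asserts that the key exists
  bool Known = Widths.contains(1); // membership test without a default value
  auto It = Widths.find(1);
  if (It != Widths.end())
    It->second = 64;
  (void)A; (void)B; (void)Known;
}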
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
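A hedged sketch of keeping a DominatorTree up to date with the members above, assuming NewBB was just created with OldIDom as its immediate dominator and NewIDom later takes over that role:

#include "llvm/IR/Dominators.h"
#include <cassert>
using namespace llvm;

void updateDomTree(DominatorTree &DT, BasicBlock *NewBB, BasicBlock *OldIDom,
                   BasicBlock *NewIDom) {
  // Register the freshly created block under its immediate dominator.
  DT.addNewBlock(NewBB, OldIDom);
  // CFG surgery changed who dominates NewBB; update the tree in place.
  DT.changeImmediateDominator(DT.getNode(NewBB), DT.getNode(NewIDom));
  // A typical query: strict dominance between two tree nodes.
  bool Strict = DT.properlyDominates(DT.getNode(NewIDom), DT.getNode(NewBB));
  assert(DT.verify() && "dominator tree should stay consistent");
  (void)Strict;
}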
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:311
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:302
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
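A minimal sketch of the ElementCount factory functions and predicates above:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountSketch() {
  ElementCount Fixed4 = ElementCount::getFixed(4);    // <4 x ...>
  ElementCount Scal4 = ElementCount::getScalable(4);  // <vscale x 4 x ...>
  ElementCount One = ElementCount::getFixed(1);
  bool IsVec = Fixed4.isVector();                     // more than one element
  bool IsScalar = One.isScalar();                     // exactly one element
  ElementCount Any = ElementCount::get(8, /*Scalable=*/true);
  (void)Scal4; (void)IsVec; (void)IsScalar; (void)Any;
}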
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:318
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:201
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:678
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
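A sketch of querying size attributes and the vscale_range minimum with the Function and Attribute members above (Attribute::VScaleRange is the standard attribute kind; the function name is illustrative):

#include "llvm/IR/Function.h"
using namespace llvm;

unsigned queryFunction(const Function &F) {
  // Size heuristics: -Os sets optsize, -Oz additionally sets minsize.
  bool OptForSize = F.hasOptSize() || F.hasMinSize();
  (void)OptForSize;
  // Minimum vscale from the vscale_range attribute, or 0 if absent.
  if (!F.hasFnAttribute(Attribute::VScaleRange))
    return 0;
  return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMin();
}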
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of the scalar value V (a splat).
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1170
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2205
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2241
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
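A compact sketch exercising several of the IRBuilderBase helpers listed above; VecTy is assumed to be a fixed-width integer vector type, and Ptr and Mask are assumed to be a valid pointer and a matching i1 vector mask:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *buildSketch(BasicBlock *BB, VectorType *VecTy, Value *Ptr, Value *Mask) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append new instructions at the end of BB
  // <0, 1, 2, ...> plus a splat of 7, i.e. <7, 8, 9, ...>.
  Value *Step = Builder.CreateStepVector(VecTy);
  Value *Splat = Builder.CreateVectorSplat(
      cast<FixedVectorType>(VecTy)->getNumElements(),
      ConstantInt::get(VecTy->getScalarType(), 7));
  Value *Sum = Builder.CreateAdd(Step, Splat, "idx");
  // Masked load; disabled lanes take the default pass-through value.
  Value *Loaded = Builder.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask);
  Value *Cmp = Builder.CreateICmpEQ(Sum, Loaded, "cmp");
  return Builder.CreateSelect(Cmp, Sum, Loaded, "sel");
}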
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
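A sketch of dispatching on an InductionDescriptor using only the accessors listed above (the comments summarize how the vectorizer typically treats each kind):

#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;

void describeInduction(const InductionDescriptor &ID) {
  Value *Start = ID.getStartValue(); // value entering the loop
  const SCEV *Step = ID.getStep();   // per-iteration step as a SCEV
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction:
    break; // integer IV, widened directly
  case InductionDescriptor::IK_PtrInduction:
    break; // pointer IV, advances by Step each iteration
  case InductionDescriptor::IK_FpInduction:
    break; // FP IV, needs suitable fast-math flags
  case InductionDescriptor::IK_NoInduction:
    break; // not an induction at all
  }
  (void)Start; (void)Step;
}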
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitablity analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:80
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:444
uint32_t getFactor() const
Definition: VectorUtils.h:460
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:514
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:521
bool isReverse() const
Definition: VectorUtils.h:459
InstTy * getInsertPos() const
Definition: VectorUtils.h:530
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the group's member instructions to the new instruction NewInst.
Align getAlign() const
Definition: VectorUtils.h:461
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:586
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:631
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:642
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:623
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:606
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:636
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
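A hedged sketch of the interleaved-access queries above; the construction of the InterleavedAccessInfo and the memory instruction are assumed to happen elsewhere:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void inspectInterleaving(InterleavedAccessInfo &IAI, Instruction *MemInst) {
  // Group the loop's loads/stores; the flag allows masked groups.
  IAI.analyzeInterleaving(/*EnableMaskedInterleavedGroup=*/false);
  if (!IAI.isInterleaved(MemInst))
    return;
  InterleaveGroup<Instruction> *Group = IAI.getInterleaveGroup(MemInst);
  uint32_t Factor = Group->getFactor();     // number of interleaved slots
  uint32_t Slot = Group->getIndex(MemInst); // this member's slot
  Instruction *First = Group->getMember(0); // may be null if there is a gap
  bool Reversed = Group->isReverse();
  (void)Factor; (void)Slot; (void)First; (void)Reversed;
}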
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
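A small sketch of the loop-shape queries above, of the kind used as preconditions before transforming a loop:

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

bool hasSimpleShape(const Loop *L) {
  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exiting = L->getExitingBlock();
  if (!Preheader || !Latch || !Exiting)
    return false;
  // Visit every block that makes up the loop body.
  for (BasicBlock *BB : L->blocks())
    (void)BB;
  return L->isInnermost();
}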
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1222
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
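A hedged sketch of how a client might consult LoopVectorizationLegality through the members above; LVL is assumed to be fully constructed, and this is not the pass's actual control flow:

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;

bool summarizeLegality(LoopVectorizationLegality &LVL) {
  if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
    return false;
  PHINode *Primary = LVL.getPrimaryInduction();
  // Recognized induction and reduction phis.
  for (const auto &Entry : LVL.getInductionVars())
    (void)Entry.first;
  for (const auto &Entry : LVL.getReductionVars())
    (void)Entry.first;
  return Primary != nullptr;
}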
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
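getDecisionAndClampRange above is the usual way to take one decision for a whole range of VFs; an illustrative fragment, where CM, I and Range are assumed to be the cost model, an instruction in the loop, and the VFRange being planned:

// Returns the predicate's answer for the first VF in Range and clamps Range
// so that every VF left in it yields the same answer.
bool ScalarizeI = LoopVectorizationPlanner::getDecisionAndClampRange(
    [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, Range);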
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:66
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:501
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
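A sketch of reading a loop's llvm.loop metadata with the MDNode accessors above and building a simple string-only hint node (the hint name is just an example):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

void inspectLoopID(Loop *L, LLVMContext &Ctx) {
  if (MDNode *LoopID = L->getLoopID()) {
    // Operand 0 of an llvm.loop node refers back to the node itself; the
    // remaining operands are the individual hint nodes.
    for (unsigned I = 1, E = LoopID->getNumOperands(); I != E; ++I)
      (void)LoopID->getOperand(I);
  }
  SmallVector<Metadata *, 1> Ops;
  Ops.push_back(MDString::get(Ctx, "llvm.loop.unroll.runtime.disable"));
  MDNode *Hint = MDNode::get(Ctx, Ops);
  (void)Hint;
}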
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:191
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:756
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
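A sketch of wiring incoming values on a PHI with the members above (Phi, Pred and V are assumed to exist and to have compatible types):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void wirePhi(PHINode *Phi, BasicBlock *Pred, Value *V) {
  // Give the predecessor an explicit incoming value...
  Phi->addIncoming(V, Pred);
  // ...or overwrite whatever it currently contributes.
  Phi->setIncomingValueForBlock(Pred, V);
  Value *Incoming = Phi->getIncomingValueForBlock(Pred);
  // A placeholder of the right type for edges that are filled in later.
  Value *Placeholder = PoisonValue::get(Phi->getType());
  (void)Incoming; (void)Placeholder;
}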
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
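A sketch of the PredicatedScalarEvolution queries above; the answers are valid only under the predicate accumulated so far, which is why runtime SCEV checks may be required:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

void queryPSE(PredicatedScalarEvolution &PSE, Value *Ptr) {
  const SCEV *PtrSCEV = PSE.getSCEV(Ptr);        // simplified under the predicate
  const SCEV *BTC = PSE.getBackedgeTakenCount(); // predicated backedge count
  bool NeedsChecks = !PSE.getPredicate().isAlwaysTrue();
  ScalarEvolution *SE = PSE.getSE();
  (void)PtrSCEV; (void)BTC; (void)NeedsChecks; (void)SE;
}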
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129
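A sketch of the usual pattern for reporting preserved analyses from a transform pass (the analysis types named here are standard ones, shown only for illustration):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

PreservedAnalyses reportPreservation(bool Changed) {
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  // Individual analyses known to survive the transform; a pass that keeps
  // the CFG intact could instead use PA.preserveSet<CFGAnalyses>().
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}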
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
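A sketch combining several of the ScalarEvolution queries above to reason about a trip count; TC is assumed to be the loop's trip count, Ty its integer type, and the constant 4 stands in for a vectorization factor:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

void reasonAboutTripCount(ScalarEvolution &SE, const Loop *L, Value *TC, Type *Ty) {
  const SCEV *TCExpr = SE.getSCEV(TC);
  const SCEV *VF = SE.getConstant(ConstantInt::get(cast<IntegerType>(Ty), 4));
  const SCEV *Rem = SE.getURemExpr(TCExpr, VF); // TC % VF
  SmallVector<const SCEV *, 2> Ops = {Rem, SE.getOne(Ty)};
  const SCEV *RemPlusOne = SE.getAddExpr(Ops);
  // Facts a loop transform commonly asks for.
  bool Invariant = SE.isLoopInvariant(TCExpr, L);
  bool Known = SE.isKnownPredicate(CmpInst::ICMP_ULT, Rem, TCExpr);
  unsigned ConstTC = SE.getSmallConstantTripCount(L);
  (void)RemPlusOne; (void)Invariant; (void)Known; (void)ConstTC;
}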
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
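SetVector's insert/pop_back_val pair is the usual de-duplicating worklist in this pass. A self-contained sketch; the traversal itself is illustrative and not taken from this file:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Collect Root and everything transitively using it. insert() returns false
// for elements already present, so nothing is processed twice.
static void collectTransitiveUsers(Instruction *Root,
                                   SetVector<Instruction *> &Visited) {
  SetVector<Instruction *> Worklist;
  Worklist.insert(Root);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    if (!Visited.insert(I))
      continue;
    for (User *U : I->users())
      if (auto *UI = dyn_cast<Instruction>(U))
        Worklist.insert(UI);
  }
}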
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:693
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
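The SmallVector plus SmallPtrSet pairing above is the standard "ordered set" idiom in this file. A tiny sketch; the helper itself is illustrative:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Keep insertion order in Ordered while Seen rejects duplicates; the bool of
// SmallPtrSet::insert() is true only for the first insertion of a value.
static void appendUnique(SmallVectorImpl<Value *> &Ordered,
                         SmallPtrSetImpl<Value *> &Seen, Value *V) {
  if (Seen.insert(V).second)
    Ordered.push_back(V);
}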
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
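Most of the TargetTransformInfo hooks above feed the cost model. A hedged sketch of a typical legality-then-cost query for a load that may need masking; the helper is hypothetical, and the real decisions in this pass also account for consecutiveness, gather/scatter, and interleaving:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static InstructionCost widenedLoadCost(const TargetTransformInfo &TTI,
                                       LoadInst *LI, VectorType *VecTy,
                                       bool NeedsMask) {
  Align Alignment = LI->getAlign();
  unsigned AS = LI->getPointerAddressSpace();
  if (NeedsMask) {
    // If a mask is required, only legal masked loads can be widened this way.
    if (!TTI.isLegalMaskedLoad(LI->getType(), Alignment))
      return InstructionCost::getInvalid();
    return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment, AS);
  }
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AS);
}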
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:331
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
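A small sketch of how the Type queries above compose; getScalarType() lets the same predicate handle both scalars and their vector forms (the predicate is illustrative):

#include "llvm/IR/Type.h"

using namespace llvm;

// True for iN / <k x iN> with N <= MaxBits; getScalarType() strips the
// vector wrapper, if any.
static bool isNarrowIntOrIntVector(Type *Ty, unsigned MaxBits) {
  Type *ScalarTy = Ty->getScalarType();
  return ScalarTy->isIntegerTy() && ScalarTy->getScalarSizeInBits() <= MaxBits;
}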
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2739
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2807
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:443
iterator end()
Definition: VPlan.h:2770
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2768
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:2817
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2798
bool empty() const
Definition: VPlan.h:2779
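A hedged sketch using only the VPBasicBlock interface listed above: placing an already-constructed recipe after the block's PHI-like recipes. The helper itself is not part of this file; "VPlan.h" is the in-tree internal header:

#include "VPlan.h"

using namespace llvm;

// Insert R as the first non-phi recipe of VPBB.
static void insertAfterPhis(VPBasicBlock *VPBB, VPRecipeBase *R) {
  VPBB->insert(R, VPBB->getFirstNonPhi());
}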
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1940
VPRegionBlock * getParent()
Definition: VPlan.h:498
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:175
void setName(const Twine &newName)
Definition: VPlan.h:491
VPlan * getPlan()
Definition: VPlan.cpp:148
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:153
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:533
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3293
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
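A sketch of the VPBuilder interface above, stitching a compare and a select onto the end of a block. The helper and value names are illustrative, and the include naming the planner's internal header is an assumption:

#include "LoopVectorizationPlanner.h" // Assumed location of the VPBuilder class.

using namespace llvm;

static VPValue *createGuardedSelect(VPBuilder &Builder, VPBasicBlock *VPBB,
                                    VPValue *A, VPValue *B, VPValue *OnTrue,
                                    VPValue *OnFalse) {
  // Append new VPInstructions at the end of VPBB.
  Builder.setInsertPoint(VPBB);
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_ULT, A, B);
  return Builder.createSelect(Cmp, OnTrue, OnFalse);
}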
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2472
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:421
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:399
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:411
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2677
VPValue * getStartValue() const
Definition: VPlan.h:2676
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1627
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1671
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1660
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1166
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:1172
unsigned getOpcode() const
Definition: VPlan.h:1266
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:1997
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2038
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2044
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2051
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2071
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:169
static VPLane getFirstLane()
Definition: VPlan.h:167
A value that is used outside the VPlan.
Definition: VPlan.h:678
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:718
VPBasicBlock * getParent()
Definition: VPlan.h:743
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:809
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1089
A recipe for handling reduction phis.
Definition: VPlan.h:1881
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1935
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1927
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2086
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:2872
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:2943
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2134
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes modeling a sequence of one or more output IR instructions that define ...
Definition: VPlan.h:835
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:895
This class can be used to assign names to VPValues.
Definition: VPlanValue.h:452
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:203
operand_range operands()
Definition: VPlanValue.h:278
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:258
unsigned getNumOperands() const
Definition: VPlanValue.h:252
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:253
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:247
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:77
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1302
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1270
user_iterator user_begin()
Definition: VPlanValue.h:129
unsigned getNumUsers() const
Definition: VPlanValue.h:112
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:173
user_iterator user_end()
Definition: VPlanValue.h:131
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:168
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1274
user_range users()
Definition: VPlanValue.h:133
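A minimal sketch of the VPValue use-list interface above: replaceUsesWithIf rewires only the uses matching a predicate (the helper is illustrative):

#include "VPlan.h"

using namespace llvm;

// Redirect every use of Old to New except the use(s) by Keep.
static void replaceUsesExcept(VPValue *Old, VPValue *New, VPUser *Keep) {
  Old->replaceUsesWithIf(New, [Keep](VPUser &U, unsigned /*Idx*/) {
    return &U != Keep;
  });
}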
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1571
A recipe for widening Call instructions.
Definition: VPlan.h:1456
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2597
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1367
A recipe for handling GEP instructions.
Definition: VPlan.h:1529
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1684
A common base class for widening memory operations.
Definition: VPlan.h:2291
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2299
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2334
Instruction & Ingredient
Definition: VPlan.h:2293
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2348
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2341
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2338
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1809
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1848
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1845
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a widened (vector) copy of its ingredient.
Definition: VPlan.h:1335
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:2973
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:783
VPBasicBlock * getEntry()
Definition: VPlan.h:3066
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3091
void setName(const Twine &newName)
Definition: VPlan.h:3122
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3094
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3070
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3084
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3176
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:993
VPBasicBlock * getPreheader()
Definition: VPlan.h:3195
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3157
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
Definition: VPlan.cpp:769
bool hasVF(ElementCount VF)
Definition: VPlan.h:3104
bool hasUF(unsigned UF) const
Definition: VPlan.h:3111
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3077
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3126
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:990
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:825
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3165
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3181
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3185
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1074
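A hedged sketch built only from the VPlan accessors listed above; the helper is not part of this file:

#include "VPlan.h"

using namespace llvm;

// Give the vector loop region a readable name, but only if Plan was built
// to cover the chosen VF and UF.
static void labelVectorLoop(VPlan &Plan, ElementCount VF, unsigned UF) {
  if (!Plan.hasVF(VF) || !Plan.hasUF(UF))
    return;
  Plan.getVectorLoopRegion()->setName("vector loop");
}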
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
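A tiny sketch of the Value helpers above; stripPointerCasts() is what makes pointer identity checks robust against bitcasts and zero-index GEPs (the predicate is illustrative):

#include "llvm/IR/Value.h"

using namespace llvm;

// True if A and B refer to the same pointer once bitcasts, all-zero GEPs and
// addrspacecasts are peeled off.
static bool sameUnderlyingPointer(const Value *A, const Value *B) {
  return A->stripPointerCasts() == B->stripPointerCasts();
}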
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:77
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:73
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:217
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:243
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:210
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:224
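The isKnown* comparators above are how possibly-scalable quantities are compared; a sketch (the clamp itself is illustrative):

#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Clamp VF to MaxVF, but only when the ordering is provable for every
// possible vscale; mixed fixed/scalable counts may be incomparable.
static ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
  return ElementCount::isKnownGT(VF, MaxVF) ? MaxVF : VF;
}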
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
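A sketch of the PatternMatch helpers above, recognising a single-use mul of sign- or zero-extended operands, i.e. the shape behind the extended multiply-accumulate reduction costs; the matcher function itself is illustrative:

#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Binds X and Y to the narrow operands when V is
// (zext|sext)(X) * (zext|sext)(Y) and the multiply has a single use.
static bool matchExtendedMul(Value *V, Value *&X, Value *&Y) {
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(X)),
                                 m_ZExtOrSExt(m_Value(Y)))));
}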
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1459
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3517
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:456
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1820
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7041
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:134
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2433
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
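A sketch of the interleave-group mask helpers above: extracting member 1 of a factor-3 group across VF=4 iterations yields <1, 4, 7, 10>. The wrapper is illustrative:

#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

// Mask that gathers lane `Member` of each group from a wide interleaved load.
static SmallVector<int, 16> memberMask(unsigned Member, unsigned Factor,
                                       unsigned VF) {
  return createStrideMask(/*Start=*/Member, /*Stride=*/Factor, VF);
}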
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1628
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
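A sketch combining the range helper all_of with isSafeToSpeculativelyExecute, both listed above; the block-level check is illustrative and ignores the extra context the vectorizer really consults:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

// True if every instruction in BB, terminator included, is safe to speculate.
static bool allSpeculatable(const BasicBlock &BB) {
  return all_of(BB, [](const Instruction &I) {
    return isSafeToSpeculativelyExecute(&I);
  });
}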
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1880
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:74
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:85
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:87
ElementCount End
Definition: VPlan.h:92
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1854
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:219
bool isFirstIteration() const
Definition: VPlan.h:231
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:374
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:382
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:378
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:348
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:247
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:418
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:421
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:361
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:414
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
VPValue * EVL
If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid value set during plan transf...
Definition: VPlan.h:252
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:393
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:297
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:257
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:398
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:404
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:401
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2363
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1495
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2402
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2419
static void addExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.