1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
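//
// As a simplified illustration, a scalar loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + C[i];
// is conceptually rewritten so that each 'wide' iteration processes VF
// consecutive elements:
//   for (i = 0; i + VF <= n; i += VF) A[i:i+VF] = B[i:i+VF] + C[i:i+VF];
// with any remaining iterations handled by a scalar epilogue or a predicated
// (tail-folded) vector body.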
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
170const char LLVMLoopVectorizeFollowupVectorized[] =
171    "llvm.loop.vectorize.followup_vectorized";
172const char LLVMLoopVectorizeFollowupEpilogue[] =
173    "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
181 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
185 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
191 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
198 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
204 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
208// and that predication is preferred; PreferPredicateTy lists the available options.
209// I.e., the vectorizer will try to fold the tail loop (epilogue) into the vector
210// body and predicate the instructions accordingly. If tail-folding fails, there
211// are different fallback strategies depending on these values:
213 enum Option {
217 };
218} // namespace PreferPredicateTy
219
221 "prefer-predicate-over-epilogue",
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(TailFoldingStyle::None),
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fall back to data-without-lane-mask.")));
258
260 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
262 "will be determined by the smallest type in the loop."));
263
265 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
271 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
275 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
279 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
283 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
288 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
293 "force-target-instruction-cost", cl::init(0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
299 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
305 "small-loop-cost", cl::init(20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
310 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
317 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
323 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
327 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
331 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
335 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the target's preference."));
344
346 "force-ordered-reductions", cl::init(false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
351 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
362// This flag enables the stress testing of the VPlan H-CFG construction in the
363// VPlan-native vectorization path. It must be used in conjunction with
364// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
365// verification of the H-CFGs built.
367 "vplan-build-stress-test", cl::init(false), cl::Hidden,
368 cl::desc(
369 "Build VPlan for every supported loop nest in the function and bail "
370 "out right after the build (stress test the VPlan H-CFG construction "
371 "in the VPlan-native vectorization path)."));
372
374 "interleave-loops", cl::init(true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
377 "vectorize-loops", cl::init(true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
394// Likelihood of bypassing the vectorized loop because assumptions about SCEV
395// variables not overflowing do not hold. See `emitSCEVChecks`.
396static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because pointers overlap. See
398// `emitMemRuntimeChecks`.
399static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after prolog. See `emitIterationCountCheck`.
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
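/// For example, assuming a typical x86-64 data layout, i32 is regular (its
/// type size and alloc size are both 32 bits), whereas x86_fp80 is irregular:
/// its type size is 80 bits but its alloc size is 128 bits, so an array of
/// x86_fp80 is not bitcast-compatible with <N x x86_fp80>.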
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
413
414/// A helper function that returns the reciprocal of the block probability of
415/// predicated blocks. If we return X, we are assuming the predicated block
416/// will execute once for every X iterations of the loop header.
417///
418/// TODO: We should use actual block probability here, if available. Currently,
419/// we always assume predicated blocks have a 50% chance of executing.
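/// For example, cost-model code that computes the cost of a predicated block
/// typically divides that cost by this value, modelling that the block is
/// expected to execute on only one out of every getReciprocalPredBlockProb()
/// iterations of the loop header.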
420static unsigned getReciprocalPredBlockProb() { return 2; }
421
422/// Returns "best known" trip count for the specified loop \p L as defined by
423/// the following procedure:
424/// 1) Returns exact trip count if it is known.
425/// 2) Returns expected trip count according to profile data if any.
426/// 3) Returns upper bound estimate if it is known.
427/// 4) Returns std::nullopt if all of the above failed.
428static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
429 Loop *L) {
430 // Check if exact trip count is known.
431 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
432 return ExpectedTC;
433
434 // Check if there is an expected trip count available from profile data.
436 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
437 return *EstimatedTC;
438
439 // Check if upper bound estimate is known.
440 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
441 return ExpectedTC;
442
443 return std::nullopt;
444}
445
446/// Return a vector containing interleaved elements from multiple
447/// smaller input vectors.
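/// For example, with an interleave factor of 2, interleaving <A0, A1, A2, A3>
/// and <B0, B1, B2, B3> yields <A0, B0, A1, B1, A2, B2, A3, B3>.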
449 const Twine &Name) {
450 unsigned Factor = Vals.size();
451 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
452
453 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
454#ifndef NDEBUG
455 for (Value *Val : Vals)
456 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
457#endif
458
459 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
460 // must use intrinsics to interleave.
461 if (VecTy->isScalableTy()) {
462 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
463 return Builder.CreateIntrinsic(
464 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
465 /*FMFSource=*/nullptr, Name);
466 }
467
468 // Fixed length. Start by concatenating all vectors into a wide vector.
469 Value *WideVec = concatenateVectors(Builder, Vals);
470
471 // Interleave the elements into the wide vector.
472 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
473 return Builder.CreateShuffleVector(
474 WideVec, createInterleaveMask(NumElts, Factor), Name);
475}
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found for a given vectorization factor.
503public:
506 const TargetLibraryInfo *TLI,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
521
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Create a new empty loop that will contain vectorized instructions later
531 /// on, while the old loop will be used as the scalar remainder. Control flow
532 /// is generated around the vectorized (and scalar epilogue) loops consisting
533 /// of various checks and bypasses. Return the pre-header block of the new
534 /// loop and the start value for the canonical induction, if it is != 0. The
535 /// latter is the case when vectorizing the epilogue loop. In the case of
536/// epilogue vectorization, this function is overridden to handle the more
537 /// complex control flow around the loops. \p ExpandedSCEVs is used to
538 /// look up SCEV expansions for expressions needed during skeleton creation.
539 virtual std::pair<BasicBlock *, Value *>
540 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
542 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
543 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
547
548 /// A helper function to scalarize a single Instruction in the innermost loop.
549 /// Generates a sequence of scalar instances for each lane between \p MinLane
550 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
551 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
552 /// Instr's operands.
553 void scalarizeInstruction(const Instruction *Instr,
554 VPReplicateRecipe *RepRecipe,
555 const VPIteration &Instance,
556 VPTransformState &State);
557
558 /// Try to vectorize interleaved access group \p Group with the base address
559 /// given in \p Addr, optionally masking the vector operations if \p
560 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
561 /// values in the vectorized loop.
563 ArrayRef<VPValue *> VPDefs,
565 ArrayRef<VPValue *> StoredValues,
566 VPValue *BlockInMask, bool NeedsMaskForGaps);
567
568 /// Fix the non-induction PHIs in \p Plan.
569 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
570
571 /// Create a new phi node for the induction variable \p OrigPhi to resume
572 /// iteration count in the scalar epilogue, from where the vectorized loop
573 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
574 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
575 /// and the resume values can come from an additional bypass block, the \p
576 /// AdditionalBypass pair provides information about the bypass block and the
577 /// end value on the edge from bypass to this loop.
579 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
580 ArrayRef<BasicBlock *> BypassBlocks,
581 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
582
583 /// Returns the original loop trip count.
584 Value *getTripCount() const { return TripCount; }
585
586 /// Used to set the trip count after ILV's construction and after the
587 /// preheader block has been executed. Note that this always holds the trip
588 /// count of the original loop for both main loop and epilogue vectorization.
589 void setTripCount(Value *TC) { TripCount = TC; }
590
591protected:
593
594 /// A small list of PHINodes.
596
597 /// A type for scalarized values in the new loop. Each value from the
598 /// original loop, when scalarized, is represented by UF x VF scalar values
599 /// in the new unrolled loop, where UF is the unroll factor and VF is the
600 /// vectorization factor.
602
603 /// Set up the values of the IVs correctly when exiting the vector loop.
604 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605 Value *VectorTripCount, Value *EndValue,
606 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
607 VPlan &Plan, VPTransformState &State);
608
609 /// Create the exit value of first order recurrences in the middle block and
610 /// update their users.
612 VPTransformState &State);
613
614 /// Iteratively sink the scalarized operands of a predicated instruction into
615 /// the block that was created for it.
616 void sinkScalarOperands(Instruction *PredInst);
617
618 /// Returns (and creates if needed) the trip count of the widened loop.
620
621 /// Returns a bitcasted value to the requested vector type.
622 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624 const DataLayout &DL);
625
626 /// Emit a bypass check to see if the vector trip count is zero, including if
627 /// it overflows.
629
630 /// Emit a bypass check to see if all of the SCEV assumptions we've
631 /// had to make are correct. Returns the block containing the checks or
632 /// nullptr if no checks have been added.
634
635 /// Emit bypass checks to check any memory assumptions we may have made.
636 /// Returns the block containing the checks or nullptr if no checks have been
637 /// added.
639
640 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
641 /// vector loop preheader, middle block and scalar preheader.
643
644 /// Create new phi nodes for the induction variables to resume iteration count
645 /// in the scalar epilogue, from where the vectorized loop left off.
646 /// In cases where the loop skeleton is more complicated (e.g., epilogue
647 /// vectorization) and the resume values can come from an additional bypass
648 /// block, the \p AdditionalBypass pair provides information about the bypass
649 /// block and the end value on the edge from bypass to this loop.
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
653
654 /// Complete the loop skeleton by adding debug MDs, creating appropriate
655 /// conditional branches in the middle block, preparing the builder and
656 /// running the verifier. Return the preheader of the completed vector loop.
658
659 /// Allow subclasses to override and print debug traces before/after vplan
660 /// execution, when trace information is requested.
661 virtual void printDebugTracesAtStart(){};
662 virtual void printDebugTracesAtEnd(){};
663
664 /// The original loop.
666
667 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
668 /// dynamic knowledge to simplify SCEV expressions and converts them to a
669 /// more usable form.
671
672 /// Loop Info.
674
675 /// Dominator Tree.
677
678 /// Target Library Info.
680
681 /// Target Transform Info.
683
684 /// Assumption Cache.
686
687 /// Interface to emit optimization remarks.
689
690 /// The vectorization SIMD factor to use. Each vector will have this many
691 /// vector elements.
693
695
696 /// The vectorization unroll factor to use. Each scalar is vectorized to this
697 /// many different vector instructions.
698 unsigned UF;
699
700 /// The builder that we use
702
703 // --- Vectorization state ---
704
705 /// The vector-loop preheader.
707
708 /// The scalar-loop preheader.
710
711 /// Middle Block between the vector and the scalar.
713
714 /// The unique ExitBlock of the scalar loop if one exists. Note that
715 /// there can be multiple exiting edges reaching this block.
717
718 /// The scalar loop body.
720
721 /// A list of all bypass blocks. The first block is the entry of the loop.
723
724 /// Store instructions that were predicated.
726
727 /// Trip count of the original loop.
728 Value *TripCount = nullptr;
729
730 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
732
733 /// The legality analysis.
735
736 /// The profitability analysis.
738
739 // Record whether runtime checks are added.
740 bool AddedSafetyChecks = false;
741
742 // Holds the end values for each induction variable. We save the end values
743 // so we can later fix-up the external users of the induction variables.
745
746 /// BFI and PSI are used to check for profile guided size optimizations.
749
750 // Whether this loop should be optimized for size based on profile guided size
751 // optimizations.
753
754 /// Structure to hold information about generated runtime checks, responsible
755 /// for cleaning the checks, if vectorization turns out unprofitable.
756 GeneratedRTChecks &RTChecks;
757
758 // Holds the resume values for reductions in the loops, used to set the
759 // correct start value of reduction PHIs when vectorizing the epilogue.
762};
763
765public:
768 const TargetLibraryInfo *TLI,
770 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
773 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
775 ElementCount::getFixed(1),
776 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
777 BFI, PSI, Check) {}
778};
779
780/// Encapsulate information regarding vectorization of a loop and its epilogue.
781/// This information is meant to be updated and used across two stages of
782/// epilogue vectorization.
785 unsigned MainLoopUF = 0;
787 unsigned EpilogueUF = 0;
792 Value *TripCount = nullptr;
794
796 ElementCount EVF, unsigned EUF)
797 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798 assert(EUF == 1 &&
799 "A high UF for the epilogue loop is likely not beneficial.");
800 }
801};
802
803/// An extension of the inner loop vectorizer that creates a skeleton for a
804/// vectorized loop that has its epilogue (residual) also vectorized.
805/// The idea is to run the vplan on a given loop twice: first to set up the
806/// skeleton and vectorize the main loop, and second to complete the skeleton
807/// from the first step and vectorize the epilogue. This is achieved by
808/// deriving two concrete strategy classes from this base class and invoking
809/// them in succession from the loop vectorizer planner.
811public:
819 GeneratedRTChecks &Checks)
821 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
822 CM, BFI, PSI, Checks),
823 EPI(EPI) {}
824
825 // Override this function to handle the more complex control flow around the
826 // three loops.
827 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
828 const SCEV2ValueTy &ExpandedSCEVs) final {
829 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
830 }
831
832 /// The interface for creating a vectorized skeleton using one of two
833 /// different strategies, each corresponding to one execution of the vplan
834 /// as described above.
835 virtual std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
837
838 /// Holds and updates state information required to vectorize the main loop
839 /// and its epilogue in two separate passes. This setup helps us avoid
840 /// regenerating and recomputing runtime safety checks. It also helps us to
841 /// shorten the iteration-count-check path length for the cases where the
842 /// iteration count of the loop is so small that the main vector loop is
843 /// completely skipped.
845};
846
847/// A specialized derived class of inner loop vectorizer that performs
848/// vectorization of *main* loops in the process of vectorizing loops and their
849/// epilogues.
851public:
859 GeneratedRTChecks &Check)
861 EPI, LVL, CM, BFI, PSI, Check) {}
862 /// Implements the interface for creating a vectorized skeleton using the
863 /// *main loop* strategy (i.e., the first pass of vplan execution).
864 std::pair<BasicBlock *, Value *>
865 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
866
867protected:
868 /// Emits an iteration count bypass check once for the main loop (when \p
869 /// ForEpilogue is false) and once for the epilogue loop (when \p
870 /// ForEpilogue is true).
871 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872 void printDebugTracesAtStart() override;
873 void printDebugTracesAtEnd() override;
874};
875
876// A specialized derived class of inner loop vectorizer that performs
877// vectorization of *epilogue* loops in the process of vectorizing loops and
878// their epilogues.
880public:
888 GeneratedRTChecks &Checks)
890 EPI, LVL, CM, BFI, PSI, Checks) {
892 }
893 /// Implements the interface for creating a vectorized skeleton using the
894 /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
895 std::pair<BasicBlock *, Value *>
896 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897
898protected:
899 /// Emits an iteration count bypass check after the main vector loop has
900 /// finished to see if there are any iterations left to execute by either
901 /// the vector epilogue or the scalar epilogue.
903 BasicBlock *Bypass,
904 BasicBlock *Insert);
905 void printDebugTracesAtStart() override;
906 void printDebugTracesAtEnd() override;
907};
908} // end namespace llvm
909
910/// Look for a meaningful debug location on the instruction or its
911/// operands.
913 if (!I)
914 return DebugLoc();
915
917 if (I->getDebugLoc() != Empty)
918 return I->getDebugLoc();
919
920 for (Use &Op : I->operands()) {
921 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
922 if (OpInst->getDebugLoc() != Empty)
923 return OpInst->getDebugLoc();
924 }
925
926 return I->getDebugLoc();
927}
928
929/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
930/// is passed, the message relates to that particular instruction.
931#ifndef NDEBUG
932static void debugVectorizationMessage(const StringRef Prefix,
933 const StringRef DebugMsg,
934 Instruction *I) {
935 dbgs() << "LV: " << Prefix << DebugMsg;
936 if (I != nullptr)
937 dbgs() << " " << *I;
938 else
939 dbgs() << '.';
940 dbgs() << '\n';
941}
942#endif
943
944/// Create an analysis remark that explains why vectorization failed
945///
946/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
947/// RemarkName is the identifier for the remark. If \p I is passed it is an
948/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
949/// the location of the remark. \return the remark object that can be
950/// streamed to.
952 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
953 Value *CodeRegion = TheLoop->getHeader();
954 DebugLoc DL = TheLoop->getStartLoc();
955
956 if (I) {
957 CodeRegion = I->getParent();
958 // If there is no debug location attached to the instruction, fall back to
959 // using the loop's.
960 if (I->getDebugLoc())
961 DL = I->getDebugLoc();
962 }
963
964 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
965}
966
967namespace llvm {
968
969/// Return a value for Step multiplied by VF.
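/// For example, for a fixed VF of 4 and Step = 2 this returns the constant 8;
/// for a scalable VF of vscale x 4 and Step = 2 it returns a runtime value
/// equal to vscale * 8.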
971 int64_t Step) {
972 assert(Ty->isIntegerTy() && "Expected an integer step");
973 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
974}
975
976/// Return the runtime value for VF.
978 return B.CreateElementCount(Ty, VF);
979}
980
982 Loop *OrigLoop) {
983 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
984 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
985
986 ScalarEvolution &SE = *PSE.getSE();
987 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
988}
989
991 const StringRef OREMsg, const StringRef ORETag,
992 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
993 Instruction *I) {
994 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
995 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
996 ORE->emit(
997 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
998 << "loop not vectorized: " << OREMsg);
999}
1000
1001void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1002 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1003 Instruction *I) {
1005 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1006 ORE->emit(
1007 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1008 << Msg);
1009}
1010
1011/// Report successful vectorization of the loop. In case an outer loop is
1012/// vectorized, prepend "outer" to the vectorization remark.
1014 VectorizationFactor VF, unsigned IC) {
1016 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1017 nullptr));
1018 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1019 ORE->emit([&]() {
1020 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1021 TheLoop->getHeader())
1022 << "vectorized " << LoopType << "loop (vectorization width: "
1023 << ore::NV("VectorizationFactor", VF.Width)
1024 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1025 });
1026}
1027
1028} // end namespace llvm
1029
1030#ifndef NDEBUG
1031/// \return string containing a file name and a line # for the given loop.
1032static std::string getDebugLocString(const Loop *L) {
1033 std::string Result;
1034 if (L) {
1035 raw_string_ostream OS(Result);
1036 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1037 LoopDbgLoc.print(OS);
1038 else
1039 // Just print the module name.
1040 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1041 OS.flush();
1042 }
1043 return Result;
1044}
1045#endif
1046
1047namespace llvm {
1048
1049// Loop vectorization cost-model hint for how the scalar epilogue loop should be
1050// lowered.
1052
1053 // The default: allowing scalar epilogues.
1055
1056 // Vectorization with OptForSize: don't allow epilogues.
1058
1059 // A special case of vectorization with OptForSize: loops with a very small
1060 // trip count are considered for vectorization under OptForSize, thereby
1061 // making sure the cost of their loop body is dominant, free of runtime
1062 // guards and scalar iteration overheads.
1064
1065 // Loop hint predicate indicating an epilogue is undesired.
1067
1068 // Directive indicating we must either tail fold or not vectorize
1071
1072using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073
1074/// LoopVectorizationCostModel - estimates the expected speedups due to
1075/// vectorization.
1076/// In many cases vectorization is not profitable. This can happen for a
1077/// number of reasons. In this class we mainly attempt to predict the
1078/// expected speedup/slowdowns due to the supported instruction set. We use the
1079/// TargetTransformInfo to query the different backends for the cost of
1080/// different operations.
1082public:
1086 const TargetTransformInfo &TTI,
1092 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1093 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1094 Hints(Hints), InterleaveInfo(IAI) {}
1095
1096 /// \return An upper bound for the vectorization factors (both fixed and
1097 /// scalable). If the factors are 0, vectorization and interleaving should be
1098 /// avoided up front.
1099 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1100
1101 /// \return True if runtime checks are required for vectorization, and false
1102 /// otherwise.
1103 bool runtimeChecksRequired();
1104
1105 /// Setup cost-based decisions for user vectorization factor.
1106 /// \return true if the UserVF is a feasible VF to be chosen.
1110 return expectedCost(UserVF).first.isValid();
1111 }
1112
1113 /// \return The size (in bits) of the smallest and widest types in the code
1114 /// that needs to be vectorized. We ignore values that remain scalar such as
1115 /// 64 bit loop indices.
1116 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1117
1118 /// \return The desired interleave count.
1119 /// If interleave count has been specified by metadata it will be returned.
1120 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1121 /// are the selected vectorization factor and the cost of the selected VF.
1122 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1123
1124 /// A memory access instruction may be vectorized in more than one way.
1125 /// The form of the instruction after vectorization depends on cost.
1126 /// This function takes cost-based decisions for Load/Store instructions
1127 /// and collects them in a map. This decision map is used for building
1128 /// the lists of loop-uniform and loop-scalar instructions.
1129 /// The calculated cost is saved with the widening decision in order to
1130 /// avoid redundant calculations.
1132
1133 /// A call may be vectorized in different ways depending on whether we have
1134 /// vectorized variants available and whether the target supports masking.
1135 /// This function analyzes all calls in the function at the supplied VF,
1136 /// makes a decision based on the costs of available options, and stores that
1137 /// decision in a map for use in planning and plan execution.
1139
1140 /// A struct that represents some properties of the register usage
1141 /// of a loop.
1143 /// Holds the number of loop invariant values that are used in the loop.
1144 /// The key is ClassID of target-provided register class.
1146 /// Holds the maximum number of concurrent live intervals in the loop.
1147 /// The key is ClassID of target-provided register class.
1149 };
1150
1151 /// \return Returns information about the register usages of the loop for the
1152 /// given vectorization factors.
1155
1156 /// Collect values we want to ignore in the cost model.
1157 void collectValuesToIgnore();
1158
1159 /// Collect all element types in the loop for which widening is needed.
1161
1162 /// Split reductions into those that happen in the loop, and those that happen
1163 /// outside. In-loop reductions are collected into InLoopReductions.
1165
1166 /// Returns true if we should use strict in-order reductions for the given
1167 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1168 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1169 /// of FP operations.
1170 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1171 return !Hints->allowReordering() && RdxDesc.isOrdered();
1172 }
1173
1174 /// \returns The smallest bitwidth each instruction can be represented with.
1175 /// The vector equivalents of these instructions should be truncated to this
1176 /// type.
1178 return MinBWs;
1179 }
1180
1181 /// \returns True if it is more profitable to scalarize instruction \p I for
1182 /// vectorization factor \p VF.
1184 assert(VF.isVector() &&
1185 "Profitable to scalarize relevant only for VF > 1.");
1186 assert(
1187 TheLoop->isInnermost() &&
1188 "cost-model should not be used for outer loops (in VPlan-native path)");
1189
1190 auto Scalars = InstsToScalarize.find(VF);
1191 assert(Scalars != InstsToScalarize.end() &&
1192 "VF not yet analyzed for scalarization profitability");
1193 return Scalars->second.contains(I);
1194 }
1195
1196 /// Returns true if \p I is known to be uniform after vectorization.
1198 assert(
1199 TheLoop->isInnermost() &&
1200 "cost-model should not be used for outer loops (in VPlan-native path)");
1201 // Pseudo probe needs to be duplicated for each unrolled iteration and
1202 // vector lane so that profiled loop trip count can be accurately
1203 // accumulated instead of being under counted.
1204 if (isa<PseudoProbeInst>(I))
1205 return false;
1206
1207 if (VF.isScalar())
1208 return true;
1209
1210 auto UniformsPerVF = Uniforms.find(VF);
1211 assert(UniformsPerVF != Uniforms.end() &&
1212 "VF not yet analyzed for uniformity");
1213 return UniformsPerVF->second.count(I);
1214 }
1215
1216 /// Returns true if \p I is known to be scalar after vectorization.
1218 assert(
1219 TheLoop->isInnermost() &&
1220 "cost-model should not be used for outer loops (in VPlan-native path)");
1221 if (VF.isScalar())
1222 return true;
1223
1224 auto ScalarsPerVF = Scalars.find(VF);
1225 assert(ScalarsPerVF != Scalars.end() &&
1226 "Scalar values are not calculated for VF");
1227 return ScalarsPerVF->second.count(I);
1228 }
1229
1230 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1231 /// for vectorization factor \p VF.
1233 return VF.isVector() && MinBWs.contains(I) &&
1234 !isProfitableToScalarize(I, VF) &&
1236 }
1237
1238 /// Decision that was taken during cost calculation for memory instruction.
1241 CM_Widen, // For consecutive accesses with stride +1.
1242 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1249
1250 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1251 /// instruction \p I and vector width \p VF.
1254 assert(VF.isVector() && "Expected VF >=2");
1255 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1256 }
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// interleaving group \p Grp and vector width \p VF.
1263 assert(VF.isVector() && "Expected VF >=2");
1264 /// Broadcast this decision to all instructions inside the group.
1265 /// But the cost will be assigned to one instruction only.
1266 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1267 if (auto *I = Grp->getMember(i)) {
1268 if (Grp->getInsertPos() == I)
1269 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1270 else
1271 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1272 }
1273 }
1274 }
1275
1276 /// Return the cost model decision for the given instruction \p I and vector
1277 /// width \p VF. Return CM_Unknown if this instruction did not pass
1278 /// through the cost modeling.
1280 assert(VF.isVector() && "Expected VF to be a vector VF");
1281 assert(
1282 TheLoop->isInnermost() &&
1283 "cost-model should not be used for outer loops (in VPlan-native path)");
1284
1285 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1286 auto Itr = WideningDecisions.find(InstOnVF);
1287 if (Itr == WideningDecisions.end())
1288 return CM_Unknown;
1289 return Itr->second.first;
1290 }
1291
1292 /// Return the vectorization cost for the given instruction \p I and vector
1293 /// width \p VF.
1295 assert(VF.isVector() && "Expected VF >=2");
1296 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1297 assert(WideningDecisions.contains(InstOnVF) &&
1298 "The cost is not calculated");
1299 return WideningDecisions[InstOnVF].second;
1300 }
1301
1306 std::optional<unsigned> MaskPos;
1308 };
1309
1311 Function *Variant, Intrinsic::ID IID,
1312 std::optional<unsigned> MaskPos,
1314 assert(!VF.isScalar() && "Expected vector VF");
1315 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1316 MaskPos, Cost};
1317 }
1318
1320 ElementCount VF) const {
1321 assert(!VF.isScalar() && "Expected vector VF");
1322 return CallWideningDecisions.at(std::make_pair(CI, VF));
1323 }
1324
1325 /// Return True if instruction \p I is an optimizable truncate whose operand
1326 /// is an induction variable. Such a truncate will be removed by adding a new
1327 /// induction variable with the destination type.
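 /// For example, given an i64 induction variable %iv, a non-free
 /// 'trunc i64 %iv to i32' can be optimized by introducing a new i32
 /// induction variable that replaces the truncate.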
1329 // If the instruction is not a truncate, return false.
1330 auto *Trunc = dyn_cast<TruncInst>(I);
1331 if (!Trunc)
1332 return false;
1333
1334 // Get the source and destination types of the truncate.
1335 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1336 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1337
1338 // If the truncate is free for the given types, return false. Replacing a
1339 // free truncate with an induction variable would add an induction variable
1340 // update instruction to each iteration of the loop. We exclude from this
1341 // check the primary induction variable since it will need an update
1342 // instruction regardless.
1343 Value *Op = Trunc->getOperand(0);
1344 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1345 return false;
1346
1347 // If the truncated value is not an induction variable, return false.
1348 return Legal->isInductionPhi(Op);
1349 }
1350
1351 /// Collects the instructions to scalarize for each predicated instruction in
1352 /// the loop.
1354
1355 /// Collect Uniform and Scalar values for the given \p VF.
1356 /// The sets depend on CM decision for Load/Store instructions
1357 /// that may be vectorized as interleave, gather-scatter or scalarized.
1358 /// Also make a decision on what to do about call instructions in the loop
1359 /// at that VF -- scalarize, call a known vector routine, or call a
1360 /// vector intrinsic.
1362 // Do the analysis once.
1363 if (VF.isScalar() || Uniforms.contains(VF))
1364 return;
1367 collectLoopUniforms(VF);
1368 collectLoopScalars(VF);
1369 }
1370
1371 /// Returns true if the target machine supports masked store operation
1372 /// for the given \p DataType and kind of access to \p Ptr.
1373 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1374 return Legal->isConsecutivePtr(DataType, Ptr) &&
1375 TTI.isLegalMaskedStore(DataType, Alignment);
1376 }
1377
1378 /// Returns true if the target machine supports masked load operation
1379 /// for the given \p DataType and kind of access to \p Ptr.
1380 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1381 return Legal->isConsecutivePtr(DataType, Ptr) &&
1382 TTI.isLegalMaskedLoad(DataType, Alignment);
1383 }
1384
1385 /// Returns true if the target machine can represent \p V as a masked gather
1386 /// or scatter operation.
1388 bool LI = isa<LoadInst>(V);
1389 bool SI = isa<StoreInst>(V);
1390 if (!LI && !SI)
1391 return false;
1392 auto *Ty = getLoadStoreType(V);
1394 if (VF.isVector())
1395 Ty = VectorType::get(Ty, VF);
1396 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1397 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1398 }
1399
1400 /// Returns true if the target machine supports all of the reduction
1401 /// variables found for the given VF.
1403 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1404 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1405 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1406 }));
1407 }
1408
1409 /// Given costs for both strategies, return true if the scalar predication
1410 /// lowering should be used for div/rem. This incorporates an override
1411 /// option so it is not simply a cost comparison.
1413 InstructionCost SafeDivisorCost) const {
1414 switch (ForceSafeDivisor) {
1415 case cl::BOU_UNSET:
1416 return ScalarCost < SafeDivisorCost;
1417 case cl::BOU_TRUE:
1418 return false;
1419 case cl::BOU_FALSE:
1420 return true;
1421 };
1422 llvm_unreachable("impossible case value");
1423 }
1424
1425 /// Returns true if \p I is an instruction which requires predication and
1426 /// for which our chosen predication strategy is scalarization (i.e. we
1427 /// don't have an alternate strategy such as masking available).
1428 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1430
1431 /// Returns true if \p I is an instruction that needs to be predicated
1432 /// at runtime. The result is independent of the predication mechanism.
1433 /// Superset of instructions that return true for isScalarWithPredication.
1434 bool isPredicatedInst(Instruction *I) const;
1435
1436 /// Return the costs for our two available strategies for lowering a
1437 /// div/rem operation which requires speculating at least one lane.
1438 /// First result is for scalarization (will be invalid for scalable
1439 /// vectors); second is for the safe-divisor strategy.
1440 std::pair<InstructionCost, InstructionCost>
1442 ElementCount VF) const;
1443
1444 /// Returns true if \p I is a memory instruction with consecutive memory
1445 /// access that can be widened.
1447
1448 /// Returns true if \p I is a memory instruction in an interleaved-group
1449 /// of memory accesses that can be vectorized with wide vector loads/stores
1450 /// and shuffles.
1452
1453 /// Check if \p Instr belongs to any interleaved access group.
1455 return InterleaveInfo.isInterleaved(Instr);
1456 }
1457
1458 /// Get the interleaved access group that \p Instr belongs to.
1461 return InterleaveInfo.getInterleaveGroup(Instr);
1462 }
1463
1464 /// Returns true if we're required to use a scalar epilogue for at least
1465 /// the final iteration of the original loop.
1466 bool requiresScalarEpilogue(bool IsVectorizing) const {
1468 return false;
1469 // If we might exit from anywhere but the latch, must run the exiting
1470 // iteration in scalar form.
1472 return true;
1473 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1474 }
1475
1476 /// Returns true if we're required to use a scalar epilogue for at least
1477 /// the final iteration of the original loop for all VFs in \p Range.
1478 /// A scalar epilogue must either be required for all VFs in \p Range or for
1479 /// none.
1481 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1482 return requiresScalarEpilogue(VF.isVector());
1483 };
1484 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1485 assert(
1486 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1487 "all VFs in range must agree on whether a scalar epilogue is required");
1488 return IsRequired;
1489 }
1490
1491 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1492 /// loop hint annotation.
1494 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1495 }
1496
1497 /// Returns the TailFoldingStyle that is best for the current loop.
1498 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1499 if (!ChosenTailFoldingStyle)
1501 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1502 : ChosenTailFoldingStyle->second;
1503 }
1504
1505 /// Selects and saves the TailFoldingStyle for two options: whether or not
1506 /// the IV update may overflow.
1507 /// \param IsScalableVF true if scalable vector factors enabled.
1508 /// \param UserIC User specific interleave count.
1509 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1510 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1512 ChosenTailFoldingStyle =
1514 return;
1515 }
1516
1517 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1518 ChosenTailFoldingStyle = std::make_pair(
1519 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1520 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1521 return;
1522 }
1523
1524 // Set styles when forced.
1525 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1526 ForceTailFoldingStyle.getValue());
1528 return;
1529 // Override forced styles if needed.
1530 // FIXME: use actual opcode/data type for analysis here.
1531 // FIXME: Investigate opportunity for fixed vector factor.
1532 bool EVLIsLegal =
1533 IsScalableVF && UserIC <= 1 &&
1534 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1536 // FIXME: implement support for max safe dependency distance.
1538 // FIXME: remove this once reductions are supported.
1540 if (!EVLIsLegal) {
1541 // If for some reason EVL mode is unsupported, fallback to
1542 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1543 // in a generic way.
1544 ChosenTailFoldingStyle =
1547 LLVM_DEBUG(
1548 dbgs()
1549 << "LV: Preference for VP intrinsics indicated. Will "
1550 "not try to generate VP Intrinsics "
1551 << (UserIC > 1
1552 ? "since interleave count specified is greater than 1.\n"
1553 : "due to non-interleaving reasons.\n"));
1554 }
1555 }
1556
1557 /// Returns true if all loop blocks should be masked to fold tail loop.
1558 bool foldTailByMasking() const {
1559 // TODO: check if it is possible to check for None style independent of
1560 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1562 }
1563
1564 /// Returns true if the instructions in this block require predication
1565 /// for any reason, e.g. because tail folding now requires a predicate
1566 /// or because the block in the original loop was predicated.
1569 }
1570
1571 /// Returns true if VP intrinsics with explicit vector length support should
1572 /// be generated in the tail folded loop.
1573 bool foldTailWithEVL() const {
1575 // FIXME: remove this once vp_reverse is supported.
1576 none_of(
1577 WideningDecisions,
1578 [](const std::pair<std::pair<Instruction *, ElementCount>,
1579 std::pair<InstWidening, InstructionCost>>
1580 &Data) { return Data.second.first == CM_Widen_Reverse; });
1581 }
1582
1583 /// Returns true if the Phi is part of an inloop reduction.
1584 bool isInLoopReduction(PHINode *Phi) const {
1585 return InLoopReductions.contains(Phi);
1586 }
1587
1588 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1589 /// with factor VF. Return the cost of the instruction, including
1590 /// scalarization overhead if it's needed.
1592
1593 /// Estimate cost of a call instruction CI if it were vectorized with factor
1594 /// VF. Return the cost of the instruction, including scalarization overhead
1595 /// if it's needed.
1597
1598 /// Invalidates decisions already taken by the cost model.
1600 WideningDecisions.clear();
1601 CallWideningDecisions.clear();
1602 Uniforms.clear();
1603 Scalars.clear();
1604 }
1605
1606 /// The vectorization cost is a combination of the cost itself and a boolean
1607 /// indicating whether any of the contributing operations will actually
1608 /// operate on vector values after type legalization in the backend. If this
1609 /// latter value is false, then all operations will be scalarized (i.e. no
1610 /// vectorization has actually taken place).
1611 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1612
1613 /// Returns the expected execution cost. The unit of the cost does
1614 /// not matter because we use the 'cost' units to compare different
1615 /// vector widths. The cost that is returned is *not* normalized by
1616 /// the factor width. If \p Invalid is not nullptr, this function
1617 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1618 /// each instruction that has an Invalid cost for the given VF.
1622
1623 bool hasPredStores() const { return NumPredStores > 0; }
1624
1625 /// Returns true if epilogue vectorization is considered profitable, and
1626 /// false otherwise.
1627 /// \p VF is the vectorization factor chosen for the original loop.
1629
1630private:
1631 unsigned NumPredStores = 0;
1632
1633 /// \return An upper bound for the vectorization factors for both
1634 /// fixed and scalable vectorization, where the minimum-known number of
1635 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1636 /// disabled or unsupported, then the scalable part will be equal to
1637 /// ElementCount::getScalable(0).
1638 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1639 ElementCount UserVF,
1640 bool FoldTailByMasking);
1641
1642 /// \return the maximized element count based on the target's vector
1643 /// registers and the loop trip-count, but limited to a maximum safe VF.
1644 /// This is a helper function of computeFeasibleMaxVF.
1645 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1646 unsigned SmallestType,
1647 unsigned WidestType,
1648 ElementCount MaxSafeVF,
1649 bool FoldTailByMasking);
1650
1651 /// \return the maximum legal scalable VF, based on the safe max number
1652 /// of elements.
1653 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1654
1655 /// Returns the execution time cost of an instruction for a given vector
1656 /// width. Vector width of one means scalar.
1657 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1658
1659 /// The cost-computation logic from getInstructionCost which provides
1660 /// the vector type as an output parameter.
1661 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1662 Type *&VectorTy);
1663
1664 /// Return the cost of instructions in an inloop reduction pattern, if I is
1665 /// part of that pattern.
1666 std::optional<InstructionCost>
1667 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1669
1670 /// Calculate vectorization cost of memory instruction \p I.
1671 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for scalarized memory instruction.
1674 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1675
1676 /// The cost computation for interleaving group of memory instructions.
1677 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1678
1679 /// The cost computation for Gather/Scatter instruction.
1680 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1681
1682 /// The cost computation for widening instruction \p I with consecutive
1683 /// memory access.
1684 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1685
1686 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1687 /// Load: scalar load + broadcast.
1688 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1689 /// element)
1690 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1691
1692 /// Estimate the overhead of scalarizing an instruction. This is a
1693 /// convenience wrapper for the type-based getScalarizationOverhead API.
1694 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1696
1697 /// Returns true if an artificially high cost for emulated masked memrefs
1698 /// should be used.
1699 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1700
1701 /// Map of scalar integer values to the smallest bitwidth they can be legally
1702 /// represented as. The vector equivalents of these values should be truncated
1703 /// to this type.
1705
1706 /// A type representing the costs for instructions if they were to be
1707 /// scalarized rather than vectorized. The entries are Instruction-Cost
1708 /// pairs.
1709 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1710
1711 /// A set containing all BasicBlocks that are known to be present after
1712 /// vectorization as predicated blocks.
1714 PredicatedBBsAfterVectorization;
1715
1716 /// Records whether it is allowed to have the original scalar loop execute at
1717 /// least once. This may be needed as a fallback loop in case runtime
1718 /// aliasing/dependence checks fail, or to handle the tail/remainder
1719 /// iterations when the trip count is unknown or is not divisible by the VF,
1720 /// or as a peel-loop to handle gaps in interleave-groups.
1721 /// Under optsize and when the trip count is very small we don't allow any
1722 /// iterations to execute in the scalar loop.
1723 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1724
1725 /// Controls the finally chosen tail folding style. The first element is used
1726 /// if the IV update may overflow; the second element is used if it does not.
1727 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1728 ChosenTailFoldingStyle;
1729
1730 /// A map holding scalar costs for different vectorization factors. The
1731 /// presence of a cost for an instruction in the mapping indicates that the
1732 /// instruction will be scalarized when vectorizing with the associated
1733 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1735
1736 /// Holds the instructions known to be uniform after vectorization.
1737 /// The data is collected per VF.
1739
1740 /// Holds the instructions known to be scalar after vectorization.
1741 /// The data is collected per VF.
1743
1744 /// Holds the instructions (address computations) that are forced to be
1745 /// scalarized.
1747
1748 /// PHINodes of the reductions that should be expanded in-loop.
1749 SmallPtrSet<PHINode *, 4> InLoopReductions;
1750
1751 /// A Map of inloop reduction operations and their immediate chain operand.
1752 /// FIXME: This can be removed once reductions can be costed correctly in
1753 /// VPlan. This was added to allow quick lookup of the inloop operations.
1754 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1755
1756 /// Returns the expected difference in cost from scalarizing the expression
1757 /// feeding a predicated instruction \p PredInst. The instructions to
1758 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1759 /// non-negative return value implies the expression will be scalarized.
1760 /// Currently, only single-use chains are considered for scalarization.
1761 InstructionCost computePredInstDiscount(Instruction *PredInst,
1762 ScalarCostsTy &ScalarCosts,
1763 ElementCount VF);
1764
1765 /// Collect the instructions that are uniform after vectorization. An
1766 /// instruction is uniform if we represent it with a single scalar value in
1767 /// the vectorized loop corresponding to each vector iteration. Examples of
1768 /// uniform instructions include pointer operands of consecutive or
1769 /// interleaved memory accesses. Note that although uniformity implies an
1770 /// instruction will be scalar, the reverse is not true. In general, a
1771 /// scalarized instruction will be represented by VF scalar values in the
1772 /// vectorized loop, each corresponding to an iteration of the original
1773 /// scalar loop.
1774 void collectLoopUniforms(ElementCount VF);
1775
1776 /// Collect the instructions that are scalar after vectorization. An
1777 /// instruction is scalar if it is known to be uniform or will be scalarized
1778 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1779 /// to the list if they are used by a load/store instruction that is marked as
1780 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1781 /// VF values in the vectorized loop, each corresponding to an iteration of
1782 /// the original scalar loop.
1783 void collectLoopScalars(ElementCount VF);
1784
1785 /// Keeps cost model vectorization decision and cost for instructions.
1786 /// Right now it is used for memory instructions only.
1788 std::pair<InstWidening, InstructionCost>>;
1789
1790 DecisionList WideningDecisions;
1791
1792 using CallDecisionList =
1793 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1794
1795 CallDecisionList CallWideningDecisions;
1796
1797 /// Returns true if \p V is expected to be vectorized and it needs to be
1798 /// extracted.
1799 bool needsExtract(Value *V, ElementCount VF) const {
1800 Instruction *I = dyn_cast<Instruction>(V);
1801 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1803 return false;
1804
1805 // Assume we can vectorize V (and hence we need extraction) if the
1806 // scalars are not computed yet. This can happen, because it is called
1807 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1808 // the scalars are collected. That should be a safe assumption in most
1809 // cases, because we check if the operands have vectorizable types
1810 // beforehand in LoopVectorizationLegality.
1811 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1812 };
1813
1814 /// Returns a range containing only operands needing to be extracted.
1815 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1816 ElementCount VF) const {
1818 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1819 }
1820
1821public:
1822 /// The loop that we evaluate.
1824
1825 /// Predicated scalar evolution analysis.
1827
1828 /// Loop Info analysis.
1830
1831 /// Vectorization legality.
1833
1834 /// Vector target information.
1836
1837 /// Target Library Info.
1839
1840 /// Demanded bits analysis.
1842
1843 /// Assumption cache.
1845
1846 /// Interface to emit optimization remarks.
1848
1850
1851 /// Loop Vectorize Hint.
1853
1854 /// The interleave access information contains groups of interleaved accesses
1855 /// with the same stride and close to each other.
1857
1858 /// Values to ignore in the cost model.
1860
1861 /// Values to ignore in the cost model when VF > 1.
1863
1864 /// All element types found in the loop.
1866};
1867} // end namespace llvm
1868
1869namespace {
1870/// Helper struct to manage generating runtime checks for vectorization.
1871///
1872/// The runtime checks are created up-front in temporary blocks, un-linked from
1873/// the existing IR, to allow better estimation of their cost. After deciding to
1874/// vectorize, the checks are moved back into the IR. If we decide not to
1875/// vectorize, the temporary blocks are removed completely.
1876class GeneratedRTChecks {
1877 /// Basic block which contains the generated SCEV checks, if any.
1878 BasicBlock *SCEVCheckBlock = nullptr;
1879
1880 /// The value representing the result of the generated SCEV checks. If it is
1881 /// nullptr, either no SCEV checks have been generated or they have been used.
1882 Value *SCEVCheckCond = nullptr;
1883
1884 /// Basic block which contains the generated memory runtime checks, if any.
1885 BasicBlock *MemCheckBlock = nullptr;
1886
1887 /// The value representing the result of the generated memory runtime checks.
1888 /// If it is nullptr, either no memory runtime checks have been generated or
1889 /// they have been used.
1890 Value *MemRuntimeCheckCond = nullptr;
1891
1892 DominatorTree *DT;
1893 LoopInfo *LI;
1895
1896 SCEVExpander SCEVExp;
1897 SCEVExpander MemCheckExp;
1898
1899 bool CostTooHigh = false;
1900 const bool AddBranchWeights;
1901
1902 Loop *OuterLoop = nullptr;
1903
1904public:
1905 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1907 bool AddBranchWeights)
1908 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1909 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1910
1911 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1912 /// accurately estimate the cost of the runtime checks. The blocks are
1913 /// un-linked from the IR and are added back during vector code generation. If
1914 /// there is no vector code generation, the check blocks are removed
1915 /// completely.
1916 void Create(Loop *L, const LoopAccessInfo &LAI,
1917 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1918
1919 // Hard cutoff to limit compile-time increase in case a very large number of
1920 // runtime checks needs to be generated.
1921 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1922 // profile info.
1923 CostTooHigh =
1925 if (CostTooHigh)
1926 return;
1927
1928 BasicBlock *LoopHeader = L->getHeader();
1929 BasicBlock *Preheader = L->getLoopPreheader();
1930
1931 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1932 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1933 // may be used by SCEVExpander. The blocks will be un-linked from their
1934 // predecessors and removed from LI & DT at the end of the function.
1935 if (!UnionPred.isAlwaysTrue()) {
1936 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1937 nullptr, "vector.scevcheck");
1938
1939 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1940 &UnionPred, SCEVCheckBlock->getTerminator());
1941 }
1942
1943 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1944 if (RtPtrChecking.Need) {
1945 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1946 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1947 "vector.memcheck");
1948
1949 auto DiffChecks = RtPtrChecking.getDiffChecks();
1950 if (DiffChecks) {
1951 Value *RuntimeVF = nullptr;
1952 MemRuntimeCheckCond = addDiffRuntimeChecks(
1953 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1954 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1955 if (!RuntimeVF)
1956 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1957 return RuntimeVF;
1958 },
1959 IC);
1960 } else {
1961 MemRuntimeCheckCond = addRuntimeChecks(
1962 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1964 }
1965 assert(MemRuntimeCheckCond &&
1966 "no RT checks generated although RtPtrChecking "
1967 "claimed checks are required");
1968 }
1969
1970 if (!MemCheckBlock && !SCEVCheckBlock)
1971 return;
1972
1973 // Unhook the temporary blocks with the checks, and update various places
1974 // accordingly.
1975 if (SCEVCheckBlock)
1976 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1977 if (MemCheckBlock)
1978 MemCheckBlock->replaceAllUsesWith(Preheader);
1979
1980 if (SCEVCheckBlock) {
1981 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1982 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1983 Preheader->getTerminator()->eraseFromParent();
1984 }
1985 if (MemCheckBlock) {
1986 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1987 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1988 Preheader->getTerminator()->eraseFromParent();
1989 }
1990
1991 DT->changeImmediateDominator(LoopHeader, Preheader);
1992 if (MemCheckBlock) {
1993 DT->eraseNode(MemCheckBlock);
1994 LI->removeBlock(MemCheckBlock);
1995 }
1996 if (SCEVCheckBlock) {
1997 DT->eraseNode(SCEVCheckBlock);
1998 LI->removeBlock(SCEVCheckBlock);
1999 }
2000
2001 // Outer loop is used as part of the later cost calculations.
2002 OuterLoop = L->getParentLoop();
2003 }
2004
2005 InstructionCost getCost() {
2006 if (SCEVCheckBlock || MemCheckBlock)
2007 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2008
2009 if (CostTooHigh) {
2011 Cost.setInvalid();
2012 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2013 return Cost;
2014 }
2015
2016 InstructionCost RTCheckCost = 0;
2017 if (SCEVCheckBlock)
2018 for (Instruction &I : *SCEVCheckBlock) {
2019 if (SCEVCheckBlock->getTerminator() == &I)
2020 continue;
2023 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2024 RTCheckCost += C;
2025 }
2026 if (MemCheckBlock) {
2027 InstructionCost MemCheckCost = 0;
2028 for (Instruction &I : *MemCheckBlock) {
2029 if (MemCheckBlock->getTerminator() == &I)
2030 continue;
2033 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2034 MemCheckCost += C;
2035 }
2036
2037 // If the runtime memory checks are being created inside an outer loop
2038 // we should find out if these checks are outer loop invariant. If so,
2039 // the checks will likely be hoisted out and so the effective cost will be
2040 // reduced according to the outer loop trip count.
2041 if (OuterLoop) {
2042 ScalarEvolution *SE = MemCheckExp.getSE();
2043 // TODO: If profitable, we could refine this further by analysing every
2044 // individual memory check, since there could be a mixture of loop
2045 // variant and invariant checks that mean the final condition is
2046 // variant.
2047 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2048 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2049 // It seems reasonable to assume that we can reduce the effective
2050 // cost of the checks even when we know nothing about the trip
2051 // count. Assume that the outer loop executes at least twice.
2052 unsigned BestTripCount = 2;
2053
2054 // If the exact trip count is known, use that.
2055 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2056 BestTripCount = SmallTC;
2058 // Else use profile data if available.
2059 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2060 BestTripCount = *EstimatedTC;
2061 }
2062
2063 BestTripCount = std::max(BestTripCount, 1U);
2064 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2065
2066 // Let's ensure the cost is always at least 1.
2067 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2069
2070 if (BestTripCount > 1)
2072 << "We expect runtime memory checks to be hoisted "
2073 << "out of the outer loop. Cost reduced from "
2074 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2075
2076 MemCheckCost = NewMemCheckCost;
2077 }
2078 }
2079
2080 RTCheckCost += MemCheckCost;
2081 }
2082
2083 if (SCEVCheckBlock || MemCheckBlock)
2084 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2085 << "\n");
2086
2087 return RTCheckCost;
2088 }
2089
2090 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2091 /// unused.
2092 ~GeneratedRTChecks() {
2093 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2094 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2095 if (!SCEVCheckCond)
2096 SCEVCleaner.markResultUsed();
2097
2098 if (!MemRuntimeCheckCond)
2099 MemCheckCleaner.markResultUsed();
2100
2101 if (MemRuntimeCheckCond) {
2102 auto &SE = *MemCheckExp.getSE();
2103 // Memory runtime check generation creates compares that use expanded
2104 // values. Remove them before running the SCEVExpanderCleaners.
2105 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2106 if (MemCheckExp.isInsertedInstruction(&I))
2107 continue;
2108 SE.forgetValue(&I);
2109 I.eraseFromParent();
2110 }
2111 }
2112 MemCheckCleaner.cleanup();
2113 SCEVCleaner.cleanup();
2114
2115 if (SCEVCheckCond)
2116 SCEVCheckBlock->eraseFromParent();
2117 if (MemRuntimeCheckCond)
2118 MemCheckBlock->eraseFromParent();
2119 }
2120
2121 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2122 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2123 /// depending on the generated condition.
2124 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2125 BasicBlock *LoopVectorPreHeader,
2126 BasicBlock *LoopExitBlock) {
2127 if (!SCEVCheckCond)
2128 return nullptr;
2129
2130 Value *Cond = SCEVCheckCond;
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 SCEVCheckCond = nullptr;
2133 if (auto *C = dyn_cast<ConstantInt>(Cond))
2134 if (C->isZero())
2135 return nullptr;
2136
2137 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2138
2139 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2140 // Create new preheader for vector loop.
2141 if (OuterLoop)
2142 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2143
2144 SCEVCheckBlock->getTerminator()->eraseFromParent();
2145 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2146 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2147 SCEVCheckBlock);
2148
2149 DT->addNewBlock(SCEVCheckBlock, Pred);
2150 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2151
2152 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2153 if (AddBranchWeights)
2155 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2156 return SCEVCheckBlock;
2157 }
2158
2159 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2160 /// the branches to branch to the vector preheader or \p Bypass, depending on
2161 /// the generated condition.
2162 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2163 BasicBlock *LoopVectorPreHeader) {
2164 // Check if we generated code that checks at runtime whether arrays overlap.
2165 if (!MemRuntimeCheckCond)
2166 return nullptr;
2167
2168 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2169 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2170 MemCheckBlock);
2171
2172 DT->addNewBlock(MemCheckBlock, Pred);
2173 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2174 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2175
2176 if (OuterLoop)
2177 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2178
2179 BranchInst &BI =
2180 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2181 if (AddBranchWeights) {
2183 }
2184 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2185 MemCheckBlock->getTerminator()->setDebugLoc(
2186 Pred->getTerminator()->getDebugLoc());
2187
2188 // Mark the check as used, to prevent it from being removed during cleanup.
2189 MemRuntimeCheckCond = nullptr;
2190 return MemCheckBlock;
2191 }
2192};
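// A minimal usage sketch (illustrative only; argument names are hypothetical
// and simplified from how the vectorizer actually drives this class): the
// checks are created up-front so their cost can be folded into the
// vectorization decision, and they are only emitted if we commit to
// vectorizing.
//
//   GeneratedRTChecks Checks(/*SE, DT, LI, TTI, DL, AddBranchWeights*/ ...);
//   Checks.Create(L, *LAI, UnionPred, VF, IC);      // build checks up-front
//   InstructionCost RTCost = Checks.getCost();      // feed into profitability
//   // Only if vectorization is still profitable with RTCost included:
//   Checks.emitSCEVChecks(Bypass, VectorPH, ExitBB);
//   Checks.emitMemRuntimeChecks(Bypass, VectorPH);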
2193} // namespace
2194
2196 return Style == TailFoldingStyle::Data ||
2197 Style == TailFoldingStyle::DataAndControlFlow ||
2198 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2199}
2200
2202 return Style == TailFoldingStyle::DataAndControlFlow ||
2203 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2204}
2205
2206// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2207// vectorization. The loop needs to be annotated with #pragma omp simd
2208// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2209// vector length information is not provided, vectorization is not considered
2210// explicit. Interleave hints are not allowed either. These limitations will be
2211// relaxed in the future.
2212// Please note that we are currently forced to abuse the pragma 'clang
2213// vectorize' semantics. This pragma provides *auto-vectorization hints*
2214// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2215// provides *explicit vectorization hints* (LV can bypass legal checks and
2216// assume that vectorization is legal). However, both hints are implemented
2217// using the same metadata (llvm.loop.vectorize, processed by
2218// LoopVectorizeHints). This will be fixed in the future when the native IR
2219// representation for pragma 'omp simd' is introduced.
2220static bool isExplicitVecOuterLoop(Loop *OuterLp,
2222 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2223 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2224
2225 // Only outer loops with an explicit vectorization hint are supported.
2226 // Unannotated outer loops are ignored.
2228 return false;
2229
2230 Function *Fn = OuterLp->getHeader()->getParent();
2231 if (!Hints.allowVectorization(Fn, OuterLp,
2232 true /*VectorizeOnlyWhenForced*/)) {
2233 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2234 return false;
2235 }
2236
2237 if (Hints.getInterleave() > 1) {
2238 // TODO: Interleave support is future work.
2239 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2240 "outer loops.\n");
2241 Hints.emitRemarkWithHints();
2242 return false;
2243 }
2244
2245 return true;
2246}
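// For illustration, an outer loop annotated for explicit vectorization as
// described above might look like the following (hypothetical source):
//
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
//
// Without the vector-length information (simdlen / vectorize_width), the outer
// loop would not be treated as explicitly vectorized by the check above.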
2247
2251 // Collect inner loops and outer loops without irreducible control flow. For
2252 // now, only collect outer loops that have explicit vectorization hints. If we
2253 // are stress testing the VPlan H-CFG construction, we collect the outermost
2254 // loop of every loop nest.
2255 if (L.isInnermost() || VPlanBuildStressTest ||
2257 LoopBlocksRPO RPOT(&L);
2258 RPOT.perform(LI);
2259 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2260 V.push_back(&L);
2261 // TODO: Collect inner loops inside marked outer loops in case
2262 // vectorization fails for the outer loop. Do not invoke
2263 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2264 // already known to be reducible. We can use an inherited attribute for
2265 // that.
2266 return;
2267 }
2268 }
2269 for (Loop *InnerL : L)
2270 collectSupportedLoops(*InnerL, LI, ORE, V);
2271}
2272
2273//===----------------------------------------------------------------------===//
2274// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2275// LoopVectorizationCostModel and LoopVectorizationPlanner.
2276//===----------------------------------------------------------------------===//
2277
2278/// Compute the transformed value of Index at offset StartValue using step
2279/// StepValue.
2280/// For integer induction, returns StartValue + Index * StepValue.
2281/// For pointer induction, returns StartValue[Index * StepValue].
2282/// FIXME: The newly created binary instructions should contain nsw/nuw
2283/// flags, which can be found from the original scalar operations.
2284static Value *
2286 Value *Step,
2288 const BinaryOperator *InductionBinOp) {
2289 Type *StepTy = Step->getType();
2290 Value *CastedIndex = StepTy->isIntegerTy()
2291 ? B.CreateSExtOrTrunc(Index, StepTy)
2292 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2293 if (CastedIndex != Index) {
2294 CastedIndex->setName(CastedIndex->getName() + ".cast");
2295 Index = CastedIndex;
2296 }
2297
2298 // Note: the IR at this point is broken. We cannot use SE to create any new
2299 // SCEV and then expand it, hoping that SCEV's simplification will give us
2300 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2301 // lead to various SCEV crashes. So all we can do is use the builder and rely
2302 // on InstCombine for future simplifications. Here we handle some trivial
2303 // cases only.
2304 auto CreateAdd = [&B](Value *X, Value *Y) {
2305 assert(X->getType() == Y->getType() && "Types don't match!");
2306 if (auto *CX = dyn_cast<ConstantInt>(X))
2307 if (CX->isZero())
2308 return Y;
2309 if (auto *CY = dyn_cast<ConstantInt>(Y))
2310 if (CY->isZero())
2311 return X;
2312 return B.CreateAdd(X, Y);
2313 };
2314
2315 // We allow X to be a vector type, in which case Y will potentially be
2316 // splatted into a vector with the same element count.
2317 auto CreateMul = [&B](Value *X, Value *Y) {
2318 assert(X->getType()->getScalarType() == Y->getType() &&
2319 "Types don't match!");
2320 if (auto *CX = dyn_cast<ConstantInt>(X))
2321 if (CX->isOne())
2322 return Y;
2323 if (auto *CY = dyn_cast<ConstantInt>(Y))
2324 if (CY->isOne())
2325 return X;
2326 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2327 if (XVTy && !isa<VectorType>(Y->getType()))
2328 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2329 return B.CreateMul(X, Y);
2330 };
2331
2332 switch (InductionKind) {
2334 assert(!isa<VectorType>(Index->getType()) &&
2335 "Vector indices not supported for integer inductions yet");
2336 assert(Index->getType() == StartValue->getType() &&
2337 "Index type does not match StartValue type");
2338 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2339 return B.CreateSub(StartValue, Index);
2340 auto *Offset = CreateMul(Index, Step);
2341 return CreateAdd(StartValue, Offset);
2342 }
2344 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2346 assert(!isa<VectorType>(Index->getType()) &&
2347 "Vector indices not supported for FP inductions yet");
2348 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2349 assert(InductionBinOp &&
2350 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2351 InductionBinOp->getOpcode() == Instruction::FSub) &&
2352 "Original bin op should be defined for FP induction");
2353
2354 Value *MulExp = B.CreateFMul(Step, Index);
2355 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2356 "induction");
2357 }
2359 return nullptr;
2360 }
2361 llvm_unreachable("invalid enum");
2362}
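// A worked example of the transform above (illustrative values only): for an
// integer induction with StartValue = 10 and Step = 3, the transformed value at
// Index = 4 is 10 + 4 * 3 = 22. For a pointer induction with the same Step, the
// result is the address StartValue[4 * 3], i.e. StartValue advanced by 12
// elements.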
2363
2364std::optional<unsigned> getMaxVScale(const Function &F,
2365 const TargetTransformInfo &TTI) {
2366 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2367 return MaxVScale;
2368
2369 if (F.hasFnAttribute(Attribute::VScaleRange))
2370 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2371
2372 return std::nullopt;
2373}
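// For example (illustrative): for a function carrying the attribute
// vscale_range(1,16) on a target that does not report a maximum vscale via TTI,
// the helper above returns 16, taken from the attribute's upper bound.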
2374
2375/// For the given VF, UF, and maximum trip count computed for the loop, return
2376/// whether the induction variable might overflow in the vectorized loop. If not,
2377/// then we know a runtime overflow check always evaluates to false and can be
2378/// removed.
2381 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2382 // Always be conservative if we don't know the exact unroll factor.
2383 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2384
2385 Type *IdxTy = Cost->Legal->getWidestInductionType();
2386 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2387
2388 // The runtime overflow check is known to be false iff the (max) trip-count
2389 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2390 // the vector loop induction variable.
2391 if (unsigned TC =
2392 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2393 uint64_t MaxVF = VF.getKnownMinValue();
2394 if (VF.isScalable()) {
2395 std::optional<unsigned> MaxVScale =
2396 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2397 if (!MaxVScale)
2398 return false;
2399 MaxVF *= *MaxVScale;
2400 }
2401
2402 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2403 }
2404
2405 return false;
2406}
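// A worked example of the check above (illustrative values only): with an i8
// widest induction type, MaxUIntTripCount is 255. If the maximum trip count is
// known to be 200 and VF * UF = 8, then 255 - 200 = 55 > 8, so the induction
// variable cannot overflow and the runtime overflow check is known to be false.
// With a maximum trip count of 250, 255 - 250 = 5 is not greater than 8, so the
// check must be kept.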
2407
2408// Return whether we allow using masked interleave-groups (for dealing with
2409// strided loads/stores that reside in predicated blocks, or for dealing
2410// with gaps).
2412 // If an override option has been passed in for interleaved accesses, use it.
2413 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2415
2417}
2418
2419// Try to vectorize the interleave group that \p Instr belongs to.
2420//
2421// E.g. Translate following interleaved load group (factor = 3):
2422// for (i = 0; i < N; i+=3) {
2423// R = Pic[i]; // Member of index 0
2424// G = Pic[i+1]; // Member of index 1
2425// B = Pic[i+2]; // Member of index 2
2426// ... // do something to R, G, B
2427// }
2428// To:
2429// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2430// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2431// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2432// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2433//
2434// Or translate following interleaved store group (factor = 3):
2435// for (i = 0; i < N; i+=3) {
2436// ... do something to R, G, B
2437// Pic[i] = R; // Member of index 0
2438// Pic[i+1] = G; // Member of index 1
2439// Pic[i+2] = B; // Member of index 2
2440// }
2441// To:
2442// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2443// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2444// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2445// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2446// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2449 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2450 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2451 Instruction *Instr = Group->getInsertPos();
2452 const DataLayout &DL = Instr->getModule()->getDataLayout();
2453
2454 // Prepare for the vector type of the interleaved load/store.
2455 Type *ScalarTy = getLoadStoreType(Instr);
2456 unsigned InterleaveFactor = Group->getFactor();
2457 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2458
2459 // Prepare for the new pointers.
2460 SmallVector<Value *, 2> AddrParts;
2461 unsigned Index = Group->getIndex(Instr);
2462
2463 // TODO: extend the masked interleaved-group support to reversed access.
2464 assert((!BlockInMask || !Group->isReverse()) &&
2465 "Reversed masked interleave-group not supported.");
2466
2467 Value *Idx;
2468 // If the group is reverse, adjust the index to refer to the last vector lane
2469 // instead of the first. We adjust the index from the first vector lane,
2470 // rather than directly getting the pointer for lane VF - 1, because the
2471 // pointer operand of the interleaved access is supposed to be uniform. For
2472 // uniform instructions, we're only required to generate a value for the
2473 // first vector lane in each unroll iteration.
2474 if (Group->isReverse()) {
2475 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2476 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2480 } else
2482
2483 for (unsigned Part = 0; Part < UF; Part++) {
2484 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2485 if (auto *I = dyn_cast<Instruction>(AddrPart))
2486 State.setDebugLocFrom(I->getDebugLoc());
2487
2488 // Note that the current instruction could be at any member index. We need to
2489 // adjust the address to the member of index 0.
2490 //
2491 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2492 // b = A[i]; // Member of index 0
2493 // The current pointer points to A[i+1]; adjust it to A[i].
2494 //
2495 // E.g. A[i+1] = a; // Member of index 1
2496 // A[i] = b; // Member of index 0
2497 // A[i+2] = c; // Member of index 2 (Current instruction)
2498 // The current pointer points to A[i+2]; adjust it to A[i].
2499
2500 bool InBounds = false;
2501 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2502 InBounds = gep->isInBounds();
2503 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2504 AddrParts.push_back(AddrPart);
2505 }
2506
2507 State.setDebugLocFrom(Instr->getDebugLoc());
2508 Value *PoisonVec = PoisonValue::get(VecTy);
2509
2510 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2511 unsigned Part, Value *MaskForGaps) -> Value * {
2512 if (VF.isScalable()) {
2513 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2514 assert(InterleaveFactor == 2 &&
2515 "Unsupported deinterleave factor for scalable vectors");
2516 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2517 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2518 auto *MaskTy =
2520 return Builder.CreateIntrinsic(
2521 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2522 /*FMFSource=*/nullptr, "interleaved.mask");
2523 }
2524
2525 if (!BlockInMask)
2526 return MaskForGaps;
2527
2528 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2529 Value *ShuffledMask = Builder.CreateShuffleVector(
2530 BlockInMaskPart,
2531 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2532 "interleaved.mask");
2533 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2534 MaskForGaps)
2535 : ShuffledMask;
2536 };
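// For illustration (fixed-width case, hypothetical values): with an interleave
// factor of 3 and VF = 4, createReplicatedMask produces the shuffle mask
// <0,0,0, 1,1,1, 2,2,2, 3,3,3>, i.e. each lane of the block mask is repeated
// once per group member so the widened mask covers all 12 interleaved elements.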
2537
2538 // Vectorize the interleaved load group.
2539 if (isa<LoadInst>(Instr)) {
2540 Value *MaskForGaps = nullptr;
2541 if (NeedsMaskForGaps) {
2542 MaskForGaps =
2544 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2545 }
2546
2547 // For each unroll part, create a wide load for the group.
2548 SmallVector<Value *, 2> NewLoads;
2549 for (unsigned Part = 0; Part < UF; Part++) {
2550 Instruction *NewLoad;
2551 if (BlockInMask || MaskForGaps) {
2553 "masked interleaved groups are not allowed.");
2554 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2555 NewLoad =
2556 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2557 GroupMask, PoisonVec, "wide.masked.vec");
2558 }
2559 else
2560 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2561 Group->getAlign(), "wide.vec");
2562 Group->addMetadata(NewLoad);
2563 NewLoads.push_back(NewLoad);
2564 }
2565
2566 if (VecTy->isScalableTy()) {
2567 assert(InterleaveFactor == 2 &&
2568 "Unsupported deinterleave factor for scalable vectors");
2569
2570 for (unsigned Part = 0; Part < UF; ++Part) {
2571 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2572 // so we must use intrinsics to deinterleave.
2574 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2575 /*FMFSource=*/nullptr, "strided.vec");
2576 unsigned J = 0;
2577 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2578 Instruction *Member = Group->getMember(I);
2579
2580 if (!Member)
2581 continue;
2582
2583 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2584 // If this member has a different type, cast the result type.
2585 if (Member->getType() != ScalarTy) {
2586 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2587 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2588 }
2589
2590 if (Group->isReverse())
2591 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2592
2593 State.set(VPDefs[J], StridedVec, Part);
2594 ++J;
2595 }
2596 }
2597
2598 return;
2599 }
2600
2601 // For each member in the group, shuffle out the appropriate data from the
2602 // wide loads.
2603 unsigned J = 0;
2604 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2605 Instruction *Member = Group->getMember(I);
2606
2607 // Skip the gaps in the group.
2608 if (!Member)
2609 continue;
2610
2611 auto StrideMask =
2612 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2613 for (unsigned Part = 0; Part < UF; Part++) {
2614 Value *StridedVec = Builder.CreateShuffleVector(
2615 NewLoads[Part], StrideMask, "strided.vec");
2616
2617 // If this member has a different type, cast the result type.
2618 if (Member->getType() != ScalarTy) {
2619 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2620 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2621 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2622 }
2623
2624 if (Group->isReverse())
2625 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2626
2627 State.set(VPDefs[J], StridedVec, Part);
2628 }
2629 ++J;
2630 }
2631 return;
2632 }
2633
2634 // The sub vector type for the current instruction.
2635 auto *SubVT = VectorType::get(ScalarTy, VF);
2636
2637 // Vectorize the interleaved store group.
2638 Value *MaskForGaps =
2640 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2641 "masked interleaved groups are not allowed.");
2642 assert((!MaskForGaps || !VF.isScalable()) &&
2643 "masking gaps for scalable vectors is not yet supported.");
2644 for (unsigned Part = 0; Part < UF; Part++) {
2645 // Collect the stored vector from each member.
2646 SmallVector<Value *, 4> StoredVecs;
2647 unsigned StoredIdx = 0;
2648 for (unsigned i = 0; i < InterleaveFactor; i++) {
2649 assert((Group->getMember(i) || MaskForGaps) &&
2650 "Fail to get a member from an interleaved store group");
2651 Instruction *Member = Group->getMember(i);
2652
2653 // Skip the gaps in the group.
2654 if (!Member) {
2655 Value *Undef = PoisonValue::get(SubVT);
2656 StoredVecs.push_back(Undef);
2657 continue;
2658 }
2659
2660 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2661 ++StoredIdx;
2662
2663 if (Group->isReverse())
2664 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2665
2666 // If this member has a different type, cast it to a unified type.
2667
2668 if (StoredVec->getType() != SubVT)
2669 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2670
2671 StoredVecs.push_back(StoredVec);
2672 }
2673
2674 // Interleave all the smaller vectors into one wider vector.
2675 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2676 Instruction *NewStoreInstr;
2677 if (BlockInMask || MaskForGaps) {
2678 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2679 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2680 Group->getAlign(), GroupMask);
2681 } else
2682 NewStoreInstr =
2683 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2684
2685 Group->addMetadata(NewStoreInstr);
2686 }
2687}
2688
2690 VPReplicateRecipe *RepRecipe,
2691 const VPIteration &Instance,
2692 VPTransformState &State) {
2693 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2694
2695 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2696 // the first lane and part.
2697 if (isa<NoAliasScopeDeclInst>(Instr))
2698 if (!Instance.isFirstIteration())
2699 return;
2700
2701 // Does this instruction return a value?
2702 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2703
2704 Instruction *Cloned = Instr->clone();
2705 if (!IsVoidRetTy) {
2706 Cloned->setName(Instr->getName() + ".cloned");
2707#if !defined(NDEBUG)
2708 // Verify that VPlan type inference results agree with the type of the
2709 // generated values.
2710 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2711 "inferred type and type from generated instructions do not match");
2712#endif
2713 }
2714
2715 RepRecipe->setFlags(Cloned);
2716
2717 if (auto DL = Instr->getDebugLoc())
2718 State.setDebugLocFrom(DL);
2719
2720 // Replace the operands of the cloned instructions with their scalar
2721 // equivalents in the new loop.
2722 for (const auto &I : enumerate(RepRecipe->operands())) {
2723 auto InputInstance = Instance;
2724 VPValue *Operand = I.value();
2726 InputInstance.Lane = VPLane::getFirstLane();
2727 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2728 }
2729 State.addNewMetadata(Cloned, Instr);
2730
2731 // Place the cloned scalar in the new loop.
2732 State.Builder.Insert(Cloned);
2733
2734 State.set(RepRecipe, Cloned, Instance);
2735
2736 // If we just cloned a new assumption, add it to the assumption cache.
2737 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2739
2740 // End if-block.
2741 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2742 if (IfPredicateInstr)
2743 PredicatedInstructions.push_back(Cloned);
2744}
2745
2746Value *
2748 if (VectorTripCount)
2749 return VectorTripCount;
2750
2751 Value *TC = getTripCount();
2752 IRBuilder<> Builder(InsertBlock->getTerminator());
2753
2754 Type *Ty = TC->getType();
2755 // This is where we can make the step a runtime constant.
2756 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2757
2758 // If the tail is to be folded by masking, round the number of iterations N
2759 // up to a multiple of Step instead of rounding down. This is done by first
2760 // adding Step-1 and then rounding down. Note that it's ok if this addition
2761 // overflows: the vector induction variable will eventually wrap to zero given
2762 // that it starts at zero and its Step is a power of two; the loop will then
2763 // exit, with the last early-exit vector comparison also producing all-true.
2764 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2765 // is accounted for in emitIterationCountCheck that adds an overflow check.
2766 if (Cost->foldTailByMasking()) {
2768 "VF*UF must be a power of 2 when folding tail by masking");
2769 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2770 TC = Builder.CreateAdd(
2771 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2772 }
2773
2774 // Now we need to generate the expression for the part of the loop that the
2775 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2776 // iterations are not required for correctness, or N - Step, otherwise. Step
2777 // is equal to the vectorization factor (number of SIMD elements) times the
2778 // unroll factor (number of SIMD instructions).
2779 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2780
2781 // There are cases where we *must* run at least one iteration in the remainder
2782 // loop. See the cost model for when this can happen. If the step evenly
2783 // divides the trip count, we set the remainder to be equal to the step. If
2784 // the step does not evenly divide the trip count, no adjustment is necessary
2785 // since there will already be scalar iterations. Note that the minimum
2786 // iterations check ensures that N >= Step.
2787 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2788 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2789 R = Builder.CreateSelect(IsZero, Step, R);
2790 }
2791
2792 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2793
2794 return VectorTripCount;
2795}
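// A worked example of the computation above (illustrative values only): with a
// trip count N = 17, VF = 4 and UF = 2, Step is 8. Without tail folding,
// N % Step = 1 and the vector trip count is 17 - 1 = 16 (if a scalar epilogue
// is required and the remainder were 0, the remainder is bumped to Step so at
// least one scalar iteration runs). With tail folding, N is first rounded up to
// 24, giving a vector trip count of 24.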
2796
2798 const DataLayout &DL) {
2799 // Verify that V is a vector type with the same number of elements as DstVTy.
2800 auto *DstFVTy = cast<VectorType>(DstVTy);
2801 auto VF = DstFVTy->getElementCount();
2802 auto *SrcVecTy = cast<VectorType>(V->getType());
2803 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2804 Type *SrcElemTy = SrcVecTy->getElementType();
2805 Type *DstElemTy = DstFVTy->getElementType();
2806 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2807 "Vector elements must have same size");
2808
2809 // Do a direct cast if element types are castable.
2810 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2811 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2812 }
2813 // V cannot be directly cast to the desired vector type. This may happen when
2814 // V is a floating point vector but DstVTy is a vector of pointers, or
2815 // vice-versa. Handle this with a two-step bitcast through an intermediate
2816 // integer type, i.e. Ptr <-> Int <-> Float.
2817 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2818 "Only one type should be a pointer type");
2819 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2820 "Only one type should be a floating point type");
2821 Type *IntTy =
2822 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2823 auto *VecIntTy = VectorType::get(IntTy, VF);
2824 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2825 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2826}
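// For illustration (hypothetical types, 64-bit pointers): casting a
// <4 x double> value to <4 x ptr> cannot be done directly, so the helper above
// goes through <4 x i64>: first a bitcast from <4 x double> to <4 x i64>, then
// an int-to-pointer cast from <4 x i64> to <4 x ptr>.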
2827
2829 Value *Count = getTripCount();
2830 // Reuse the existing vector loop preheader for the TC checks.
2831 // Note that a new preheader block is generated for the vector loop.
2832 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2833 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2834
2835 // Generate code to check if the loop's trip count is less than VF * UF, or
2836 // equal to it in case a scalar epilogue is required; this implies that the
2837 // vector trip count is zero. This check also covers the case where adding one
2838 // to the backedge-taken count overflowed leading to an incorrect trip count
2839 // of zero. In this case we will also jump to the scalar loop.
2840 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2842
2843 // If tail is to be folded, vector loop takes care of all iterations.
2844 Type *CountTy = Count->getType();
2845 Value *CheckMinIters = Builder.getFalse();
2846 auto CreateStep = [&]() -> Value * {
2847 // Create step with max(MinProTripCount, UF * VF).
2849 return createStepForVF(Builder, CountTy, VF, UF);
2850
2851 Value *MinProfTC =
2853 if (!VF.isScalable())
2854 return MinProfTC;
2856 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2857 };
2858
2859 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2860 if (Style == TailFoldingStyle::None)
2861 CheckMinIters =
2862 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2863 else if (VF.isScalable() &&
2866 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2867 // an overflow to zero when updating induction variables and so an
2868 // additional overflow check is required before entering the vector loop.
2869
2870 // Get the maximum unsigned value for the type.
2871 Value *MaxUIntTripCount =
2872 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2873 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2874
2875 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2876 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2877 }
2878
2879 // Create new preheader for vector loop.
2881 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2882 "vector.ph");
2883
2884 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2885 DT->getNode(Bypass)->getIDom()) &&
2886 "TC check is expected to dominate Bypass");
2887
2888 // Update dominator for Bypass & LoopExit (if needed).
2889 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2890 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2891 // If there is an epilogue which must run, there's no edge from the
2892 // middle block to exit blocks and thus no need to update the immediate
2893 // dominator of the exit blocks.
2895
2896 BranchInst &BI =
2897 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2900 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2901 LoopBypassBlocks.push_back(TCCheckBlock);
2902}
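// A worked example of the minimum-iterations check above (illustrative values
// only): with VF = 4, UF = 2 and no tail folding, the vector loop is bypassed
// when the trip count is below 8, or equal to 8 if a scalar epilogue must run,
// since the epilogue needs at least one iteration. For a scalable VF with tail
// folding, the check instead guards against induction overflow: the vector loop
// is skipped when (UMax - n) < VF * UF.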
2903
2905 BasicBlock *const SCEVCheckBlock =
2906 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2907 if (!SCEVCheckBlock)
2908 return nullptr;
2909
2910 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2912 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2913 "Cannot SCEV check stride or overflow when optimizing for size");
2914
2915
2916 // Update dominator only if this is first RT check.
2917 if (LoopBypassBlocks.empty()) {
2918 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2919 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2920 // If there is an epilogue which must run, there's no edge from the
2921 // middle block to exit blocks and thus no need to update the immediate
2922 // dominator of the exit blocks.
2923 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2924 }
2925
2926 LoopBypassBlocks.push_back(SCEVCheckBlock);
2927 AddedSafetyChecks = true;
2928 return SCEVCheckBlock;
2929}
2930
2932 // VPlan-native path does not do any analysis for runtime checks currently.
2934 return nullptr;
2935
2936 BasicBlock *const MemCheckBlock =
2937 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2938
2939 // Check if we generated code that checks at runtime whether arrays overlap.
2940 // We put the checks into a separate block to make the more common case of few
2941 // elements faster.
2942 if (!MemCheckBlock)
2943 return nullptr;
2944
2945 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2946 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2947 "Cannot emit memory checks when optimizing for size, unless forced "
2948 "to vectorize.");
2949 ORE->emit([&]() {
2950 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2953 << "Code-size may be reduced by not forcing "
2954 "vectorization, or by source-code modifications "
2955 "eliminating the need for runtime checks "
2956 "(e.g., adding 'restrict').";
2957 });
2958 }
2959
2960 LoopBypassBlocks.push_back(MemCheckBlock);
2961
2962 AddedSafetyChecks = true;
2963
2964 return MemCheckBlock;
2965}
2966
2970 assert(LoopVectorPreHeader && "Invalid loop structure");
2971 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2972 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2973 "multiple exit loop without required epilogue?");
2974
2977 LI, nullptr, Twine(Prefix) + "middle.block");
2980 nullptr, Twine(Prefix) + "scalar.ph");
2981
2982 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2983
2984 // Set up the middle block terminator. Two cases:
2985 // 1) If we know that we must execute the scalar epilogue, emit an
2986 // unconditional branch.
2987 // 2) Otherwise, we must have a single unique exit block (due to how we
2988 // implement the multiple exit case). In this case, set up a conditional
2989 // branch from the middle block to the loop scalar preheader, and the
2990 // exit block. completeLoopSkeleton will update the condition to use an
2991 // iteration check, if required to decide whether to execute the remainder.
2992 BranchInst *BrInst =
2993 Cost->requiresScalarEpilogue(VF.isVector())
2996 Builder.getTrue());
2997 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2999
3000 // Update dominator for loop exit. During skeleton creation, only the vector
3001 // pre-header and the middle block are created. The vector loop is entirely
3002 // created during VPlan execution.
3003 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3004 // If there is an epilogue which must run, there's no edge from the
3005 // middle block to exit blocks and thus no need to update the immediate
3006 // dominator of the exit blocks.
3008}
3009
3011 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3012 ArrayRef<BasicBlock *> BypassBlocks,
3013 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3015 assert(VectorTripCount && "Expected valid arguments");
3016
3017 Instruction *OldInduction = Legal->getPrimaryInduction();
3018 Value *&EndValue = IVEndValues[OrigPhi];
3019 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3020 if (OrigPhi == OldInduction) {
3021 // We know what the end value is.
3022 EndValue = VectorTripCount;
3023 } else {
3025
3026 // Fast-math-flags propagate from the original induction instruction.
3027 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3028 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3029
3031 Step, II.getKind(), II.getInductionBinOp());
3032 EndValue->setName("ind.end");
3033
3034 // Compute the end value for the additional bypass (if applicable).
3035 if (AdditionalBypass.first) {
3036 B.SetInsertPoint(AdditionalBypass.first,
3037 AdditionalBypass.first->getFirstInsertionPt());
3038 EndValueFromAdditionalBypass =
3039 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3040 Step, II.getKind(), II.getInductionBinOp());
3041 EndValueFromAdditionalBypass->setName("ind.end");
3042 }
3043 }
3044
3045 // Create phi nodes to merge from the backedge-taken check block.
3046 PHINode *BCResumeVal =
3047 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3049 // Copy original phi DL over to the new one.
3050 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3051
3052 // The new PHI merges the original incoming value, in case of a bypass,
3053 // or the value at the end of the vectorized loop.
3054 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3055
3056 // Fix the scalar body counter (PHI node).
3057 // The old induction's phi node in the scalar body needs the truncated
3058 // value.
3059 for (BasicBlock *BB : BypassBlocks)
3060 BCResumeVal->addIncoming(II.getStartValue(), BB);
3061
3062 if (AdditionalBypass.first)
3063 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3064 EndValueFromAdditionalBypass);
3065 return BCResumeVal;
3066}
3067
3068/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3069/// expansion results.
3071 const SCEV2ValueTy &ExpandedSCEVs) {
3072 const SCEV *Step = ID.getStep();
3073 if (auto *C = dyn_cast<SCEVConstant>(Step))
3074 return C->getValue();
3075 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3076 return U->getValue();
3077 auto I = ExpandedSCEVs.find(Step);
3078 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3079 return I->second;
3080}
3081
3083 const SCEV2ValueTy &ExpandedSCEVs,
3084 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3085 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3086 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3087 "Inconsistent information about additional bypass.");
3088 // We are going to resume the execution of the scalar loop.
3089 // Go over all of the induction variables that we found and fix the
3090 // PHIs that are left in the scalar version of the loop.
3091 // The starting values of PHI nodes depend on the counter of the last
3092 // iteration in the vectorized loop.
3093 // If we come from a bypass edge then we need to start from the original
3094 // start value.
3095 for (const auto &InductionEntry : Legal->getInductionVars()) {
3096 PHINode *OrigPhi = InductionEntry.first;
3097 const InductionDescriptor &II = InductionEntry.second;
3098 PHINode *BCResumeVal = createInductionResumeValue(
3099 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3100 AdditionalBypass);
3101 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3102 }
3103}
3104
3106 // The trip counts should be cached by now.
3107 Value *Count = getTripCount();
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Add a check in the middle block to see if we have completed
3113 // all of the iterations in the first vector loop. Three cases:
3114 // 1) If we require a scalar epilogue, there is no conditional branch as
3115 // we unconditionally branch to the scalar preheader. Do nothing.
3116 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3117 // Thus if tail is to be folded, we know we don't need to run the
3118 // remainder and we can use the previous value for the condition (true).
3119 // 3) Otherwise, construct a runtime check.
3120 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3121 !Cost->foldTailByMasking()) {
3122 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3123 // of the corresponding compare because they may have ended up with
3124 // different line numbers and we want to avoid awkward line stepping while
3125 // debugging. E.g. if the compare has a line number inside the loop.
3126 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3127 // operands. Perform simplification directly on VPlan once the branch is
3128 // modeled there.
3130 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3131 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3132 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3133 BI.setCondition(CmpN);
3134 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3135 // Assume that `Count % VectorTripCount` is equally distributed.
3136 unsigned TripCount = UF * VF.getKnownMinValue();
3137 assert(TripCount > 0 && "trip count should not be zero");
3138 const uint32_t Weights[] = {1, TripCount - 1};
3139 setBranchWeights(BI, Weights);
3140 }
3141 }
3142
3143#ifdef EXPENSIVE_CHECKS
3144 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3145#endif
3146
3147 return LoopVectorPreHeader;
3148}
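// Shorthand of the emitted middle-block check (illustrative, for a fixed
// VF = 4 and UF = 2, so the branch weights above would be {1, 7}):
//   middle.block:
//     %cmp.n = icmp eq i64 %trip.count, %n.vec
//     br i1 %cmp.n, label %exit, label %scalar.ph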
3149
3150std::pair<BasicBlock *, Value *>
3152 const SCEV2ValueTy &ExpandedSCEVs) {
3153 /*
3154 In this function we generate a new loop. The new loop will contain
3155 the vectorized instructions while the old loop will continue to run the
3156 scalar remainder.
3157
3158 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3159 / | preheader are expanded here. Eventually all required SCEV
3160 / | expansion should happen here.
3161 / v
3162 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3163 | / |
3164 | / v
3165 || [ ] <-- vector pre header.
3166 |/ |
3167 | v
3168 | [ ] \
3169 | [ ]_| <-- vector loop (created during VPlan execution).
3170 | |
3171 | v
3172 \ -[ ] <--- middle-block.
3173 \/ |
3174 /\ v
3175 | ->[ ] <--- new preheader.
3176 | |
3177 (opt) v <-- edge from middle to exit iff epilogue is not required.
3178 | [ ] \
3179 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3180 \ |
3181 \ v
3182 >[ ] <-- exit block(s).
3183 ...
3184 */
3185
3186 // Create an empty vector loop, and prepare basic blocks for the runtime
3187 // checks.
3189
3190 // Now, compare the new count to zero. If it is zero skip the vector loop and
3191 // jump to the scalar loop. This check also covers the case where the
3192 // backedge-taken count is uint##_max: adding one to it will overflow leading
3193 // to an incorrect trip count of zero. In this (rare) case we will also jump
3194 // to the scalar loop.
3196
3197 // Generate the code to check any assumptions that we've made for SCEV
3198 // expressions.
3200
3201 // Generate the code that checks in runtime if arrays overlap. We put the
3202 // checks into a separate block to make the more common case of few elements
3203 // faster.
3205
3206 // Emit phis for the new starting index of the scalar loop.
3207 createInductionResumeValues(ExpandedSCEVs);
3208
3209 return {completeLoopSkeleton(), nullptr};
3210}
3211
3212// Fix up external users of the induction variable. At this point, we are
3213// in LCSSA form, with all external PHIs that use the IV having one input value,
3214// coming from the remainder loop. We need those PHIs to also have a correct
3215// value for the IV when arriving directly from the middle block.
3217 const InductionDescriptor &II,
3218 Value *VectorTripCount, Value *EndValue,
3219 BasicBlock *MiddleBlock,
3220 BasicBlock *VectorHeader, VPlan &Plan,
3221 VPTransformState &State) {
3222 // There are two kinds of external IV usages - those that use the value
3223 // computed in the last iteration (the PHI) and those that use the penultimate
3224 // value (the value that feeds into the phi from the loop latch).
3225 // We allow both, but they, obviously, have different values.
3226
3227 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3228
3229 DenseMap<Value *, Value *> MissingVals;
3230
3231 // An external user of the last iteration's value should see the value that
3232 // the remainder loop uses to initialize its own IV.
3234 for (User *U : PostInc->users()) {
3235 Instruction *UI = cast<Instruction>(U);
3236 if (!OrigLoop->contains(UI)) {
3237 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3238 MissingVals[UI] = EndValue;
3239 }
3240 }
3241
3242 // An external user of the penultimate value needs to see EndValue - Step.
3243 // The simplest way to get this is to recompute it from the constituent SCEVs,
3244 // that is Start + (Step * (CRD - 1)).
3245 for (User *U : OrigPhi->users()) {
3246 auto *UI = cast<Instruction>(U);
3247 if (!OrigLoop->contains(UI)) {
3248 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3249 IRBuilder<> B(MiddleBlock->getTerminator());
3250
3251 // Fast-math-flags propagate from the original induction instruction.
3252 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3253 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3254
3255 Value *CountMinusOne = B.CreateSub(
3256 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3257 CountMinusOne->setName("cmo");
3258
3259 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3260 assert(StepVPV && "step must have been expanded during VPlan execution");
3261 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3262 : State.get(StepVPV, {0, 0});
3263 Value *Escape =
3264 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3265 II.getKind(), II.getInductionBinOp());
3266 Escape->setName("ind.escape");
3267 MissingVals[UI] = Escape;
3268 }
3269 }
3270
3271 for (auto &I : MissingVals) {
3272 PHINode *PHI = cast<PHINode>(I.first);
3273 // One corner case we have to handle is two IVs "chasing" each other,
3274 // that is %IV2 = phi [...], [ %IV1, %latch ]
3275 // In this case, if IV1 has an external use, we need to avoid adding both
3276 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3277 // don't already have an incoming value for the middle block.
3278 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3279 PHI->addIncoming(I.second, MiddleBlock);
3280 Plan.removeLiveOut(PHI);
3281 }
3282 }
3283}
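// Worked example (illustrative, integer IV with start 0 and step 1): with a
// vector trip count %n.vec, an external user of the post-increment value gets
// %ind.end (equal to %n.vec here), while an external user of the phi itself
// gets
//   %cmo        = sub i64 %n.vec, 1
//   %ind.escape = 0 + 1 * %cmo
// both added as extra incoming values from the middle block.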
3284
3285namespace {
3286
3287struct CSEDenseMapInfo {
3288 static bool canHandle(const Instruction *I) {
3289 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3290 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3291 }
3292
3293 static inline Instruction *getEmptyKey() {
3295 }
3296
3297 static inline Instruction *getTombstoneKey() {
3299 }
3300
3301 static unsigned getHashValue(const Instruction *I) {
3302 assert(canHandle(I) && "Unknown instruction!");
3303 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3304 I->value_op_end()));
3305 }
3306
3307 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3308 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3309 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3310 return LHS == RHS;
3311 return LHS->isIdenticalTo(RHS);
3312 }
3313};
3314
3315} // end anonymous namespace
3316
3317 /// Perform CSE of induction variable instructions.
3318 static void cse(BasicBlock *BB) {
3319   // Perform simple CSE.
3321 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3322 if (!CSEDenseMapInfo::canHandle(&In))
3323 continue;
3324
3325 // Check if we can replace this instruction with any of the
3326 // visited instructions.
3327 if (Instruction *V = CSEMap.lookup(&In)) {
3328 In.replaceAllUsesWith(V);
3329 In.eraseFromParent();
3330 continue;
3331 }
3332
3333 CSEMap[&In] = &In;
3334 }
3335}
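// For instance (illustrative), two identical extracts created during widening,
//   %e1 = extractelement <4 x i32> %v, i32 0
//   %e2 = extractelement <4 x i32> %v, i32 0
// hash to the same key, so %e2 is replaced by %e1 and erased.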
3336
3339 ElementCount VF) const {
3340 // We only need to calculate a cost if the VF is scalar; for actual vectors
3341 // we should already have a pre-calculated cost at each VF.
3342 if (!VF.isScalar())
3343 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3344
3346 Type *RetTy = CI->getType();
3348 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3349 return *RedCost;
3350
3352 for (auto &ArgOp : CI->args())
3353 Tys.push_back(ArgOp->getType());
3354
3355 InstructionCost ScalarCallCost =
3357
3358 // If this is an intrinsic we may have a lower cost for it.
3360 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3361 return std::min(ScalarCallCost, IntrinsicCost);
3362 }
3363 return ScalarCallCost;
3364}
3365
3367 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3368 return Elt;
3369 return VectorType::get(Elt, VF);
3370}
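// E.g. (illustrative): i32 with VF = 4 becomes <4 x i32> (or
// <vscale x 4 x i32> for a scalable VF), while aggregate types and the scalar
// VF are returned unchanged.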
3371
3374 ElementCount VF) const {
3376 assert(ID && "Expected intrinsic call!");
3377 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3378 FastMathFlags FMF;
3379 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3380 FMF = FPMO->getFastMathFlags();
3381
3384 SmallVector<Type *> ParamTys;
3385 std::transform(FTy->param_begin(), FTy->param_end(),
3386 std::back_inserter(ParamTys),
3387 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3388
3389 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3390 dyn_cast<IntrinsicInst>(CI));
3391 return TTI.getIntrinsicInstrCost(CostAttrs,
3393}
3394
3396 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3397 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3398 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3399}
3400
3402 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3403 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3404 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3405}
3406
3408 VPlan &Plan) {
3409 // Fix widened non-induction PHIs by setting up the PHI operands.
3411 fixNonInductionPHIs(Plan, State);
3412
3413 // At this point every instruction in the original loop is widened to a
3414 // vector form. Now we need to fix the recurrences in the loop. These PHI
3415 // nodes are currently empty because we did not want to introduce cycles.
3416 // This is the second stage of vectorizing recurrences. Note that fixing
3417 // reduction phis is already modeled in VPlan.
3418 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3419 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3420 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3421 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3422 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3423 fixFixedOrderRecurrence(FOR, State);
3424 }
3425
3426 // Forget the original basic block.
3429
3430 // After vectorization, the exit blocks of the original loop will have
3431 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3432 // looked through single-entry phis.
3433 SmallVector<BasicBlock *> ExitBlocks;
3434 OrigLoop->getExitBlocks(ExitBlocks);
3435 for (BasicBlock *Exit : ExitBlocks)
3436 for (PHINode &PN : Exit->phis())
3438
3439 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3440 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3441 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3442 // No edge from the middle block to the unique exit block has been inserted
3443 // and there is nothing to fix from the vector loop; phis should have
3444 // incoming values from the scalar loop only.
3445 } else {
3446 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3447 // the cost model.
3448
3449 // If we inserted an edge from the middle block to the unique exit block,
3450 // update uses outside the loop (phis) to account for the newly inserted
3451 // edge.
3452
3453 // Fix-up external users of the induction variables.
3454 for (const auto &Entry : Legal->getInductionVars())
3455 fixupIVUsers(Entry.first, Entry.second,
3457 IVEndValues[Entry.first], LoopMiddleBlock,
3458 VectorLoop->getHeader(), Plan, State);
3459 }
3460
3461 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3462 // in the exit block, so update the builder.
3463 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3464 State.CFG.ExitBB->getFirstNonPHIIt());
3465 for (const auto &KV : Plan.getLiveOuts())
3466 KV.second->fixPhi(Plan, State);
3467
3469 sinkScalarOperands(&*PI);
3470
3471 // Remove redundant induction instructions.
3472 cse(VectorLoop->getHeader());
3473
3474 // Set/update profile weights for the vector and remainder loops as original
3475 // loop iterations are now distributed among them. Note that original loop
3476 // represented by LoopScalarBody becomes remainder loop after vectorization.
3477 //
3478 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3479 // end up with a slightly less precise result, but that should be OK since
3480 // the profile is not inherently precise anyway. Note also that a possible
3481 // bypass of the vector code caused by legality checks is ignored,
3482 // optimistically assigning all the weight to the vector loop.
3483 //
3484 // For scalable vectorization we can't know at compile time how many iterations
3485 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3486 // vscale of '1'.
3489 VF.getKnownMinValue() * UF);
3490}
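// For instance (illustrative numbers): if profile data implies the original
// loop ran ~1000 iterations and VF * UF = 8, the vector loop is assigned an
// estimated trip count of ~125, with the remaining iterations attributed to
// the scalar remainder loop, accepting the small imprecision noted above.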
3491
3494 // This is the second phase of vectorizing first-order recurrences. An
3495 // overview of the transformation is described below. Suppose we have the
3496 // following loop.
3497 //
3498 // for (int i = 0; i < n; ++i)
3499 // b[i] = a[i] - a[i - 1];
3500 //
3501 // There is a first-order recurrence on "a". For this loop, the shorthand
3502 // scalar IR looks like:
3503 //
3504 // scalar.ph:
3505 // s_init = a[-1]
3506 // br scalar.body
3507 //
3508 // scalar.body:
3509 // i = phi [0, scalar.ph], [i+1, scalar.body]
3510 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3511 // s2 = a[i]
3512 // b[i] = s2 - s1
3513 // br cond, scalar.body, ...
3514 //
3515 // In this example, s1 is a recurrence because its value depends on the
3516 // previous iteration. In the first phase of vectorization, we created a
3517 // vector phi v1 for s1. We now complete the vectorization and produce the
3518 // shorthand vector IR shown below (for VF = 4, UF = 1).
3519 //
3520 // vector.ph:
3521 // v_init = vector(..., ..., ..., a[-1])
3522 // br vector.body
3523 //
3524 // vector.body
3525 // i = phi [0, vector.ph], [i+4, vector.body]
3526 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3527 // v2 = a[i, i+1, i+2, i+3];
3528 // v3 = vector(v1(3), v2(0, 1, 2))
3529 // b[i, i+1, i+2, i+3] = v2 - v3
3530 // br cond, vector.body, middle.block
3531 //
3532 // middle.block:
3533 // x = v2(3)
3534 // br scalar.ph
3535 //
3536 // scalar.ph:
3537 // s_init = phi [x, middle.block], [a[-1], otherwise]
3538 // br scalar.body
3539 //
3540 // After the vector loop completes execution, we extract the next value of
3541 // the recurrence (x) to use as the initial value in the scalar loop.
3542
3543 // Extract the last vector element in the middle block. This will be the
3544 // initial value for the recurrence when jumping to the scalar loop.
3545 VPValue *PreviousDef = PhiR->getBackedgeValue();
3546 Value *Incoming = State.get(PreviousDef, UF - 1);
3547 auto *ExtractForScalar = Incoming;
3548 auto *IdxTy = Builder.getInt32Ty();
3549 Value *RuntimeVF = nullptr;
3550 if (VF.isVector()) {
3551 auto *One = ConstantInt::get(IdxTy, 1);
3553 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3554 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3555 ExtractForScalar =
3556 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3557 }
3558
3559 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3560 assert(PhiR->getNumUsers() == 1 &&
3561 RecurSplice->getOpcode() ==
3563 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3564 SmallVector<VPLiveOut *> LiveOuts;
3565 for (VPUser *U : RecurSplice->users())
3566 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3567 LiveOuts.push_back(LiveOut);
3568
3569 if (!LiveOuts.empty()) {
3570 // Extract the second last element in the middle block if the
3571 // Phi is used outside the loop. We need to extract the phi itself
3572 // and not the last element (the phi update in the current iteration). This
3573 // will be the value when jumping to the exit block from the
3574 // LoopMiddleBlock, when the scalar loop is not run at all.
3575 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3576 if (VF.isVector()) {
3577 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3578 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3579 Incoming, Idx, "vector.recur.extract.for.phi");
3580 } else {
3581 assert(UF > 1 && "VF and UF cannot both be 1");
3582 // When the loop is unrolled without vectorizing, initialize
3583 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3584 // `Incoming`. This is analogous to the vectorized case above: extracting
3585 // the second last element when VF > 1.
3586 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3587 }
3588
3589 for (VPLiveOut *LiveOut : LiveOuts) {
3590 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3591 PHINode *LCSSAPhi = LiveOut->getPhi();
3592 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3593 State.Plan->removeLiveOut(LCSSAPhi);
3594 }
3595 }
3596
3597 // Fix the initial value of the original recurrence in the scalar loop.
3599 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3600 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3601 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3602 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3603 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3604 Start->addIncoming(Incoming, BB);
3605 }
3606
3607 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3608 Phi->setName("scalar.recur");
3609}
3610
3612 // The basic block and loop containing the predicated instruction.
3613 auto *PredBB = PredInst->getParent();
3614 auto *VectorLoop = LI->getLoopFor(PredBB);
3615
3616 // Initialize a worklist with the operands of the predicated instruction.
3617 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3618
3619 // Holds instructions that we need to analyze again. An instruction may be
3620 // reanalyzed if we don't yet know if we can sink it or not.
3621 SmallVector<Instruction *, 8> InstsToReanalyze;
3622
3623 // Returns true if a given use occurs in the predicated block. Phi nodes use
3624 // their operands in their corresponding predecessor blocks.
3625 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3626 auto *I = cast<Instruction>(U.getUser());
3627 BasicBlock *BB = I->getParent();
3628 if (auto *Phi = dyn_cast<PHINode>(I))
3629 BB = Phi->getIncomingBlock(
3630 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3631 return BB == PredBB;
3632 };
3633
3634 // Iteratively sink the scalarized operands of the predicated instruction
3635 // into the block we created for it. When an instruction is sunk, its
3636 // operands are then added to the worklist. The algorithm ends when a pass
3637 // through the worklist does not sink a single instruction.
3638 bool Changed;
3639 do {
3640 // Add the instructions that need to be reanalyzed to the worklist, and
3641 // reset the changed indicator.
3642 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3643 InstsToReanalyze.clear();
3644 Changed = false;
3645
3646 while (!Worklist.empty()) {
3647 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3648
3649 // We can't sink an instruction if it is a phi node, is not in the loop,
3650 // may have side effects or may read from memory.
3651 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3652 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3653 I->mayHaveSideEffects() || I->mayReadFromMemory())
3654 continue;
3655
3656 // If the instruction is already in PredBB, check if we can sink its
3657 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3658 // sinking the scalar instruction I, hence it appears in PredBB; but it
3659 // may have failed to sink I's operands (recursively), which we try
3660 // (again) here.
3661 if (I->getParent() == PredBB) {
3662 Worklist.insert(I->op_begin(), I->op_end());
3663 continue;
3664 }
3665
3666 // It's legal to sink the instruction if all its uses occur in the
3667 // predicated block. Otherwise, there's nothing to do yet, and we may
3668 // need to reanalyze the instruction.
3669 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3670 InstsToReanalyze.push_back(I);
3671 continue;
3672 }
3673
3674 // Move the instruction to the beginning of the predicated block, and add
3675 // its operands to the worklist.
3676 I->moveBefore(&*PredBB->getFirstInsertionPt());
3677 Worklist.insert(I->op_begin(), I->op_end());
3678
3679 // The sinking may have enabled other instructions to be sunk, so we will
3680 // need to iterate.
3681 Changed = true;
3682 }
3683 } while (Changed);
3684}
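// Sketch of the effect (illustrative): if the predicated block holds a
// scalarized store whose address computation
//   %gep = getelementptr i32, ptr %base, i64 %idx
// still sits in the vector body and is used only inside the predicated block,
// the loop above moves %gep there and then reconsiders the instruction
// defining %idx on the next pass.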
3685
3687 VPTransformState &State) {
3688 auto Iter = vp_depth_first_deep(Plan.getEntry());
3689 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3690 for (VPRecipeBase &P : VPBB->phis()) {
3691 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3692 if (!VPPhi)
3693 continue;
3694 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3695 // Make sure the builder has a valid insert point.
3696 Builder.SetInsertPoint(NewPhi);
3697 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3698 VPValue *Inc = VPPhi->getIncomingValue(i);
3699 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3700 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3701 }
3702 }
3703 }
3704}
3705
3706void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3707 // We should not collect Scalars more than once per VF. Right now, this
3708 // function is called from collectUniformsAndScalars(), which already does
3709 // this check. Collecting Scalars for VF=1 does not make any sense.
3710 assert(VF.isVector() && !Scalars.contains(VF) &&
3711 "This function should not be visited twice for the same VF");
3712
3713 // This avoids any chances of creating a REPLICATE recipe during planning
3714 // since that would result in generation of scalarized code during execution,
3715 // which is not supported for scalable vectors.
3716 if (VF.isScalable()) {
3717 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3718 return;
3719 }
3720
3722
3723 // These sets are used to seed the analysis with pointers used by memory
3724 // accesses that will remain scalar.
3726 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3727 auto *Latch = TheLoop->getLoopLatch();
3728
3729 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3730 // The pointer operands of loads and stores will be scalar as long as the
3731 // memory access is not a gather or scatter operation. The value operand of a
3732 // store will remain scalar if the store is scalarized.
3733 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3734 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3735 assert(WideningDecision != CM_Unknown &&
3736 "Widening decision should be ready at this moment");
3737 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3738 if (Ptr == Store->getValueOperand())
3739 return WideningDecision == CM_Scalarize;
3740 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3741 "Ptr is neither a value or pointer operand");
3742 return WideningDecision != CM_GatherScatter;
3743 };
3744
3745 // A helper that returns true if the given value is a bitcast or
3746 // getelementptr instruction contained in the loop.
3747 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3748 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3749 isa<GetElementPtrInst>(V)) &&
3751 };
3752
3753 // A helper that evaluates a memory access's use of a pointer. If the use will
3754 // be a scalar use and the pointer is only used by memory accesses, we place
3755 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3756 // PossibleNonScalarPtrs.
3757 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3758 // We only care about bitcast and getelementptr instructions contained in
3759 // the loop.
3760 if (!isLoopVaryingBitCastOrGEP(Ptr))
3761 return;
3762
3763 // If the pointer has already been identified as scalar (e.g., if it was
3764 // also identified as uniform), there's nothing to do.
3765 auto *I = cast<Instruction>(Ptr);
3766 if (Worklist.count(I))
3767 return;
3768
3769 // If the use of the pointer will be a scalar use, and all users of the
3770 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3771 // place the pointer in PossibleNonScalarPtrs.
3772 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3773 return isa<LoadInst>(U) || isa<StoreInst>(U);
3774 }))
3775 ScalarPtrs.insert(I);
3776 else
3777 PossibleNonScalarPtrs.insert(I);
3778 };
3779
3780 // We seed the scalars analysis with two classes of instructions: (1)
3781 // instructions marked uniform-after-vectorization and (2) bitcast,
3782 // getelementptr and (pointer) phi instructions used by memory accesses
3783 // requiring a scalar use.
3784 //
3785 // (1) Add to the worklist all instructions that have been identified as
3786 // uniform-after-vectorization.
3787 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3788
3789 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3790 // memory accesses requiring a scalar use. The pointer operands of loads and
3791 // stores will be scalar as long as the memory access is not a gather or
3792 // scatter operation. The value operand of a store will remain scalar if the
3793 // store is scalarized.
3794 for (auto *BB : TheLoop->blocks())
3795 for (auto &I : *BB) {
3796 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3797 evaluatePtrUse(Load, Load->getPointerOperand());
3798 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3799 evaluatePtrUse(Store, Store->getPointerOperand());
3800 evaluatePtrUse(Store, Store->getValueOperand());
3801 }
3802 }
3803 for (auto *I : ScalarPtrs)
3804 if (!PossibleNonScalarPtrs.count(I)) {
3805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3806 Worklist.insert(I);
3807 }
3808
3809 // Insert the forced scalars.
3810 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3811 // induction variable when the PHI user is scalarized.
3812 auto ForcedScalar = ForcedScalars.find(VF);
3813 if (ForcedScalar != ForcedScalars.end())
3814 for (auto *I : ForcedScalar->second) {
3815 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3816 Worklist.insert(I);
3817 }
3818
3819 // Expand the worklist by looking through any bitcasts and getelementptr
3820 // instructions we've already identified as scalar. This is similar to the
3821 // expansion step in collectLoopUniforms(); however, here we're only
3822 // expanding to include additional bitcasts and getelementptr instructions.
3823 unsigned Idx = 0;
3824 while (Idx != Worklist.size()) {
3825 Instruction *Dst = Worklist[Idx++];
3826 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3827 continue;
3828 auto *Src = cast<Instruction>(Dst->getOperand(0));
3829 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3830 auto *J = cast<Instruction>(U);
3831 return !TheLoop->contains(J) || Worklist.count(J) ||
3832 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3833 isScalarUse(J, Src));
3834 })) {
3835 Worklist.insert(Src);
3836 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3837 }
3838 }
3839
3840 // An induction variable will remain scalar if all users of the induction
3841 // variable and induction variable update remain scalar.
3842 for (const auto &Induction : Legal->getInductionVars()) {
3843 auto *Ind = Induction.first;
3844 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3845
3846 // If tail-folding is applied, the primary induction variable will be used
3847 // to feed a vector compare.
3848 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3849 continue;
3850
3851 // Returns true if \p Indvar is a pointer induction that is used directly by
3852 // load/store instruction \p I.
3853 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3854 Instruction *I) {
3855 return Induction.second.getKind() ==
3857 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3858 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3859 };
3860
3861 // Determine if all users of the induction variable are scalar after
3862 // vectorization.
3863 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3864 auto *I = cast<Instruction>(U);
3865 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3866 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3867 });
3868 if (!ScalarInd)
3869 continue;
3870
3871 // If the induction variable update is a fixed-order recurrence, neither the
3872 // induction variable nor its update should be marked scalar after
3873 // vectorization.
3874 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3875 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3876 continue;
3877
3878 // Determine if all users of the induction variable update instruction are
3879 // scalar after vectorization.
3880 auto ScalarIndUpdate =
3881 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3882 auto *I = cast<Instruction>(U);
3883 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3884 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3885 });
3886 if (!ScalarIndUpdate)
3887 continue;
3888
3889 // The induction variable and its update instruction will remain scalar.
3890 Worklist.insert(Ind);
3891 Worklist.insert(IndUpdate);
3892 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3893 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3894 << "\n");
3895 }
3896
3897 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3898}
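// Seeding example (illustrative): for
//   %gep = getelementptr inbounds i32, ptr %a, i64 %iv
//   store i32 %x, ptr %gep
// with any widening decision other than CM_GatherScatter, the use of %gep as
// the store's pointer is a scalar use; if all of %gep's users are memory
// accesses it is placed in ScalarPtrs and, unless another use marks it as
// possibly non-scalar, it ends up in Scalars[VF].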
3899
3901 Instruction *I, ElementCount VF) const {
3902 if (!isPredicatedInst(I))
3903 return false;
3904
3905 // Do we have a non-scalar lowering for this predicated
3906 // instruction? No - it is scalar with predication.
3907 switch(I->getOpcode()) {
3908 default:
3909 return true;
3910 case Instruction::Call:
3911 if (VF.isScalar())
3912 return true;
3913 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3914 .Kind == CM_Scalarize;
3915 case Instruction::Load:
3916 case Instruction::Store: {
3918 auto *Ty = getLoadStoreType(I);
3919 Type *VTy = Ty;
3920 if (VF.isVector())
3921 VTy = VectorType::get(Ty, VF);
3922 const Align Alignment = getLoadStoreAlignment(I);
3923 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3924 TTI.isLegalMaskedGather(VTy, Alignment))
3925 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3926 TTI.isLegalMaskedScatter(VTy, Alignment));
3927 }
3928 case Instruction::UDiv:
3929 case Instruction::SDiv:
3930 case Instruction::SRem:
3931 case Instruction::URem: {
3932 // We have the option to use the safe-divisor idiom to avoid predication.
3933 // The cost based decision here will always select safe-divisor for
3934 // scalable vectors as scalarization isn't legal.
3935 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3936 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3937 }
3938 }
3939}
3940
3942 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3943 return false;
3944
3945 // Can we prove this instruction is safe to unconditionally execute?
3946 // If not, we must use some form of predication.
3947 switch(I->getOpcode()) {
3948 default:
3949 return false;
3950 case Instruction::Load:
3951 case Instruction::Store: {
3952 if (!Legal->isMaskRequired(I))
3953 return false;
3954 // When we know the load's address is loop invariant and the instruction
3955 // in the original scalar loop was unconditionally executed then we
3956 // don't need to mark it as a predicated instruction. Tail folding may
3957 // introduce additional predication, but we're guaranteed to always have
3958 // at least one active lane. We call Legal->blockNeedsPredication here
3959 // because it doesn't query tail-folding. For stores, we must prove both
3960 // speculation safety (which follows from the same argument as for loads)
3961 // and that the value being stored is correct. The easiest form of the
3962 // latter is to require that all values stored are the same.
3964 (isa<LoadInst>(I) ||
3965 (isa<StoreInst>(I) &&
3966 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3967 !Legal->blockNeedsPredication(I->getParent()))
3968 return false;
3969 return true;
3970 }
3971 case Instruction::UDiv:
3972 case Instruction::SDiv:
3973 case Instruction::SRem:
3974 case Instruction::URem:
3975 // TODO: We can use the loop-preheader as context point here and get
3976 // context sensitive reasoning
3978 case Instruction::Call:
3979 return Legal->isMaskRequired(I);
3980 }
3981}
3982
3983std::pair<InstructionCost, InstructionCost>
3985 ElementCount VF) const {
3986 assert(I->getOpcode() == Instruction::UDiv ||
3987 I->getOpcode() == Instruction::SDiv ||
3988 I->getOpcode() == Instruction::SRem ||
3989 I->getOpcode() == Instruction::URem);
3991
3993
3994 // Scalarization isn't legal for scalable vector types
3995 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3996 if (!VF.isScalable()) {
3997 // Get the scalarization cost and scale this amount by the probability of
3998 // executing the predicated block. If the instruction is not predicated,
3999 // we fall through to the next case.
4000 ScalarizationCost = 0;
4001
4002 // These instructions have a non-void type, so account for the phi nodes
4003 // that we will create. This cost is likely to be zero. The phi node
4004 // cost, if any, should be scaled by the block probability because it
4005 // models a copy at the end of each predicated block.
4006 ScalarizationCost += VF.getKnownMinValue() *
4007 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4008
4009 // The cost of the non-predicated instruction.
4010 ScalarizationCost += VF.getKnownMinValue() *
4011 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4012
4013 // The cost of insertelement and extractelement instructions needed for
4014 // scalarization.
4015 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4016
4017 // Scale the cost by the probability of executing the predicated blocks.
4018 // This assumes the predicated block for each vector lane is equally
4019 // likely.
4020 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4021 }
4022 InstructionCost SafeDivisorCost = 0;
4023
4024 auto *VecTy = ToVectorTy(I->getType(), VF);
4025
4026 // The cost of the select guard to ensure all lanes are well defined
4027 // after we speculate above any internal control flow.
4028 SafeDivisorCost += TTI.getCmpSelInstrCost(
4029 Instruction::Select, VecTy,
4030 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4032
4033 // Certain instructions can be cheaper to vectorize if they have a constant
4034 // second vector operand. One example of this are shifts on x86.
4035 Value *Op2 = I->getOperand(1);
4036 auto Op2Info = TTI.getOperandInfo(Op2);
4037 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4038 Legal->isInvariant(Op2))
4040
4041 SmallVector<const Value *, 4> Operands(I->operand_values());
4042 SafeDivisorCost += TTI.getArithmeticInstrCost(
4043 I->getOpcode(), VecTy, CostKind,
4044 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4045 Op2Info, Operands, I);
4046 return {ScalarizationCost, SafeDivisorCost};
4047}
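// The safe-divisor alternative costed above replaces a predicated division by
// an unconditional one whose divisor is forced to 1 on inactive lanes, e.g.
// (shorthand, VF = 4):
//   %safe.div = select <4 x i1> %mask, <4 x i32> %d,
//                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %q        = udiv <4 x i32> %n, %safe.div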
4048
4050 Instruction *I, ElementCount VF) {
4051 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4053 "Decision should not be set yet.");
4054 auto *Group = getInterleavedAccessGroup(I);
4055 assert(Group && "Must have a group.");
4056
4057 // If the instruction's allocated size doesn't equal its type size, it
4058 // requires padding and will be scalarized.
4059 auto &DL = I->getModule()->getDataLayout();
4060 auto *ScalarTy = getLoadStoreType(I);
4061 if (hasIrregularType(ScalarTy, DL))
4062 return false;
4063
4064 // If the group involves a non-integral pointer, we may not be able to
4065 // losslessly cast all values to a common type.
4066 unsigned InterleaveFactor = Group->getFactor();
4067 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4068 for (unsigned i = 0; i < InterleaveFactor; i++) {
4069 Instruction *Member = Group->getMember(i);
4070 if (!Member)
4071 continue;
4072 auto *MemberTy = getLoadStoreType(Member);
4073 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4074 // Don't coerce non-integral pointers to integers or vice versa.
4075 if (MemberNI != ScalarNI) {
4076 // TODO: Consider adding special nullptr value case here
4077 return false;
4078 } else if (MemberNI && ScalarNI &&
4079 ScalarTy->getPointerAddressSpace() !=
4080 MemberTy->getPointerAddressSpace()) {
4081 return false;
4082 }
4083 }
4084
4085 // Check if masking is required.
4086 // A Group may need masking for one of two reasons: it resides in a block that
4087 // needs predication, or it was decided to use masking to deal with gaps
4088 // (either a gap at the end of a load-access that may result in a speculative
4089 // load, or any gaps in a store-access).
4090 bool PredicatedAccessRequiresMasking =
4091 blockNeedsPredicationForAnyReason(I->getParent()) &&
4093 bool LoadAccessWithGapsRequiresEpilogMasking =
4094 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4096 bool StoreAccessWithGapsRequiresMasking =
4097 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4098 if (!PredicatedAccessRequiresMasking &&
4099 !LoadAccessWithGapsRequiresEpilogMasking &&
4100 !StoreAccessWithGapsRequiresMasking)
4101 return true;
4102
4103 // If masked interleaving is required, we expect that the user/target had
4104 // enabled it, because otherwise it either wouldn't have been created or
4105 // it should have been invalidated by the CostModel.
4107 "Masked interleave-groups for predicated accesses are not enabled.");
4108
4109 if (Group->isReverse())
4110 return false;
4111
4112 auto *Ty = getLoadStoreType(I);
4113 const Align Alignment = getLoadStoreAlignment(I);
4114 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4115 : TTI.isLegalMaskedStore(Ty, Alignment);
4116}
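// Example of the gap case (illustrative): a factor-2 load group that only
// reads a[2*i] leaves a gap at the end of the last wide access, so the final
// vector load could touch memory past the array; when no scalar epilogue is
// allowed, the group can only be widened if the target supports the required
// masked load.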
4117
4119 Instruction *I, ElementCount VF) {
4120 // Get and ensure we have a valid memory instruction.
4121 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4122
4124 auto *ScalarTy = getLoadStoreType(I);
4125
4126 // In order to be widened, the pointer should be consecutive, first of all.
4127 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4128 return false;
4129
4130 // If the instruction is a store located in a predicated block, it will be
4131 // scalarized.
4132 if (isScalarWithPredication(I, VF))
4133 return false;
4134
4135 // If the instruction's allocated size doesn't equal its type size, it
4136 // requires padding and will be scalarized.
4137 auto &DL = I->getModule()->getDataLayout();
4138 if (hasIrregularType(ScalarTy, DL))
4139 return false;
4140
4141 return true;
4142}
4143
4144void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4145 // We should not collect Uniforms more than once per VF. Right now,
4146 // this function is called from collectUniformsAndScalars(), which
4147 // already does this check. Collecting Uniforms for VF=1 does not make any
4148 // sense.
4149
4150 assert(VF.isVector() && !Uniforms.contains(VF) &&
4151 "This function should not be visited twice for the same VF");
4152
4153 // Visit the list of Uniforms. If we don't find any uniform value, we won't
4154 // analyze again. Uniforms.count(VF) will return 1.
4155 Uniforms[VF].clear();
4156
4157 // We now know that the loop is vectorizable!
4158 // Collect instructions inside the loop that will remain uniform after
4159 // vectorization.
4160
4161 // Global values, params and instructions outside of current loop are out of
4162 // scope.
4163 auto isOutOfScope = [&](Value *V) -> bool {
4164 Instruction *I = dyn_cast<Instruction>(V);
4165 return (!I || !TheLoop->contains(I));
4166 };
4167
4168 // Worklist containing uniform instructions demanding lane 0.
4169 SetVector<Instruction *> Worklist;
4170 BasicBlock *Latch = TheLoop->getLoopLatch();
4171
4172 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4173 // that are scalar with predication must not be considered uniform after
4174 // vectorization, because that would create an erroneous replicating region
4175 // where only a single instance out of VF should be formed.
4176 // TODO: optimize such seldom cases if found important, see PR40816.
4177 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4178 if (isOutOfScope(I)) {
4179 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4180 << *I << "\n");
4181 return;
4182 }
4183 if (isScalarWithPredication(I, VF)) {
4184 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4185 << *I << "\n");
4186 return;
4187 }
4188 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4189 Worklist.insert(I);
4190 };
4191
4192 // Start with the conditional branch. If the branch condition is an
4193 // instruction contained in the loop that is only used by the branch, it is
4194 // uniform.
4195 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4196 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4197 addToWorklistIfAllowed(Cmp);
4198
4199 auto PrevVF = VF.divideCoefficientBy(2);
4200 // Return true if all lanes perform the same memory operation, and we can
4201 // thus choose to execute only one.
4202 auto isUniformMemOpUse = [&](Instruction *I) {
4203 // If the value was already known to not be uniform for the previous
4204 // (smaller VF), it cannot be uniform for the larger VF.
4205 if (PrevVF.isVector()) {
4206 auto Iter = Uniforms.find(PrevVF);
4207 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4208 return false;
4209 }
4210 if (!Legal->isUniformMemOp(*I, VF))
4211 return false;
4212 if (isa<LoadInst>(I))
4213 // Loading the same address always produces the same result - at least
4214 // assuming aliasing and ordering which have already been checked.
4215 return true;
4216 // Storing the same value on every iteration.
4217 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4218 };
4219
4220 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4221 InstWidening WideningDecision = getWideningDecision(I, VF);
4222 assert(WideningDecision != CM_Unknown &&
4223 "Widening decision should be ready at this moment");
4224
4225 if (isUniformMemOpUse(I))
4226 return true;
4227
4228 return (WideningDecision == CM_Widen ||
4229 WideningDecision == CM_Widen_Reverse ||
4230 WideningDecision == CM_Interleave);
4231 };
4232
4233 // Returns true if Ptr is the pointer operand of a memory access instruction
4234 // I, I is known to not require scalarization, and the pointer is not also
4235 // stored.
4236 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4237 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4238 return false;
4239 return getLoadStorePointerOperand(I) == Ptr &&
4240 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4241 };
4242
4243 // Holds a list of values which are known to have at least one uniform use.
4244 // Note that there may be other uses which aren't uniform. A "uniform use"
4245 // here is something which only demands lane 0 of the unrolled iterations;
4246 // it does not imply that all lanes produce the same value (e.g. this is not
4247 // the usual meaning of uniform)
4248 SetVector<Value *> HasUniformUse;
4249
4250 // Scan the loop for instructions which are either a) known to have only
4251 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4252 for (auto *BB : TheLoop->blocks())
4253 for (auto &I : *BB) {
4254 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4255 switch (II->getIntrinsicID()) {
4256 case Intrinsic::sideeffect:
4257 case Intrinsic::experimental_noalias_scope_decl:
4258 case Intrinsic::assume:
4259 case Intrinsic::lifetime_start:
4260 case Intrinsic::lifetime_end:
4262 addToWorklistIfAllowed(&I);
4263 break;
4264 default:
4265 break;
4266 }
4267 }
4268
4269 // ExtractValue instructions must be uniform, because the operands are
4270 // known to be loop-invariant.
4271 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4272 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4273 "Expected aggregate value to be loop invariant");
4274 addToWorklistIfAllowed(EVI);
4275 continue;
4276 }
4277
4278 // If there's no pointer operand, there's nothing to do.
4280 if (!Ptr)
4281 continue;
4282
4283 if (isUniformMemOpUse(&I))
4284 addToWorklistIfAllowed(&I);
4285
4286 if (isVectorizedMemAccessUse(&I, Ptr))
4287 HasUniformUse.insert(Ptr);
4288 }
4289
4290 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4291 // demanding) users. Since loops are assumed to be in LCSSA form, this
4292 // disallows uses outside the loop as well.
4293 for (auto *V : HasUniformUse) {
4294 if (isOutOfScope(V))
4295 continue;
4296 auto *I = cast<Instruction>(V);
4297 auto UsersAreMemAccesses =
4298 llvm::all_of(I->users(), [&](User *U) -> bool {
4299 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4300 });
4301 if (UsersAreMemAccesses)
4302 addToWorklistIfAllowed(I);
4303 }
4304
4305 // Expand Worklist in topological order: whenever a new instruction
4306 // is added, its users should already be inside Worklist. This ensures
4307 // that a uniform instruction will only be used by uniform instructions.
4308 unsigned idx = 0;
4309 while (idx != Worklist.size()) {
4310 Instruction *I = Worklist[idx++];
4311
4312 for (auto *OV : I->operand_values()) {
4313 // isOutOfScope operands cannot be uniform instructions.
4314 if (isOutOfScope(OV))
4315 continue;
4316 // First-order recurrence phis should typically be considered
4317 // non-uniform.
4318 auto *OP = dyn_cast<PHINode>(OV);
4320 continue;
4321 // If all the users of the operand are uniform, then add the
4322 // operand into the uniform worklist.
4323 auto *OI = cast<Instruction>(OV);
4324 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4325 auto *J = cast<Instruction>(U);
4326 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4327 }))
4328 addToWorklistIfAllowed(OI);
4329 }
4330 }
4331
4332 // For an instruction to be added into Worklist above, all its users inside
4333 // the loop should also be in Worklist. However, this condition cannot be
4334 // true for phi nodes that form a cyclic dependence. We must process phi
4335 // nodes separately. An induction variable will remain uniform if all users
4336 // of the induction variable and induction variable update remain uniform.
4337 // The code below handles both pointer and non-pointer induction variables.
4338 for (const auto &Induction : Legal->getInductionVars()) {
4339 auto *Ind = Induction.first;
4340 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4341
4342 // Determine if all users of the induction variable are uniform after
4343 // vectorization.
4344 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4345 auto *I = cast<Instruction>(U);
4346 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4347 isVectorizedMemAccessUse(I, Ind);
4348 });
4349 if (!UniformInd)
4350 continue;
4351
4352 // Determine if all users of the induction variable update instruction are
4353 // uniform after vectorization.
4354 auto UniformIndUpdate =
4355 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4356 auto *I = cast<Instruction>(U);
4357 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4358 isVectorizedMemAccessUse(I, IndUpdate);
4359 });
4360 if (!UniformIndUpdate)
4361 continue;
4362
4363 // The induction variable and its update instruction will remain uniform.
4364 addToWorklistIfAllowed(Ind);
4365 addToWorklistIfAllowed(IndUpdate);
4366 }
4367
4368 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4369}
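// Illustrative example: for a load of a loop-invariant address
//   %p = getelementptr i32, ptr %base, i64 %inv   ; operands loop-invariant
//   %v = load i32, ptr %p
// only lane 0 of the address is demanded, so the load (and any in-loop
// address computation whose other users are also uniform memory accesses)
// ends up in Uniforms[VF].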
4370
4372 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4373
4375 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4376 "runtime pointer checks needed. Enable vectorization of this "
4377 "loop with '#pragma clang loop vectorize(enable)' when "
4378 "compiling with -Os/-Oz",
4379 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4380 return true;
4381 }
4382
4383 if (!PSE.getPredicate().isAlwaysTrue()) {
4384 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4385 "runtime SCEV checks needed. Enable vectorization of this "
4386 "loop with '#pragma clang loop vectorize(enable)' when "
4387 "compiling with -Os/-Oz",
4388 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4389 return true;
4390 }
4391
4392 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4393 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4394 reportVectorizationFailure("Runtime stride check for small trip count",
4395 "runtime stride == 1 checks needed. Enable vectorization of "
4396 "this loop without such check by compiling with -Os/-Oz",
4397 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4398 return true;
4399 }
4400
4401 return false;
4402}
4403
4405LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4407 return ElementCount::getScalable(0);
4408
4410 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4411 "ScalableVectorizationDisabled", ORE, TheLoop);
4412 return ElementCount::getScalable(0);
4413 }
4414
4415 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4416
4417 auto MaxScalableVF = ElementCount::getScalable(
4418 std::numeric_limits<ElementCount::ScalarTy>::max());
4419
4420 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4421 // FIXME: While for scalable vectors this is currently sufficient, this should
4422 // be replaced by a more detailed mechanism that filters out specific VFs,
4423 // instead of invalidating vectorization for a whole set of VFs based on the
4424 // MaxVF.
4425
4426 // Disable scalable vectorization if the loop contains unsupported reductions.
4427 if (!canVectorizeReductions(MaxScalableVF)) {
4429 "Scalable vectorization not supported for the reduction "
4430 "operations found in this loop.",
4431 "ScalableVFUnfeasible", ORE, TheLoop);
4432 return ElementCount::getScalable(0);
4433 }
4434
4435 // Disable scalable vectorization if the loop contains any instructions
4436 // with element types not supported for scalable vectors.
4437 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4438 return !Ty->isVoidTy() &&
4440 })) {
4441 reportVectorizationInfo("Scalable vectorization is not supported "
4442 "for all element types found in this loop.",
4443 "ScalableVFUnfeasible", ORE, TheLoop);
4444 return ElementCount::getScalable(0);
4445 }
4446
4448 return MaxScalableVF;
4449
4450 // Limit MaxScalableVF by the maximum safe dependence distance.
4451 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4452 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4453 else
4454 MaxScalableVF = ElementCount::getScalable(0);
4455
4456 if (!MaxScalableVF)
4458 "Max legal vector width too small, scalable vectorization "
4459 "unfeasible.",
4460 "ScalableVFUnfeasible", ORE, TheLoop);
4461
4462 return MaxScalableVF;
4463}
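// Worked example (illustrative numbers): with MaxSafeElements = 32 and a
// target whose maximum vscale is 16, the clamp above yields a maximum legal
// scalable VF of vscale x 2 (32 / 16).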
4464
4465FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4466 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4468 unsigned SmallestType, WidestType;
4469 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4470
4471 // Get the maximum safe dependence distance in bits computed by LAA.
4472 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4473 // the memory accesses that is most restrictive (involved in the smallest
4474 // dependence distance).
4475 unsigned MaxSafeElements =
4477
4478 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4479 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4480
4481 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4482 << ".\n");
4483 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4484 << ".\n");
4485
4486 // First analyze the UserVF, fall back if the UserVF should be ignored.
4487 if (UserVF) {
4488 auto MaxSafeUserVF =
4489 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4490
4491 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4492 // If `VF=vscale x N` is safe, then so is `VF=N`
4493 if (UserVF.isScalable())
4494 return FixedScalableVFPair(
4495 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4496 else
4497 return UserVF;
4498 }
4499
4500 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4501
4502 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4503 // is better to ignore the hint and let the compiler choose a suitable VF.
4504 if (!UserVF.isScalable()) {
4505 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4506 << " is unsafe, clamping to max safe VF="
4507 << MaxSafeFixedVF << ".\n");
4508 ORE->emit([&]() {
4509 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4511 TheLoop->getHeader())
4512 << "User-specified vectorization factor "
4513 << ore::NV("UserVectorizationFactor", UserVF)
4514 << " is unsafe, clamping to maximum safe vectorization factor "
4515 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4516 });
4517 return MaxSafeFixedVF;
4518 }
4519
4521 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4522 << " is ignored because scalable vectors are not "
4523 "available.\n");
4524 ORE->emit([&]() {
4525 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4527 TheLoop->getHeader())
4528 << "User-specified vectorization factor "
4529 << ore::NV("UserVectorizationFactor", UserVF)
4530 << " is ignored because the target does not support scalable "
4531 "vectors. The compiler will pick a more suitable value.";
4532 });
4533 } else {
4534 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4535 << " is unsafe. Ignoring scalable UserVF.\n");
4536 ORE->emit([&]() {
4537 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4539 TheLoop->getHeader())
4540 << "User-specified vectorization factor "
4541 << ore::NV("UserVectorizationFactor", UserVF)
4542 << " is unsafe. Ignoring the hint to let the compiler pick a "
4543 "more suitable value.";
4544 });
4545 }
4546 }
4547
4548 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4549 << " / " << WidestType << " bits.\n");
4550
4553 if (auto MaxVF =
4554 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4555 MaxSafeFixedVF, FoldTailByMasking))
4556 Result.FixedVF = MaxVF;
4557
4558 if (auto MaxVF =
4559 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4560 MaxSafeScalableVF, FoldTailByMasking))
4561 if (MaxVF.isScalable()) {
4562 Result.ScalableVF = MaxVF;
4563 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4564 << "\n");
4565 }
4566
4567 return Result;
4568}
4569
4573 // TODO: It may be useful to do this, since it's still likely to be
4574 // dynamically uniform if the target can skip.
4576 "Not inserting runtime ptr check for divergent target",
4577 "runtime pointer checks needed. Not enabled for divergent target",
4578 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4580 }
4581
4582 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4583 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4584 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4585 if (TC == 1) {
4586 reportVectorizationFailure("Single iteration (non) loop",
4587 "loop trip count is one, irrelevant for vectorization",
4588 "SingleIterationLoop", ORE, TheLoop);
4590 }
4591
4592 switch (ScalarEpilogueStatus) {
4593 case CM_ScalarEpilogueAllowed:
4594 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4595 case CM_ScalarEpilogueNotAllowedUsePredicate:
4596 [[fallthrough]];
4597 case CM_ScalarEpilogueNotNeededUsePredicate:
4598 LLVM_DEBUG(
4599 dbgs() << "LV: vector predicate hint/switch found.\n"
4600 << "LV: Not allowing scalar epilogue, creating predicated "
4601 << "vector loop.\n");
4602 break;
4603 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4604 // fallthrough as a special case of OptForSize
4605 case CM_ScalarEpilogueNotAllowedOptSize:
4606 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4607 LLVM_DEBUG(
4608 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4609 else
4610 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4611 << "count.\n");
4612
4613 // Bail if runtime checks are required; they are not good when optimising
4614 // for size.
4615 if (runtimeChecksRequired())
4616 return FixedScalableVFPair::getNone();
4617
4618 break;
4619 }
4620
4621 // The only loops we can vectorize without a scalar epilogue are loops with
4622 // a bottom-test and a single exiting block. We'd have to handle the fact
4623 // that not every instruction executes on the last iteration. This will
4624 // require a lane mask which varies through the vector loop body. (TODO)
4625 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4626 // If there was a tail-folding hint/switch, but we can't fold the tail by
4627 // masking, fall back to vectorization with a scalar epilogue.
4628 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4629 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4630 "scalar epilogue instead.\n");
4631 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4632 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4633 }
4634 return FixedScalableVFPair::getNone();
4635 }
4636
4637 // Now try the tail folding
4638
4639 // Invalidate interleave groups that require an epilogue if we can't mask
4640 // the interleave-group.
4641 if (!useMaskedInterleavedAccesses(TTI)) {
4642 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4643 "No decisions should have been taken at this point");
4644 // Note: There is no need to invalidate any cost modeling decisions here, as
4645 // none were taken so far.
4646 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4647 }
4648
4649 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4650
4651 // Avoid tail folding if the trip count is known to be a multiple of any VF
4652 // we choose.
4653 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4654 MaxFactors.FixedVF.getFixedValue();
4655 if (MaxFactors.ScalableVF) {
4656 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4657 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4658 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4659 *MaxPowerOf2RuntimeVF,
4660 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4661 } else
4662 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4663 }
4664
4665 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4666 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4667 "MaxFixedVF must be a power of 2");
4668 unsigned MaxVFtimesIC =
4669 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4670 ScalarEvolution *SE = PSE.getSE();
4671 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4672 const SCEV *ExitCount = SE->getAddExpr(
4673 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4674 const SCEV *Rem = SE->getURemExpr(
4675 SE->applyLoopGuards(ExitCount, TheLoop),
4676 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4677 if (Rem->isZero()) {
4678 // Accept MaxFixedVF if we do not have a tail.
4679 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4680 return MaxFactors;
4681 }
4682 }
4683
4684 // If we don't know the precise trip count, or if the trip count that we
4685 // found modulo the vectorization factor is not zero, try to fold the tail
4686 // by masking.
4687 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4688 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4689 if (foldTailByMasking()) {
4691 LLVM_DEBUG(
4692 dbgs()
4693 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4694 "try to generate VP Intrinsics with scalable vector "
4695 "factors only.\n");
4696 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4697 // for now.
4698 // TODO: extend it for fixed vectors, if required.
4699 assert(MaxFactors.ScalableVF.isScalable() &&
4700 "Expected scalable vector factor.");
4701
4702 MaxFactors.FixedVF = ElementCount::getFixed(1);
4703 }
4704 return MaxFactors;
4705 }
4706
4707 // If there was a tail-folding hint/switch, but we can't fold the tail by
4708 // masking, fall back to vectorization with a scalar epilogue.
4709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4710 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4711 "scalar epilogue instead.\n");
4712 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4713 return MaxFactors;
4714 }
4715
4716 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4717 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4719 }
4720
4721 if (TC == 0) {
4722 reportVectorizationFailure(
4723 "Unable to calculate the loop count due to complex control flow",
4724 "unable to calculate the loop count due to complex control flow",
4725 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4726 return FixedScalableVFPair::getNone();
4727 }
4728
4730 "Cannot optimize for size and vectorize at the same time.",
4731 "cannot optimize for size and vectorize at the same time. "
4732 "Enable vectorization of this loop with '#pragma clang loop "
4733 "vectorize(enable)' when compiling with -Os/-Oz",
4734 "NoTailLoopWithOptForSize", ORE, TheLoop);
4736}
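
// The divisibility test above (via SCEV) decides whether any scalar tail can
// remain for the chosen factors. A standalone arithmetic sketch with plain
// unsigned values standing in for the SCEV expressions; the helper name is
// illustrative and not part of this pass.

// True if a loop with trip count TC leaves no remainder for a vector loop
// that processes MaxVF * IC iterations per step, so no tail folding and no
// scalar epilogue are needed.
bool noTailRemains(unsigned TC, unsigned MaxVF, unsigned IC) {
  unsigned MaxVFTimesIC = MaxVF * IC;
  return TC != 0 && TC % MaxVFTimesIC == 0;
}
// Example: TC=128, MaxVF=8, IC=2 -> 128 % 16 == 0, no tail.
//          TC=100, MaxVF=8, IC=2 -> 100 % 16 == 4, fold the tail by masking
//          or keep a scalar epilogue.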
4737
4738ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4739 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4740 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4741 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4742 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4743 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4744 : TargetTransformInfo::RGK_FixedWidthVector);
4745
4746 // Convenience function to return the minimum of two ElementCounts.
4747 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4748 assert((LHS.isScalable() == RHS.isScalable()) &&
4749 "Scalable flags must match");
4750 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4751 };
4752
4753 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4754 // Note that both WidestRegister and WidestType may not be powers of 2.
4755 auto MaxVectorElementCount = ElementCount::get(
4756 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4757 ComputeScalableMaxVF);
4758 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4759 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4760 << (MaxVectorElementCount * WidestType) << " bits.\n");
4761
4762 if (!MaxVectorElementCount) {
4763 LLVM_DEBUG(dbgs() << "LV: The target has no "
4764 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4765 << " vector registers.\n");
4766 return ElementCount::getFixed(1);
4767 }
4768
4769 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4770 if (MaxVectorElementCount.isScalable() &&
4771 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4772 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4773 auto Min = Attr.getVScaleRangeMin();
4774 WidestRegisterMinEC *= Min;
4775 }
4776
4777 // When a scalar epilogue is required, at least one iteration of the scalar
4778 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4779 // max VF that results in a dead vector loop.
4780 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4781 MaxTripCount -= 1;
4782
4783 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4784 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4785 // If the upper bound on the loop trip count (TC) is known at compile time,
4786 // there is no point in choosing a VF greater than TC (as done in the loop
4787 // below). Select the maximum power of two which doesn't exceed TC. If
4788 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4789 // the TC is less than or equal to the known number of lanes.
4790 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4791 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4792 "exceeding the constant trip count: "
4793 << ClampedUpperTripCount << "\n");
4794 return ElementCount::get(
4795 ClampedUpperTripCount,
4796 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4797 }
4798
4800 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4802 ElementCount MaxVF = MaxVectorElementCount;
4803 if (MaximizeBandwidth ||
4804 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4807 auto MaxVectorElementCountMaxBW = ElementCount::get(
4808 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4809 ComputeScalableMaxVF);
4810 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4811
4812 // Collect all viable vectorization factors larger than the default MaxVF
4813 // (i.e. MaxVectorElementCount).
4815 for (ElementCount VS = MaxVectorElementCount * 2;
4816 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4817 VFs.push_back(VS);
4818
4819 // For each VF calculate its register usage.
4820 auto RUs = calculateRegisterUsage(VFs);
4821
4822 // Select the largest VF which doesn't require more registers than existing
4823 // ones.
4824 for (int i = RUs.size() - 1; i >= 0; --i) {
4825 bool Selected = true;
4826 for (auto &pair : RUs[i].MaxLocalUsers) {
4827 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4828 if (pair.second > TargetNumRegisters)
4829 Selected = false;
4830 }
4831 if (Selected) {
4832 MaxVF = VFs[i];
4833 break;
4834 }
4835 }
4836 if (ElementCount MinVF =
4837 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4838 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4839 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4840 << ") with target's minimum: " << MinVF << '\n');
4841 MaxVF = MinVF;
4842 }
4843 }
4844
4845 // Invalidate any widening decisions we might have made, in case the loop
4846 // requires prediction (decided later), but we have already made some
4847 // load/store widening decisions.
4849 }
4850 return MaxVF;
4851}
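
// A simplified, self-contained sketch of the clamping performed above, using
// plain integers instead of ElementCount/TypeSize. powerOf2Floor mirrors
// llvm::bit_floor; all names here are illustrative and not part of this pass.

// Round a nonzero value down to a power of two.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

// Max lanes per register, clamped by the dependence-safe VF and, when a small
// trip count is known, by the largest power of two not exceeding it.
unsigned maximizedVF(unsigned WidestRegisterBits, unsigned WidestTypeBits,
                     unsigned MaxSafeVF, unsigned MaxTripCount) {
  unsigned MaxElts = powerOf2Floor(WidestRegisterBits / WidestTypeBits);
  unsigned VF = MaxElts < MaxSafeVF ? MaxElts : MaxSafeVF;
  if (MaxTripCount != 0 && MaxTripCount <= VF)
    VF = powerOf2Floor(MaxTripCount);
  return VF;
}
// Example: 256-bit registers and i32 elements give 8 lanes; with an unknown
// trip count the result is 8, with a known trip count of 6 it clamps to 4.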
4852
4853/// Convenience function that returns the value of vscale_range if
4854/// vscale_range.min == vscale_range.max, and otherwise returns the value
4855/// returned by the corresponding TTI method.
4856static std::optional<unsigned>
4858 const Function *Fn = L->getHeader()->getParent();
4859 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4860 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4861 auto Min = Attr.getVScaleRangeMin();
4862 auto Max = Attr.getVScaleRangeMax();
4863 if (Max && Min == Max)
4864 return Max;
4865 }
4866
4867 return TTI.getVScaleForTuning();
4868}
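
// A minimal sketch of the preference order implemented above, with
// std::optional values standing in for the attribute query and the TTI hook.
// The names are illustrative and not part of this pass.
#include <optional>

// Prefer an exact vscale when the vscale_range attribute pins it
// (min == max); otherwise fall back to the target's tuning value, if any.
std::optional<unsigned>
vscaleForTuningSketch(std::optional<unsigned> AttrMin,
                      std::optional<unsigned> AttrMax,
                      std::optional<unsigned> TargetTuningVScale) {
  if (AttrMin && AttrMax && *AttrMin == *AttrMax)
    return *AttrMax;          // vscale is pinned to a single value.
  return TargetTuningVScale;  // May be std::nullopt.
}
// Example: vscale_range(2,2) yields 2 regardless of the target hint;
//          vscale_range(1,16) with a target hint of 4 yields 4.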
4869
4870bool LoopVectorizationPlanner::isMoreProfitable(
4871 const VectorizationFactor &A, const VectorizationFactor &B) const {
4872 InstructionCost CostA = A.Cost;
4873 InstructionCost CostB = B.Cost;
4874
4875 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4876
4877 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4878 // If the trip count is a known (possibly small) constant, the trip count
4879 // will be rounded up to an integer number of iterations under
4880 // FoldTailByMasking. The total cost in that case will be
4881 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4882 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4883 // some extra overheads, but for the purpose of comparing the costs of
4884 // different VFs we can use this to compare the total loop-body cost
4885 // expected after vectorization.
4886 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4887 InstructionCost VectorCost,
4888 InstructionCost ScalarCost) {
4889 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4890 : VectorCost * (MaxTripCount / VF) +
4891 ScalarCost * (MaxTripCount % VF);
4892 };
4893 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4894 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4895
4896 return RTCostA < RTCostB;
4897 }
4898
4899 // Improve estimate for the vector width if it is scalable.
4900 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4901 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4902 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4903 if (A.Width.isScalable())
4904 EstimatedWidthA *= *VScale;
4905 if (B.Width.isScalable())
4906 EstimatedWidthB *= *VScale;
4907 }
4908
4909 // Assume vscale may be larger than 1 (or the value being tuned for),
4910 // so that scalable vectorization is slightly favorable over fixed-width
4911 // vectorization.
4912 if (A.Width.isScalable() && !B.Width.isScalable())
4913 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4914
4915 // To avoid the need for FP division:
4916 // (CostA / A.Width) < (CostB / B.Width)
4917 // <=> (CostA * B.Width) < (CostB * A.Width)
4918 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4919}
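
// A standalone sketch of the two comparisons above, using plain unsigned
// costs and widths instead of InstructionCost/ElementCount. The names are
// illustrative and not part of this pass.

// Cost-per-lane comparison without floating-point division:
//   CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA.
bool cheaperPerLane(unsigned CostA, unsigned WidthA, unsigned CostB,
                    unsigned WidthB) {
  return CostA * WidthB < CostB * WidthA;
}

// Whole-loop cost for a known trip count when the tail is not folded: the
// vector body runs floor(TC/VF) times and the scalar remainder TC%VF times.
unsigned loopCostForTripCount(unsigned TC, unsigned VF, unsigned VectorCost,
                              unsigned ScalarCost) {
  return VectorCost * (TC / VF) + ScalarCost * (TC % VF);
}
// Example: TC=10, VF=4, VectorCost=6, ScalarCost=2
//          -> 6 * 2 + 2 * 2 = 16 for the whole loop.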
4920
4923 Loop *TheLoop) {
4924 if (InvalidCosts.empty())
4925 return;
4926
4927 // Emit a report of VFs with invalid costs in the loop.
4928
4929 // Group the remarks per instruction, keeping the instruction order from
4930 // InvalidCosts.
4931 std::map<Instruction *, unsigned> Numbering;
4932 unsigned I = 0;
4933 for (auto &Pair : InvalidCosts)
4934 if (!Numbering.count(Pair.first))
4935 Numbering[Pair.first] = I++;
4936
4937 // Sort the list, first on instruction(number) then on VF.
4938 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4939 if (Numbering[A.first] != Numbering[B.first])
4940 return Numbering[A.first] < Numbering[B.first];
4941 ElementCountComparator ECC;
4942 return ECC(A.second, B.second);
4943 });
4944
4945 // For a list of ordered instruction-vf pairs:
4946 // [(load, vf1), (load, vf2), (store, vf1)]
4947 // Group the instructions together to emit separate remarks for:
4948 // load (vf1, vf2)
4949 // store (vf1)
4950 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4951 auto Subset = ArrayRef<InstructionVFPair>();
4952 do {
4953 if (Subset.empty())
4954 Subset = Tail.take_front(1);
4955
4956 Instruction *I = Subset.front().first;
4957
4958 // If the next instruction is different, or if there are no other pairs,
4959 // emit a remark for the collated subset. e.g.
4960 // [(load, vf1), (load, vf2)]
4961 // to emit:
4962 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4963 if (Subset == Tail || Tail[Subset.size()].first != I) {
4964 std::string OutString;
4965 raw_string_ostream OS(OutString);
4966 assert(!Subset.empty() && "Unexpected empty range");
4967 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4968 for (const auto &Pair : Subset)
4969 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4970 OS << "):";
4971 if (auto *CI = dyn_cast<CallInst>(I))
4972 OS << " call to " << CI->getCalledFunction()->getName();
4973 else
4974 OS << " " << I->getOpcodeName();
4975 OS.flush();
4976 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4977 Tail = Tail.drop_front(Subset.size());
4978 Subset = {};
4979 } else
4980 // Grow the subset by one element
4981 Subset = Tail.take_front(Subset.size() + 1);
4982 } while (!Tail.empty());
4983}
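
// A simplified sketch of the collation above: walk a sorted list of
// (name, VF) pairs and merge consecutive entries for the same name into one
// remark string. The types and names are illustrative and not part of this
// pass.
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

std::vector<std::string>
collateByName(const std::vector<std::pair<std::string, unsigned>> &Pairs) {
  std::vector<std::string> Remarks;
  for (std::size_t I = 0; I < Pairs.size();) {
    std::string Line = Pairs[I].first + ": VF=(";
    std::size_t J = I;
    for (; J < Pairs.size() && Pairs[J].first == Pairs[I].first; ++J)
      Line += (J == I ? "" : ", ") + std::to_string(Pairs[J].second);
    Remarks.push_back(Line + ")");
    I = J;
  }
  return Remarks;
}
// Example: {("load", 2), ("load", 4), ("store", 2)}
//          -> {"load: VF=(2, 4)", "store: VF=(2)"}.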
4984
4985VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4986 const ElementCountSet &VFCandidates) {
4987 InstructionCost ExpectedCost =
4989 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4990 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4991 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4992 "Expected Scalar VF to be a candidate");
4993
4994 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4995 ExpectedCost);
4996 VectorizationFactor ChosenFactor = ScalarCost;
4997
4998 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4999 if (ForceVectorization && VFCandidates.size() > 1) {
5000 // Ignore scalar width, because the user explicitly wants vectorization.
5001 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5002 // evaluation.
5003 ChosenFactor.Cost = InstructionCost::getMax();
5004 }
5005
5006 SmallVector<InstructionVFPair> InvalidCosts;
5007 for (const auto &i : VFCandidates) {
5008 // The cost for scalar VF=1 is already calculated, so ignore it.
5009 if (i.isScalar())
5010 continue;
5011
5013 CM.expectedCost(i, &InvalidCosts);
5014 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5015
5016#ifndef NDEBUG
5017 unsigned AssumedMinimumVscale =
5018 getVScaleForTuning(OrigLoop, TTI).value_or(1);
5019 unsigned Width =
5020 Candidate.Width.isScalable()
5021 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5022 : Candidate.Width.getFixedValue();
5023 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5024 << " costs: " << (Candidate.Cost / Width));
5025 if (i.isScalable())
5026 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5027 << AssumedMinimumVscale << ")");
5028 LLVM_DEBUG(dbgs() << ".\n");
5029#endif
5030
5031 if (!C.second && !ForceVectorization) {
5032 LLVM_DEBUG(
5033 dbgs() << "LV: Not considering vector loop of width " << i
5034 << " because it will not generate any vector instructions.\n");
5035 continue;
5036 }
5037
5038 // If profitable add it to ProfitableVF list.
5039 if (isMoreProfitable(Candidate, ScalarCost))
5040 ProfitableVFs.push_back(Candidate);
5041
5042 if (isMoreProfitable(Candidate, ChosenFactor))
5043 ChosenFactor = Candidate;
5044 }
5045
5046 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5047
5050 "There are conditional stores.",
5051 "store that is conditionally executed prevents vectorization",
5052 "ConditionalStore", ORE, OrigLoop);
5053 ChosenFactor = ScalarCost;
5054 }
5055
5056 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5057 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5058 << "LV: Vectorization seems to be not beneficial, "
5059 << "but was forced by a user.\n");
5060 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5061 return ChosenFactor;
5062}
5063
5064bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5065 ElementCount VF) const {
5066 // Cross iteration phis such as reductions need special handling and are
5067 // currently unsupported.
5068 if (any_of(OrigLoop->getHeader()->phis(),
5069 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5070 return false;
5071
5072 // Phis with uses outside of the loop require special handling and are
5073 // currently unsupported.
5074 for (const auto &Entry : Legal->getInductionVars()) {
5075 // Look for uses of the value of the induction at the last iteration.
5076 Value *PostInc =
5077 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5078 for (User *U : PostInc->users())
5079 if (!OrigLoop->contains(cast<Instruction>(U)))
5080 return false;
5081 // Look for uses of penultimate value of the induction.
5082 for (User *U : Entry.first->users())
5083 if (!OrigLoop->contains(cast<Instruction>(U)))
5084 return false;
5085 }
5086
5087 // Epilogue vectorization code has not been audited to ensure it handles
5088 // non-latch exits properly. It may be fine, but it needs to be audited and
5089 // tested.
5090 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5091 return false;
5092
5093 return true;
5094}
5095
5097 const ElementCount VF) const {
5098 // FIXME: We need a much better cost-model to take different parameters such
5099 // as register pressure, code size increase and cost of extra branches into
5100 // account. For now we apply a very crude heuristic and only consider loops
5101 // with vectorization factors larger than a certain value.
5102
5103 // Allow the target to opt out entirely.
5104 if (!TTI.preferEpilogueVectorization())
5105 return false;
5106
5107 // We also consider epilogue vectorization unprofitable for targets that don't
5108 // consider interleaving beneficial (e.g. MVE).
5109 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5110 return false;
5111
5112 unsigned Multiplier = 1;
5113 if (VF.isScalable())
5114 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5115 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5116 return true;
5117 return false;
5118}
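
// A minimal sketch of the profitability gate above, with plain integers. The
// threshold parameter corresponds to the EpilogueVectorizationMinVF option;
// the default of 16 used in the example is an assumption, and the function
// name is illustrative.

// Epilogue vectorization is considered profitable once the main loop's
// effective width (known-min lanes, scaled by vscale for scalable VFs)
// reaches the threshold.
bool epilogueSeemsProfitable(unsigned KnownMinVF, bool IsScalable,
                             unsigned VScaleForTuning,
                             unsigned MinVFThreshold) {
  unsigned Multiplier = IsScalable ? VScaleForTuning : 1;
  return Multiplier * KnownMinVF >= MinVFThreshold;
}
// Example: VF = vscale x 4 tuned for vscale 4 gives 16 lanes, which passes a
// threshold of 16, while a fixed VF of 8 would not.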
5119
5121 const ElementCount MainLoopVF, unsigned IC) {
5124 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5125 return Result;
5126 }
5127
5128 if (!CM.isScalarEpilogueAllowed()) {
5129 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5130 "epilogue is allowed.\n");
5131 return Result;
5132 }
5133
5134 // Not really a cost consideration, but check for unsupported cases here to
5135 // simplify the logic.
5136 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5137 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5138 "is not a supported candidate.\n");
5139 return Result;
5140 }
5141
5143 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5145 if (hasPlanWithVF(ForcedEC))
5146 return {ForcedEC, 0, 0};
5147 else {
5148 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5149 "viable.\n");
5150 return Result;
5151 }
5152 }
5153
5154 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5155 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5156 LLVM_DEBUG(
5157 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5158 return Result;
5159 }
5160
5161 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5162 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5163 "this loop\n");
5164 return Result;
5165 }
5166
5167 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5168 // the main loop handles 8 lanes per iteration. We could still benefit from
5169 // vectorizing the epilogue loop with VF=4.
5170 ElementCount EstimatedRuntimeVF = MainLoopVF;
5171 if (MainLoopVF.isScalable()) {
5172 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5173 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5174 EstimatedRuntimeVF *= *VScale;
5175 }
5176
5177 ScalarEvolution &SE = *PSE.getSE();
5178 Type *TCType = Legal->getWidestInductionType();
5179 const SCEV *RemainingIterations = nullptr;
5180 for (auto &NextVF : ProfitableVFs) {
5181 // Skip candidate VFs without a corresponding VPlan.
5182 if (!hasPlanWithVF(NextVF.Width))
5183 continue;
5184
5185 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5186 // vectors) or the VF of the main loop (fixed vectors).
5187 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5188 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5189 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5190 continue;
5191
5192 // If NextVF is greater than the number of remaining iterations, the
5193 // epilogue loop would be dead. Skip such factors.
5194 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5195 // TODO: extend to support scalable VFs.
5196 if (!RemainingIterations) {
5197 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5198 RemainingIterations = SE.getURemExpr(
5199 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5200 }
5201 if (SE.isKnownPredicate(
5203 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5204 RemainingIterations))
5205 continue;
5206 }
5207
5208 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5209 Result = NextVF;
5210 }
5211
5212 if (Result != VectorizationFactor::Disabled())
5213 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5214 << Result.Width << "\n");
5215 return Result;
5216}
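
// A simplified sketch of the two filters applied above when picking an
// epilogue VF: it must be narrower than the (vscale-estimated) main-loop
// width, and it must not exceed the iterations left over after the main
// vector loop. Plain integers stand in for ElementCount and SCEV; the names
// are illustrative and not part of this pass.
bool epilogueVFIsViable(unsigned CandidateVF, unsigned MainLoopKnownMinVF,
                        bool MainLoopVFIsScalable, unsigned VScale,
                        unsigned TripCount, unsigned IC) {
  unsigned EstimatedMainVF =
      MainLoopVFIsScalable ? MainLoopKnownMinVF * VScale : MainLoopKnownMinVF;
  if (CandidateVF >= EstimatedMainVF)
    return false;  // Not narrower than the main loop.
  unsigned Remaining = TripCount % (EstimatedMainVF * IC);
  return CandidateVF <= Remaining;  // Otherwise the epilogue loop is dead.
}
// Example: a main loop of vscale x 2 with vscale ~ 4 covers 8 lanes; with
// TC = 29 and IC = 1 there are 29 % 8 = 5 leftover iterations, so an
// epilogue VF of 4 is viable while 8 is not.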
5217
5218std::pair<unsigned, unsigned>
5220 unsigned MinWidth = -1U;
5221 unsigned MaxWidth = 8;
5223 // For in-loop reductions, no element types are added to ElementTypesInLoop
5224 // if there are no loads/stores in the loop. In this case, check through the
5225 // reduction variables to determine the maximum width.
5226 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5227 // Reset MaxWidth so that we can find the smallest type used by recurrences
5228 // in the loop.
5229 MaxWidth = -1U;
5230 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5231 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5232 // When finding the min width used by the recurrence we need to account
5233 // for casts on the input operands of the recurrence.
5234 MaxWidth = std::min<unsigned>(
5235 MaxWidth, std::min<unsigned>(
5238 }
5239 } else {
5240 for (Type *T : ElementTypesInLoop) {
5241 MinWidth = std::min<unsigned>(
5242 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5243 MaxWidth = std::max<unsigned>(
5244 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5245 }
5246 }
5247 return {MinWidth, MaxWidth};
5248}
5249
5251 ElementTypesInLoop.clear();
5252 // For each block.
5253 for (BasicBlock *BB : TheLoop->blocks()) {
5254 // For each instruction in the loop.
5255 for (Instruction &I : BB->instructionsWithoutDebug()) {
5256 Type *T = I.getType();
5257
5258 // Skip ignored values.
5259 if (ValuesToIgnore.count(&I))
5260 continue;
5261
5262 // Only examine Loads, Stores and PHINodes.
5263 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5264 continue;
5265
5266 // Examine PHI nodes that are reduction variables. Update the type to
5267 // account for the recurrence type.
5268 if (auto *PN = dyn_cast<PHINode>(&I)) {
5269 if (!Legal->isReductionVariable(PN))
5270 continue;
5271 const RecurrenceDescriptor &RdxDesc =
5272 Legal->getReductionVars().find(PN)->second;
5275 RdxDesc.getRecurrenceType(),
5277 continue;
5278 T = RdxDesc.getRecurrenceType();
5279 }
5280
5281 // Examine the stored values.
5282 if (auto *ST = dyn_cast<StoreInst>(&I))
5283 T = ST->getValueOperand()->getType();
5284
5285 assert(T->isSized() &&
5286 "Expected the load/store/recurrence type to be sized");
5287
5288 ElementTypesInLoop.insert(T);
5289 }
5290 }
5291}
5292
5293unsigned
5295 InstructionCost LoopCost) {
5296 // -- The interleave heuristics --
5297 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5298 // There are many micro-architectural considerations that we can't predict
5299 // at this level. For example, frontend pressure (on decode or fetch) due to
5300 // code size, or the number and capabilities of the execution ports.
5301 //
5302 // We use the following heuristics to select the interleave count:
5303 // 1. If the code has reductions, then we interleave to break the cross
5304 // iteration dependency.
5305 // 2. If the loop is really small, then we interleave to reduce the loop
5306 // overhead.
5307 // 3. We don't interleave if we think that we will spill registers to memory
5308 // due to the increased register pressure.
5309
5311 return 1;
5312
5313 // Do not interleave if EVL is preferred and no User IC is specified.
5314 if (foldTailWithEVL()) {
5315 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5316 "Unroll factor forced to be 1.\n");
5317 return 1;
5318 }
5319
5320 // We used the distance for the interleave count.
5322 return 1;
5323
5324 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5325 const bool HasReductions = !Legal->getReductionVars().empty();
5326
5327 // If we did not calculate the cost for VF (because the user selected the VF)
5328 // then we calculate the cost of VF here.
5329 if (LoopCost == 0) {
5330 LoopCost = expectedCost(VF).first;
5331 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5332
5333 // Loop body is free and there is no need for interleaving.
5334 if (LoopCost == 0)
5335 return 1;
5336 }
5337
5339 // We divide by these constants so assume that we have at least one
5340 // instruction that uses at least one register.
5341 for (auto& pair : R.MaxLocalUsers) {
5342 pair.second = std::max(pair.second, 1U);
5343 }
5344
5345 // We calculate the interleave count using the following formula.
5346 // Subtract the number of loop invariants from the number of available
5347 // registers. These registers are used by all of the interleaved instances.
5348 // Next, divide the remaining registers by the number of registers that is
5349 // required by the loop, in order to estimate how many parallel instances
5350 // fit without causing spills. All of this is rounded down if necessary to be
5351 // a power of two. We want power of two interleave count to simplify any
5352 // addressing operations or alignment considerations.
5353 // We also want power of two interleave counts to ensure that the induction
5354 // variable of the vector loop wraps to zero, when tail is folded by masking;
5355 // this currently happens when optimizing for size, in which case IC is set to 1 above.
5356 unsigned IC = UINT_MAX;
5357
5358 for (auto& pair : R.MaxLocalUsers) {
5359 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5360 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5361 << " registers of "
5362 << TTI.getRegisterClassName(pair.first) << " register class\n");
5363 if (VF.isScalar()) {
5364 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5365 TargetNumRegisters = ForceTargetNumScalarRegs;
5366 } else {
5367 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5368 TargetNumRegisters = ForceTargetNumVectorRegs;
5369 }
5370 unsigned MaxLocalUsers = pair.second;
5371 unsigned LoopInvariantRegs = 0;
5372 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5373 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5374
5375 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5376 MaxLocalUsers);
5377 // Don't count the induction variable as interleaved.
5379 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5380 std::max(1U, (MaxLocalUsers - 1)));
5381 }
5382
5383 IC = std::min(IC, TmpIC);
5384 }
5385
5386 // Clamp the interleave ranges to reasonable counts.
5387 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5388
5389 // Check if the user has overridden the max.
5390 if (VF.isScalar()) {
5391 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5392 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5393 } else {
5394 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5395 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5396 }
5397
5398 unsigned EstimatedVF = VF.getKnownMinValue();
5399 if (VF.isScalable()) {
5400 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5401 EstimatedVF *= *VScale;
5402 }
5403 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5404
5405 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5406 if (KnownTC > 0) {
5407 // At least one iteration must be scalar when this constraint holds. So the
5408 // maximum available iterations for interleaving is one less.
5409 unsigned AvailableTC =
5410 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5411
5412 // If trip count is known we select between two prospective ICs, where
5413 // 1) the aggressive IC is capped by the trip count divided by VF
5414 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5415 // The final IC is selected in a way that the epilogue loop trip count is
5416 // minimized while maximizing the IC itself, so that we either run the
5417 // vector loop at least once if it generates a small epilogue loop, or else
5418 // we run the vector loop at least twice.
5419
5420 unsigned InterleaveCountUB = bit_floor(
5421 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5422 unsigned InterleaveCountLB = bit_floor(std::max(
5423 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5424 MaxInterleaveCount = InterleaveCountLB;
5425
5426 if (InterleaveCountUB != InterleaveCountLB) {
5427 unsigned TailTripCountUB =
5428 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5429 unsigned TailTripCountLB =
5430 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5431 // If both produce same scalar tail, maximize the IC to do the same work
5432 // in fewer vector loop iterations
5433 if (TailTripCountUB == TailTripCountLB)
5434 MaxInterleaveCount = InterleaveCountUB;
5435 }
5436 } else if (BestKnownTC && *BestKnownTC > 0) {
5437 // At least one iteration must be scalar when this constraint holds. So the
5438 // maximum available iterations for interleaving is one less.
5439 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5440 ? (*BestKnownTC) - 1
5441 : *BestKnownTC;
5442
5443 // If trip count is an estimated compile time constant, limit the
5444 // IC to be capped by the trip count divided by VF * 2, such that the vector
5445 // loop runs at least twice to make interleaving seem profitable when there
5446 // is an epilogue loop present. Since the exact trip count is not known, we
5447 // choose to be conservative in our IC estimate.
5448 MaxInterleaveCount = bit_floor(std::max(
5449 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5450 }
5451
5452 assert(MaxInterleaveCount > 0 &&
5453 "Maximum interleave count must be greater than 0");
5454
5455 // Clamp the calculated IC to be between the 1 and the max interleave count
5456 // that the target and trip count allows.
5457 if (IC > MaxInterleaveCount)
5458 IC = MaxInterleaveCount;
5459 else
5460 // Make sure IC is greater than 0.
5461 IC = std::max(1u, IC);
5462
5463 assert(IC > 0 && "Interleave count must be greater than 0.");
5464
5465 // Interleave if we vectorized this loop and there is a reduction that could
5466 // benefit from interleaving.
5467 if (VF.isVector() && HasReductions) {
5468 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5469 return IC;
5470 }
5471
5472 // For any scalar loop that either requires runtime checks or predication we
5473 // are better off leaving this to the unroller. Note that if we've already
5474 // vectorized the loop we will have done the runtime check and so interleaving
5475 // won't require further checks.
5476 bool ScalarInterleavingRequiresPredication =
5477 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5478 return Legal->blockNeedsPredication(BB);
5479 }));
5480 bool ScalarInterleavingRequiresRuntimePointerCheck =
5482
5483 // We want to interleave small loops in order to reduce the loop overhead and
5484 // potentially expose ILP opportunities.
5485 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5486 << "LV: IC is " << IC << '\n'
5487 << "LV: VF is " << VF << '\n');
5488 const bool AggressivelyInterleaveReductions =
5489 TTI.enableAggressiveInterleaving(HasReductions);
5490 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5491 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5492 // We assume that the cost overhead is 1 and we use the cost model
5493 // to estimate the cost of the loop and interleave until the cost of the
5494 // loop overhead is about 5% of the cost of the loop.
5495 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5496 SmallLoopCost / *LoopCost.getValue()));
5497
5498 // Interleave until store/load ports (estimated by max interleave count) are
5499 // saturated.
5500 unsigned NumStores = Legal->getNumStores();
5501 unsigned NumLoads = Legal->getNumLoads();
5502 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5503 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5504
5505 // There is little point in interleaving for reductions containing selects
5506 // and compares when VF=1 since it may just create more overhead than it's
5507 // worth for loops with small trip counts. This is because we still have to
5508 // do the final reduction after the loop.
5509 bool HasSelectCmpReductions =
5510 HasReductions &&
5511 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5512 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5513 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5514 RdxDesc.getRecurrenceKind());
5515 });
5516 if (HasSelectCmpReductions) {
5517 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5518 return 1;
5519 }
5520
5521 // If we have a scalar reduction (vector reductions are already dealt with
5522 // by this point), we can increase the critical path length if the loop
5523 // we're interleaving is inside another loop. For tree-wise reductions
5524 // set the limit to 2, and for ordered reductions it's best to disable
5525 // interleaving entirely.
5526 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5527 bool HasOrderedReductions =
5528 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5529 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5530 return RdxDesc.isOrdered();
5531 });
5532 if (HasOrderedReductions) {
5533 LLVM_DEBUG(
5534 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5535 return 1;
5536 }
5537
5538 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5539 SmallIC = std::min(SmallIC, F);
5540 StoresIC = std::min(StoresIC, F);
5541 LoadsIC = std::min(LoadsIC, F);
5542 }
5543
5545 std::max(StoresIC, LoadsIC) > SmallIC) {
5546 LLVM_DEBUG(
5547 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5548 return std::max(StoresIC, LoadsIC);
5549 }
5550
5551 // If there are scalar reductions and TTI has enabled aggressive
5552 // interleaving for reductions, we will interleave to expose ILP.
5553 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5554 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5555 // Interleave no less than SmallIC but not as aggressive as the normal IC
5556 // to satisfy the rare situation when resources are too limited.
5557 return std::max(IC / 2, SmallIC);
5558 } else {
5559 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5560 return SmallIC;
5561 }
5562 }
5563
5564 // Interleave if this is a large loop (small loops are already dealt with by
5565 // this point) that could benefit from interleaving.
5566 if (AggressivelyInterleaveReductions) {
5567 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5568 return IC;
5569 }
5570
5571 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5572 return 1;
5573}
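
// A standalone sketch of the register-pressure part of the interleave-count
// formula above: free registers after loop invariants, divided by the
// per-iteration demand, rounded down to a power of two and clamped to the
// target's maximum. Plain integers; the names are illustrative and not part
// of this pass.
#include <algorithm>

static unsigned powerOf2FloorIC(unsigned X) {  // Mirrors llvm::bit_floor.
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

unsigned interleaveCountSketch(unsigned TargetNumRegisters,
                               unsigned LoopInvariantRegs,
                               unsigned MaxLocalUsers,
                               unsigned MaxInterleaveCount) {
  unsigned Demand = std::max(1u, MaxLocalUsers);
  unsigned Free = TargetNumRegisters > LoopInvariantRegs
                      ? TargetNumRegisters - LoopInvariantRegs
                      : 1;
  unsigned IC = powerOf2FloorIC(std::max(1u, Free / Demand));
  return std::clamp(IC, 1u, MaxInterleaveCount);
}
// Example: 32 registers, 2 loop-invariant values, 5 live values per
// iteration and a target cap of 8 -> floor((32 - 2) / 5) = 6, power-of-two
// floor 4, so IC = 4.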
5574
5577 // This function calculates the register usage by measuring the highest number
5578 // of values that are alive at a single location. Obviously, this is a very
5579 // rough estimation. We scan the loop in topological order and
5580 // assign a number to each instruction. We use RPO to ensure that defs are
5581 // met before their users. We assume that each instruction that has in-loop
5582 // users starts an interval. We record every time that an in-loop value is
5583 // used, so we have a list of the first and last occurrences of each
5584 // instruction. Next, we transpose this data structure into a multi map that
5585 // holds the list of intervals that *end* at a specific location. This multi
5586 // map allows us to perform a linear search. We scan the instructions linearly
5587 // and record each time that a new interval starts, by placing it in a set.
5588 // If we find this value in the multi-map then we remove it from the set.
5589 // The max register usage is the maximum size of the set.
5590 // We also search for instructions that are defined outside the loop, but are
5591 // used inside the loop. We need this number separately from the max-interval
5592 // usage number because when we unroll, loop-invariant values do not take
5593 // more registers.
5595 DFS.perform(LI);
5596
5597 RegisterUsage RU;
5598
5599 // Each 'key' in the map opens a new interval. The values
5600 // of the map are the index of the 'last seen' usage of the
5601 // instruction that is the key.
5603
5604 // Maps instruction to its index.
5606 // Marks the end of each interval.
5607 IntervalMap EndPoint;
5608 // Saves the list of instruction indices that are used in the loop.
5610 // Saves the list of values that are used in the loop but are defined outside
5611 // the loop (not including non-instruction values such as arguments and
5612 // constants).
5613 SmallSetVector<Instruction *, 8> LoopInvariants;
5614
5615 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5616 for (Instruction &I : BB->instructionsWithoutDebug()) {
5617 IdxToInstr.push_back(&I);
5618
5619 // Save the end location of each USE.
5620 for (Value *U : I.operands()) {
5621 auto *Instr = dyn_cast<Instruction>(U);
5622
5623 // Ignore non-instruction values such as arguments, constants, etc.
5624 // FIXME: Might need some motivation why these values are ignored. If
5625 // for example an argument is used inside the loop it will increase the
5626 // register pressure (so shouldn't we add it to LoopInvariants).
5627 if (!Instr)
5628 continue;
5629
5630 // If this instruction is outside the loop then record it and continue.
5631 if (!TheLoop->contains(Instr)) {
5632 LoopInvariants.insert(Instr);
5633 continue;
5634 }
5635
5636 // Overwrite previous end points.
5637 EndPoint[Instr] = IdxToInstr.size();
5638 Ends.insert(Instr);
5639 }
5640 }
5641 }
5642
5643 // Saves the list of intervals that end with the index in 'key'.
5644 using InstrList = SmallVector<Instruction *, 2>;
5645 DenseMap<unsigned, InstrList> TransposeEnds;
5646
5647 // Transpose the EndPoints to a list of values that end at each index.
5648 for (auto &Interval : EndPoint)
5649 TransposeEnds[Interval.second].push_back(Interval.first);
5650
5651 SmallPtrSet<Instruction *, 8> OpenIntervals;
5654
5655 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5656
5657 const auto &TTICapture = TTI;
5658 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5659 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5660 return 0;
5661 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5662 };
5663
5664 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5665 Instruction *I = IdxToInstr[i];
5666
5667 // Remove all of the instructions that end at this location.
5668 InstrList &List = TransposeEnds[i];
5669 for (Instruction *ToRemove : List)
5670 OpenIntervals.erase(ToRemove);
5671
5672 // Ignore instructions that are never used within the loop.
5673 if (!Ends.count(I))
5674 continue;
5675
5676 // Skip ignored values.
5677 if (ValuesToIgnore.count(I))
5678 continue;
5679
5681
5682 // For each VF find the maximum usage of registers.
5683 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5684 // Count the number of registers used, per register class, given all open
5685 // intervals.
5686 // Note that elements in this SmallMapVector will be default constructed
5687 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5688 // there is no previous entry for ClassID.
5690
5691 if (VFs[j].isScalar()) {
5692 for (auto *Inst : OpenIntervals) {
5693 unsigned ClassID =
5694 TTI.getRegisterClassForType(false, Inst->getType());
5695 // FIXME: The target might use more than one register for the type
5696 // even in the scalar case.
5697 RegUsage[ClassID] += 1;
5698 }
5699 } else {
5701 for (auto *Inst : OpenIntervals) {
5702 // Skip ignored values for VF > 1.
5703 if (VecValuesToIgnore.count(Inst))
5704 continue;
5705 if (isScalarAfterVectorization(Inst, VFs[j])) {
5706 unsigned ClassID =
5707 TTI.getRegisterClassForType(false, Inst->getType());
5708 // FIXME: The target might use more than one register for the type
5709 // even in the scalar case.
5710 RegUsage[ClassID] += 1;
5711 } else {
5712 unsigned ClassID =
5713 TTI.getRegisterClassForType(true, Inst->getType());
5714 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5715 }
5716 }
5717 }
5718
5719 for (auto& pair : RegUsage) {
5720 auto &Entry = MaxUsages[j][pair.first];
5721 Entry = std::max(Entry, pair.second);
5722 }
5723 }
5724
5725 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5726 << OpenIntervals.size() << '\n');
5727
5728 // Add the current instruction to the list of open intervals.
5729 OpenIntervals.insert(I);
5730 }
5731
5732 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5733 // Note that elements in this SmallMapVector will be default constructed
5734 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5735 // there is no previous entry for ClassID.
5737
5738 for (auto *Inst : LoopInvariants) {
5739 // FIXME: The target might use more than one register for the type
5740 // even in the scalar case.
5741 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5742 auto *I = cast<Instruction>(U);
5743 return TheLoop != LI->getLoopFor(I->getParent()) ||
5744 isScalarAfterVectorization(I, VFs[i]);
5745 });
5746
5747 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5748 unsigned ClassID =
5749 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5750 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5751 }
5752
5753 LLVM_DEBUG({
5754 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5755 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5756 << " item\n";
5757 for (const auto &pair : MaxUsages[i]) {
5758 dbgs() << "LV(REG): RegisterClass: "
5759 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5760 << " registers\n";
5761 }
5762 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5763 << " item\n";
5764 for (const auto &pair : Invariant) {
5765 dbgs() << "LV(REG): RegisterClass: "
5766 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5767 << " registers\n";
5768 }
5769 });
5770
5771 RU.LoopInvariantRegs = Invariant;
5772 RU.MaxLocalUsers = MaxUsages[i];
5773 RUs[i] = RU;
5774 }
5775
5776 return RUs;
5777}
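
// A compact sketch of the live-interval sweep above: each value occupies an
// interval [DefIdx, LastUseIdx], and the peak number of simultaneously open
// intervals approximates register pressure. This quadratic scan is for
// illustration only; the pass does the same sweep in one pass using a
// transposed "intervals ending at this index" map. Types and names here are
// illustrative.
#include <algorithm>
#include <utility>
#include <vector>

unsigned maxLiveValues(
    const std::vector<std::pair<unsigned, unsigned>> &Intervals,
    unsigned NumInstructions) {
  unsigned MaxLive = 0;
  for (unsigned Idx = 0; Idx < NumInstructions; ++Idx) {
    unsigned Live = 0;
    for (const auto &Interval : Intervals)
      if (Interval.first <= Idx && Idx <= Interval.second)
        ++Live;
    MaxLive = std::max(MaxLive, Live);
  }
  return MaxLive;
}
// Example: intervals {0,3}, {1,2}, {2,5} over 6 instruction slots peak at 3
// simultaneously live values (at index 2).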
5778
5779bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5780 ElementCount VF) {
5781 // TODO: Cost model for emulated masked load/store is completely
5782 // broken. This hack guides the cost model to use an artificially
5783 // high enough value to practically disable vectorization with such
5784 // operations, except where previously deployed legality hack allowed
5785 // using very low cost values. This is to avoid regressions coming simply
5786 // from moving "masked load/store" check from legality to cost model.
5787 // Masked Load/Gather emulation was previously never allowed.
5788 // A limited amount of masked store/scatter emulation was allowed.
5790 "Expecting a scalar emulated instruction");
5791 return isa<LoadInst>(I) ||
5792 (isa<StoreInst>(I) &&
5793 NumPredStores > NumberOfStoresToPredicate);
5794}
5795
5797 // If we aren't vectorizing the loop, or if we've already collected the
5798 // instructions to scalarize, there's nothing to do. Collection may already
5799 // have occurred if we have a user-selected VF and are now computing the
5800 // expected cost for interleaving.
5801 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5802 return;
5803
5804 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5805 // not profitable to scalarize any instructions, the presence of VF in the
5806 // map will indicate that we've analyzed it already.
5807 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5808
5809 PredicatedBBsAfterVectorization[VF].clear();
5810
5811 // Find all the instructions that are scalar with predication in the loop and
5812 // determine if it would be better to not if-convert the blocks they are in.
5813 // If so, we also record the instructions to scalarize.
5814 for (BasicBlock *BB : TheLoop->blocks()) {
5816 continue;
5817 for (Instruction &I : *BB)
5818 if (isScalarWithPredication(&I, VF)) {
5819 ScalarCostsTy ScalarCosts;
5820 // Do not apply discount if scalable, because that would lead to
5821 // invalid scalarization costs.
5822 // Do not apply discount logic if hacked cost is needed
5823 // for emulated masked memrefs.
5824 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5825 !useEmulatedMaskMemRefHack(&I, VF) &&
5826 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5827 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5828 // Remember that BB will remain after vectorization.
5829 PredicatedBBsAfterVectorization[VF].insert(BB);
5830 }
5831 }
5832}
5833
5834InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5835 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5836 assert(!isUniformAfterVectorization(PredInst, VF) &&
5837 "Instruction marked uniform-after-vectorization will be predicated");
5838
5839 // Initialize the discount to zero, meaning that the scalar version and the
5840 // vector version cost the same.
5841 InstructionCost Discount = 0;
5842
5843 // Holds instructions to analyze. The instructions we visit are mapped in
5844 // ScalarCosts. Those instructions are the ones that would be scalarized if
5845 // we find that the scalar version costs less.
5847
5848 // Returns true if the given instruction can be scalarized.
5849 auto canBeScalarized = [&](Instruction *I) -> bool {
5850 // We only attempt to scalarize instructions forming a single-use chain
5851 // from the original predicated block that would otherwise be vectorized.
5852 // Although not strictly necessary, we give up on instructions we know will
5853 // already be scalar to avoid traversing chains that are unlikely to be
5854 // beneficial.
5855 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5857 return false;
5858
5859 // If the instruction is scalar with predication, it will be analyzed
5860 // separately. We ignore it within the context of PredInst.
5861 if (isScalarWithPredication(I, VF))
5862 return false;
5863
5864 // If any of the instruction's operands are uniform after vectorization,
5865 // the instruction cannot be scalarized. This prevents, for example, a
5866 // masked load from being scalarized.
5867 //
5868 // We assume we will only emit a value for lane zero of an instruction
5869 // marked uniform after vectorization, rather than VF identical values.
5870 // Thus, if we scalarize an instruction that uses a uniform, we would
5871 // create uses of values corresponding to the lanes we aren't emitting code
5872 // for. This behavior can be changed by allowing getScalarValue to clone
5873 // the lane zero values for uniforms rather than asserting.
5874 for (Use &U : I->operands())
5875 if (auto *J = dyn_cast<Instruction>(U.get()))
5876 if (isUniformAfterVectorization(J, VF))
5877 return false;
5878
5879 // Otherwise, we can scalarize the instruction.
5880 return true;
5881 };
5882
5883 // Compute the expected cost discount from scalarizing the entire expression
5884 // feeding the predicated instruction. We currently only consider expressions
5885 // that are single-use instruction chains.
5886 Worklist.push_back(PredInst);
5887 while (!Worklist.empty()) {
5888 Instruction *I = Worklist.pop_back_val();
5889
5890 // If we've already analyzed the instruction, there's nothing to do.
5891 if (ScalarCosts.contains(I))
5892 continue;
5893
5894 // Compute the cost of the vector instruction. Note that this cost already
5895 // includes the scalarization overhead of the predicated instruction.
5896 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5897
5898 // Compute the cost of the scalarized instruction. This cost is the cost of
5899 // the instruction as if it wasn't if-converted and instead remained in the
5900 // predicated block. We will scale this cost by block probability after
5901 // computing the scalarization overhead.
5902 InstructionCost ScalarCost =
5903 VF.getFixedValue() *
5904 getInstructionCost(I, ElementCount::getFixed(1)).first;
5905
5906 // Compute the scalarization overhead of needed insertelement instructions
5907 // and phi nodes.
5909 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5910 ScalarCost += TTI.getScalarizationOverhead(
5911 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5912 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5913 /*Extract*/ false, CostKind);
5914 ScalarCost +=
5915 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5916 }
5917
5918 // Compute the scalarization overhead of needed extractelement
5919 // instructions. For each of the instruction's operands, if the operand can
5920 // be scalarized, add it to the worklist; otherwise, account for the
5921 // overhead.
5922 for (Use &U : I->operands())
5923 if (auto *J = dyn_cast<Instruction>(U.get())) {
5924 assert(VectorType::isValidElementType(J->getType()) &&
5925 "Instruction has non-scalar type");
5926 if (canBeScalarized(J))
5927 Worklist.push_back(J);
5928 else if (needsExtract(J, VF)) {
5929 ScalarCost += TTI.getScalarizationOverhead(
5930 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5931 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5932 /*Extract*/ true, CostKind);
5933 }
5934 }
5935
5936 // Scale the total scalar cost by block probability.
5937 ScalarCost /= getReciprocalPredBlockProb();
5938
5939 // Compute the discount. A non-negative discount means the vector version
5940 // of the instruction costs more, and scalarizing would be beneficial.
5941 Discount += VectorCost - ScalarCost;
5942 ScalarCosts[I] = ScalarCost;
5943 }
5944
5945 return Discount;
5946}
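
// A worked sketch of the discount computed above: the vector cost of the
// chain minus its scalarized cost, where the scalarized cost (per-lane cost
// times VF plus insert/extract overhead) is divided by the reciprocal block
// probability because the predicated block does not execute on every
// iteration. The reciprocal probability of 2 in the example matches the
// pass's assumption of a 50% block probability; the names are illustrative.
int scalarizationDiscountSketch(unsigned VectorCost, unsigned ScalarCostPerLane,
                                unsigned VF, unsigned ScalarizationOverhead,
                                unsigned ReciprocalBlockProb) {
  unsigned ScalarCost =
      (ScalarCostPerLane * VF + ScalarizationOverhead) / ReciprocalBlockProb;
  // A positive discount means the scalarized, predicated form is expected to
  // be cheaper than the vectorized form.
  return static_cast<int>(VectorCost) - static_cast<int>(ScalarCost);
}
// Example: VectorCost=20, ScalarCostPerLane=4, VF=4, overhead=8 and a
// reciprocal probability of 2 -> ScalarCost = (16 + 8) / 2 = 12, so the
// discount is 20 - 12 = 8 > 0 and the chain would be scalarized.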
5947
5952
5953 // For each block.
5954 for (BasicBlock *BB : TheLoop->blocks()) {
5955 VectorizationCostTy BlockCost;
5956
5957 // For each instruction in the old loop.
5958 for (Instruction &I : BB->instructionsWithoutDebug()) {
5959 // Skip ignored values.
5960 if (ValuesToIgnore.count(&I) ||
5961 (VF.isVector() && VecValuesToIgnore.count(&I)))
5962 continue;
5963
5964 VectorizationCostTy C = getInstructionCost(&I, VF);
5965
5966 // Check if we should override the cost.
5967 if (C.first.isValid() &&
5968 ForceTargetInstructionCost.getNumOccurrences() > 0)
5970
5971 // Keep a list of instructions with invalid costs.
5972 if (Invalid && !C.first.isValid())
5973 Invalid->emplace_back(&I, VF);
5974
5975 BlockCost.first += C.first;
5976 BlockCost.second |= C.second;
5977 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5978 << " for VF " << VF << " For instruction: " << I
5979 << '\n');
5980 }
5981
5982 // If we are vectorizing a predicated block, it will have been
5983 // if-converted. This means that the block's instructions (aside from
5984 // stores and instructions that may divide by zero) will now be
5985 // unconditionally executed. For the scalar case, we may not always execute
5986 // the predicated block, if it is an if-else block. Thus, scale the block's
5987 // cost by the probability of executing it. blockNeedsPredication from
5988 // Legal is used so as to not include all blocks in tail folded loops.
5989 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5990 BlockCost.first /= getReciprocalPredBlockProb();
5991
5992 Cost.first += BlockCost.first;
5993 Cost.second |= BlockCost.second;
5994 }
5995
5996 return Cost;
5997}
5998
5999/// Gets Address Access SCEV after verifying that the access pattern
6000/// is loop invariant except for the induction variable dependence.
6001///
6002/// This SCEV can be sent to the Target in order to estimate the address
6003/// calculation cost.
6005 Value *Ptr,
6008 const Loop *TheLoop) {
6009
6010 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6011 if (!Gep)
6012 return nullptr;
6013
6014 // We are looking for a gep with all loop invariant indices except for one
6015 // which should be an induction variable.
6016 auto SE = PSE.getSE();
6017 unsigned NumOperands = Gep->getNumOperands();
6018 for (unsigned i = 1; i < NumOperands; ++i) {
6019 Value *Opd = Gep->getOperand(i);
6020 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6021 !Legal->isInductionVariable(Opd))
6022 return nullptr;
6023 }
6024
6025 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6026 return PSE.getSCEV(Ptr);
6027}
6028
6030LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6031 ElementCount VF) {
6032 assert(VF.isVector() &&
6033 "Scalarization cost of instruction implies vectorization.");
6034 if (VF.isScalable())
6036
6037 Type *ValTy = getLoadStoreType(I);
6038 auto SE = PSE.getSE();
6039
6040 unsigned AS = getLoadStoreAddressSpace(I);
6042 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6043 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6044 // that it is being called from this specific place.
6045
6046 // Figure out whether the access is strided and get the stride value
6047 // if it's known at compile time.
6048 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6049
6050 // Get the cost of the scalar memory instruction and address computation.
6052 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6053
6054 // Don't pass *I here, since it is scalar but will actually be part of a
6055 // vectorized loop where the user of it is a vectorized instruction.
6057 const Align Alignment = getLoadStoreAlignment(I);
6058 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6059 ValTy->getScalarType(),
6060 Alignment, AS, CostKind);
6061
6062 // Get the overhead of the extractelement and insertelement instructions
6063 // we might create due to scalarization.
6064 Cost += getScalarizationOverhead(I, VF, CostKind);
6065
6066 // If we have a predicated load/store, it will need extra i1 extracts and
6067 // conditional branches, but may not be executed for each vector lane. Scale
6068 // the cost by the probability of executing the predicated block.
6069 if (isPredicatedInst(I)) {
6071
6072 // Add the cost of an i1 extract and a branch
6073 auto *Vec_i1Ty =
6076 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6077 /*Insert=*/false, /*Extract=*/true, CostKind);
6078 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6079
6080 if (useEmulatedMaskMemRefHack(I, VF))
6081 // Artificially setting to a high enough value to practically disable
6082 // vectorization with such operations.
6083 Cost = 3000000;
6084 }
6085
6086 return Cost;
6087}
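// To see how the pieces above combine, here is a minimal standalone toy example
// of the scalarized memory-op cost shape (hypothetical per-unit costs; the real
// numbers come from the TTI queries, not from these constants):
//
//   #include <cstdio>
//
//   int main() {
//     unsigned VF = 4;
//     unsigned AddrCost = 1, ScalarMemCost = 4, VecInsExtCost = 1, BrCost = 1;
//     bool Predicated = true;
//
//     unsigned Cost = VF * (AddrCost + ScalarMemCost); // VF scalar accesses
//     Cost += 2 * VF * VecInsExtCost;                  // extract/insert overhead
//     if (Predicated) {
//       Cost /= 2;                             // block may not execute per lane
//       Cost += VF * (VecInsExtCost + BrCost); // i1 extracts + branches
//     }
//     std::printf("scalarized cost at VF=%u: %u\n", VF, Cost);
//   }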
6088
6089InstructionCost
6090LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6091 ElementCount VF) {
6092 Type *ValTy = getLoadStoreType(I);
6093 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6095 unsigned AS = getLoadStoreAddressSpace(I);
6096 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6098
6099 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6100 "Stride should be 1 or -1 for consecutive memory access");
6101 const Align Alignment = getLoadStoreAlignment(I);
6103 if (Legal->isMaskRequired(I)) {
6104 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6105 CostKind);
6106 } else {
6107 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6108 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6109 CostKind, OpInfo, I);
6110 }
6111
6112 bool Reverse = ConsecutiveStride < 0;
6113 if (Reverse)
6114 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6115 std::nullopt, CostKind, 0);
6116 return Cost;
6117}
6118
6119InstructionCost
6120LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6121 ElementCount VF) {
6122 assert(Legal->isUniformMemOp(*I, VF));
6123
6124 Type *ValTy = getLoadStoreType(I);
6125 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6126 const Align Alignment = getLoadStoreAlignment(I);
6127 unsigned AS = getLoadStoreAddressSpace(I);
6129 if (isa<LoadInst>(I)) {
6130 return TTI.getAddressComputationCost(ValTy) +
6131 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6132 CostKind) +
6134 }
6135 StoreInst *SI = cast<StoreInst>(I);
6136
6137 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6138 return TTI.getAddressComputationCost(ValTy) +
6139 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6140 CostKind) +
6141 (isLoopInvariantStoreValue
6142 ? 0
6143 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6144 CostKind, VF.getKnownMinValue() - 1));
6145}
6146
6147InstructionCost
6148LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6149 ElementCount VF) {
6150 Type *ValTy = getLoadStoreType(I);
6151 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6152 const Align Alignment = getLoadStoreAlignment(I);
6154
6155 return TTI.getAddressComputationCost(VectorTy) +
6157 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6159}
6160
6161InstructionCost
6162LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6163 ElementCount VF) {
6164 Type *ValTy = getLoadStoreType(I);
6165 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6166 unsigned AS = getLoadStoreAddressSpace(I);
6168
6169 auto Group = getInterleavedAccessGroup(I);
6170 assert(Group && "Fail to get an interleaved access group.");
6171
6172 unsigned InterleaveFactor = Group->getFactor();
6173 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6174
6175 // Holds the indices of existing members in the interleaved group.
6177 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6178 if (Group->getMember(IF))
6179 Indices.push_back(IF);
6180
6181 // Calculate the cost of the whole interleaved group.
6182 bool UseMaskForGaps =
6183 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6184 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6186 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6187 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6188
6189 if (Group->isReverse()) {
6190 // TODO: Add support for reversed masked interleaved access.
6192 "Reverse masked interleaved access not supported.");
6193 Cost += Group->getNumMembers() *
6195 std::nullopt, CostKind, 0);
6196 }
6197 return Cost;
6198}
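// A minimal standalone toy example of the member-index collection and gap-mask
// condition used above (factor, member positions, and flags are made up):
//
//   #include <cstdio>
//   #include <vector>
//
//   int main() {
//     unsigned Factor = 3;
//     bool HasMember[3] = {true, false, true}; // group with a gap at index 1
//     bool IsStore = true;
//     bool RequiresScalarEpilogue = false, ScalarEpilogueAllowed = true;
//
//     std::vector<unsigned> Indices;
//     for (unsigned IF = 0; IF < Factor; IF++)
//       if (HasMember[IF])
//         Indices.push_back(IF);               // Indices == {0, 2}
//
//     bool UseMaskForGaps =
//         (RequiresScalarEpilogue && !ScalarEpilogueAllowed) ||
//         (IsStore && Indices.size() < Factor);
//     std::printf("members=%zu UseMaskForGaps=%d\n", Indices.size(), UseMaskForGaps);
//   }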
6199
6200std::optional<InstructionCost>
6201LoopVectorizationCostModel::getReductionPatternCost(
6202 Instruction *I, ElementCount VF, Type *Ty,
6204 using namespace llvm::PatternMatch;
6205 // Early exit for no inloop reductions
6206 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6207 return std::nullopt;
6208 auto *VectorTy = cast<VectorType>(Ty);
6209
6210 // We are looking for one of the following patterns, finding the minimal acceptable cost:
6211 // reduce(mul(ext(A), ext(B))) or
6212 // reduce(mul(A, B)) or
6213 // reduce(ext(A)) or
6214 // reduce(A).
6215 // The basic idea is that we walk down the tree to do that, finding the root
6216 // reduction instruction in InLoopReductionImmediateChains. From there we find
6217 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6218 // of the components. If the reduction cost is lower, then we return it for the
6219 // reduction instruction and 0 for the other instructions in the pattern. If
6220 // it is not, we return an invalid cost specifying that the original cost method
6221 // should be used.
6222 Instruction *RetI = I;
6223 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6224 if (!RetI->hasOneUser())
6225 return std::nullopt;
6226 RetI = RetI->user_back();
6227 }
6228
6229 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6230 RetI->user_back()->getOpcode() == Instruction::Add) {
6231 RetI = RetI->user_back();
6232 }
6233
6234 // Test if the found instruction is a reduction, and if not return an invalid
6235 // cost specifying the parent to use the original cost modelling.
6236 if (!InLoopReductionImmediateChains.count(RetI))
6237 return std::nullopt;
6238
6239 // Find the reduction this chain is a part of and calculate the basic cost of
6240 // the reduction on its own.
6241 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6242 Instruction *ReductionPhi = LastChain;
6243 while (!isa<PHINode>(ReductionPhi))
6244 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6245
6246 const RecurrenceDescriptor &RdxDesc =
6247 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6248
6250 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6251
6252 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6253 // normal fmul instruction to the cost of the fadd reduction.
6254 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6255 BaseCost +=
6256 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6257
6258 // If we're using ordered reductions then we can just return the base cost
6259 // here, since getArithmeticReductionCost calculates the full ordered
6260 // reduction cost when FP reassociation is not allowed.
6261 if (useOrderedReductions(RdxDesc))
6262 return BaseCost;
6263
6264 // Get the operand that was not the reduction chain and match it to one of the
6265 // patterns, returning the better cost if it is found.
6266 Instruction *RedOp = RetI->getOperand(1) == LastChain
6267 ? dyn_cast<Instruction>(RetI->getOperand(0))
6268 : dyn_cast<Instruction>(RetI->getOperand(1));
6269
6270 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6271
6272 Instruction *Op0, *Op1;
6273 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6274 match(RedOp,
6276 match(Op0, m_ZExtOrSExt(m_Value())) &&
6277 Op0->getOpcode() == Op1->getOpcode() &&
6278 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6280 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6281
6282 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6283 // Note that the extend opcodes need to all match, or if A==B they will have
6284 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6285 // which is equally fine.
6286 bool IsUnsigned = isa<ZExtInst>(Op0);
6287 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6288 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6289
6290 InstructionCost ExtCost =
6291 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6293 InstructionCost MulCost =
6294 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6295 InstructionCost Ext2Cost =
6296 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6298
6300 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6301
6302 if (RedCost.isValid() &&
6303 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6304 return I == RetI ? RedCost : 0;
6305 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6306 !TheLoop->isLoopInvariant(RedOp)) {
6307 // Matched reduce(ext(A))
6308 bool IsUnsigned = isa<ZExtInst>(RedOp);
6309 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6311 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6312 RdxDesc.getFastMathFlags(), CostKind);
6313
6314 InstructionCost ExtCost =
6315 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6317 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6318 return I == RetI ? RedCost : 0;
6319 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6320 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6321 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6322 Op0->getOpcode() == Op1->getOpcode() &&
6324 bool IsUnsigned = isa<ZExtInst>(Op0);
6325 Type *Op0Ty = Op0->getOperand(0)->getType();
6326 Type *Op1Ty = Op1->getOperand(0)->getType();
6327 Type *LargestOpTy =
6328 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6329 : Op0Ty;
6330 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6331
6332 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6333 // different sizes. We take the largest type as the ext to reduce, and add
6334 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6336 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6339 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6341 InstructionCost MulCost =
6342 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6343
6345 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6346 InstructionCost ExtraExtCost = 0;
6347 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6348 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6349 ExtraExtCost = TTI.getCastInstrCost(
6350 ExtraExtOp->getOpcode(), ExtType,
6351 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6353 }
6354
6355 if (RedCost.isValid() &&
6356 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6357 return I == RetI ? RedCost : 0;
6358 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6359 // Matched reduce.add(mul())
6360 InstructionCost MulCost =
6361 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6362
6364 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6365
6366 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6367 return I == RetI ? RedCost : 0;
6368 }
6369 }
6370
6371 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6372}
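// The decision above boils down to comparing the cost of a fused reduction
// against the summed cost of the operations it would replace. A minimal
// standalone toy example with made-up costs (simplified: one ext cost per
// operand, no second extend):
//
//   #include <cstdio>
//
//   int main() {
//     // Hypothetical TTI costs for reduce.add(mul(ext(A), ext(B))).
//     unsigned ExtCost = 1, MulCost = 2, BaseReductionCost = 4;
//     unsigned FusedMulAccCost = 5;   // e.g. a dot-product style reduction
//
//     unsigned ComponentCost = 2 * ExtCost + MulCost + BaseReductionCost;
//     bool UseFusedPattern = FusedMulAccCost < ComponentCost;
//     // If the fused pattern wins, the reduction instruction is charged
//     // FusedMulAccCost and the ext/mul instructions are charged 0.
//     std::printf("fused=%u components=%u -> use fused: %d\n",
//                 FusedMulAccCost, ComponentCost, UseFusedPattern);
//   }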
6373
6374InstructionCost
6375LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6376 ElementCount VF) {
6377 // Calculate scalar cost only. Vectorization cost should be ready at this
6378 // moment.
6379 if (VF.isScalar()) {
6380 Type *ValTy = getLoadStoreType(I);
6381 const Align Alignment = getLoadStoreAlignment(I);
6382 unsigned AS = getLoadStoreAddressSpace(I);
6383
6384 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6385 return TTI.getAddressComputationCost(ValTy) +
6386 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6387 TTI::TCK_RecipThroughput, OpInfo, I);
6388 }
6389 return getWideningCost(I, VF);
6390}
6391
6392LoopVectorizationCostModel::VectorizationCostTy
6393LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6394 ElementCount VF) {
6395 // If we know that this instruction will remain uniform, check the cost of
6396 // the scalar version.
6398 VF = ElementCount::getFixed(1);
6399
6400 if (VF.isVector() && isProfitableToScalarize(I, VF))
6401 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6402
6403 // Forced scalars do not have any scalarization overhead.
6404 auto ForcedScalar = ForcedScalars.find(VF);
6405 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6406 auto InstSet = ForcedScalar->second;
6407 if (InstSet.count(I))
6408 return VectorizationCostTy(
6409 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6410 VF.getKnownMinValue()),
6411 false);
6412 }
6413
6414 Type *VectorTy;
6415 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6416
6417 bool TypeNotScalarized = false;
6418 if (VF.isVector() && VectorTy->isVectorTy()) {
6419 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6420 if (VF.isScalable())
6421 // <vscale x 1 x iN> is assumed to be profitable over iN because
6422 // scalable registers are a distinct register class from scalar ones.
6423 // If we ever find a target which wants to lower scalable vectors
6424 // back to scalars, we'll need to update this code to explicitly
6425 // ask TTI about the register class uses for each part.
6426 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6427 else
6428 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6429 } else
6431 }
6432 return VectorizationCostTy(C, TypeNotScalarized);
6433}
6434
6435InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6437
6438 // There is no mechanism yet to create a scalable scalarization loop,
6439 // so this is currently Invalid.
6440 if (VF.isScalable())
6441 return InstructionCost::getInvalid();
6442
6443 if (VF.isScalar())
6444 return 0;
6445
6447 Type *RetTy = ToVectorTy(I->getType(), VF);
6448 if (!RetTy->isVoidTy() &&
6449 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6451 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6452 /*Insert*/ true,
6453 /*Extract*/ false, CostKind);
6454
6455 // Some targets keep addresses scalar.
6456 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6457 return Cost;
6458
6459 // Some targets support efficient element stores.
6460 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6461 return Cost;
6462
6463 // Collect operands to consider.
6464 CallInst *CI = dyn_cast<CallInst>(I);
6465 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6466
6467 // Skip operands that do not require extraction/scalarization and do not incur
6468 // any overhead.
6470 for (auto *V : filterExtractingOperands(Ops, VF))
6471 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6473 filterExtractingOperands(Ops, VF), Tys, CostKind);
6474}
6475
6476void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6477 if (VF.isScalar())
6478 return;
6479 NumPredStores = 0;
6480 for (BasicBlock *BB : TheLoop->blocks()) {
6481 // For each instruction in the old loop.
6482 for (Instruction &I : *BB) {
6484 if (!Ptr)
6485 continue;
6486
6487 // TODO: We should generate better code and update the cost model for
6488 // predicated uniform stores. Today they are treated as any other
6489 // predicated store (see added test cases in
6490 // invariant-store-vectorization.ll).
6491 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6492 NumPredStores++;
6493
6494 if (Legal->isUniformMemOp(I, VF)) {
6495 auto isLegalToScalarize = [&]() {
6496 if (!VF.isScalable())
6497 // Scalarization of fixed length vectors "just works".
6498 return true;
6499
6500 // We have dedicated lowering for unpredicated uniform loads and
6501 // stores. Note that even with tail folding we know that at least
6502 // one lane is active (i.e. generalized predication is not possible
6503 // here), and the logic below depends on this fact.
6504 if (!foldTailByMasking())
6505 return true;
6506
6507 // For scalable vectors, a uniform memop load is always
6508 // uniform-by-parts and we know how to scalarize that.
6509 if (isa<LoadInst>(I))
6510 return true;
6511
6512 // A uniform store isn't necessarily uniform-by-parts
6513 // and we can't assume scalarization.
6514 auto &SI = cast<StoreInst>(I);
6515 return TheLoop->isLoopInvariant(SI.getValueOperand());
6516 };
6517
6518 const InstructionCost GatherScatterCost =
6520 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6521
6522 // Load: Scalar load + broadcast
6523 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6524 // FIXME: This cost is a significant under-estimate for tail folded
6525 // memory ops.
6526 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6527 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6528
6529 // Choose the better solution for the current VF. Note that Invalid
6530 // costs compare as maximally large. If both are invalid, we get an
6531 // invalid cost, which signals a failure and a vectorization abort.
6532 if (GatherScatterCost < ScalarizationCost)
6533 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6534 else
6535 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6536 continue;
6537 }
6538
6539 // We assume that widening is the best solution when possible.
6540 if (memoryInstructionCanBeWidened(&I, VF)) {
6541 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6542 int ConsecutiveStride = Legal->isConsecutivePtr(
6544 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6545 "Expected consecutive stride.");
6546 InstWidening Decision =
6547 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6548 setWideningDecision(&I, VF, Decision, Cost);
6549 continue;
6550 }
6551
6552 // Choose between Interleaving, Gather/Scatter or Scalarization.
6554 unsigned NumAccesses = 1;
6555 if (isAccessInterleaved(&I)) {
6556 auto Group = getInterleavedAccessGroup(&I);
6557 assert(Group && "Fail to get an interleaved access group.");
6558
6559 // Make one decision for the whole group.
6560 if (getWideningDecision(&I, VF) != CM_Unknown)
6561 continue;
6562
6563 NumAccesses = Group->getNumMembers();
6565 InterleaveCost = getInterleaveGroupCost(&I, VF);
6566 }
6567
6568 InstructionCost GatherScatterCost =
6570 ? getGatherScatterCost(&I, VF) * NumAccesses
6572
6573 InstructionCost ScalarizationCost =
6574 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6575
6576 // Choose the better solution for the current VF, record this
6577 // decision, and use it during vectorization.
6579 InstWidening Decision;
6580 if (InterleaveCost <= GatherScatterCost &&
6581 InterleaveCost < ScalarizationCost) {
6582 Decision = CM_Interleave;
6583 Cost = InterleaveCost;
6584 } else if (GatherScatterCost < ScalarizationCost) {
6585 Decision = CM_GatherScatter;
6586 Cost = GatherScatterCost;
6587 } else {
6588 Decision = CM_Scalarize;
6589 Cost = ScalarizationCost;
6590 }
6591 // If the instruction belongs to an interleave group, the whole group
6592 // receives the same decision. The whole group receives the cost, but
6593 // the cost will actually be assigned to one instruction.
6594 if (auto Group = getInterleavedAccessGroup(&I))
6595 setWideningDecision(Group, VF, Decision, Cost);
6596 else
6597 setWideningDecision(&I, VF, Decision, Cost);
6598 }
6599 }
6600
6601 // Make sure that any load of address and any other address computation
6602 // remains scalar unless there is gather/scatter support. This avoids
6603 // inevitable extracts into address registers, and also has the benefit of
6604 // activating LSR more, since that pass can't optimize vectorized
6605 // addresses.
6607 return;
6608
6609 // Start with all scalar pointer uses.
6611 for (BasicBlock *BB : TheLoop->blocks())
6612 for (Instruction &I : *BB) {
6613 Instruction *PtrDef =
6614 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6615 if (PtrDef && TheLoop->contains(PtrDef) &&
6617 AddrDefs.insert(PtrDef);
6618 }
6619
6620 // Add all instructions used to generate the addresses.
6622 append_range(Worklist, AddrDefs);
6623 while (!Worklist.empty()) {
6624 Instruction *I = Worklist.pop_back_val();
6625 for (auto &Op : I->operands())
6626 if (auto *InstOp = dyn_cast<Instruction>(Op))
6627 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6628 AddrDefs.insert(InstOp).second)
6629 Worklist.push_back(InstOp);
6630 }
6631
6632 for (auto *I : AddrDefs) {
6633 if (isa<LoadInst>(I)) {
6634 // Setting the desired widening decision should ideally be handled by
6635 // cost functions, but since this involves the task of finding out
6636 // if the loaded register is involved in an address computation, it is
6637 // instead changed here when we know this is the case.
6638 InstWidening Decision = getWideningDecision(I, VF);
6639 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6640 // Scalarize a widened load of address.
6642 I, VF, CM_Scalarize,
6643 (VF.getKnownMinValue() *
6644 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6645 else if (auto Group = getInterleavedAccessGroup(I)) {
6646 // Scalarize an interleave group of address loads.
6647 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6648 if (Instruction *Member = Group->getMember(I))
6650 Member, VF, CM_Scalarize,
6651 (VF.getKnownMinValue() *
6652 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6653 }
6654 }
6655 } else
6656 // Make sure I gets scalarized, with a cost estimate that does not
6657 // include scalarization overhead.
6658 ForcedScalars[VF].insert(I);
6659 }
6660}
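// The three-way choice above follows a fixed tie-breaking order: interleaving
// wins ties against gather/scatter, and gather/scatter must be strictly cheaper
// than scalarization. A minimal standalone toy example (costs are made up):
//
//   #include <cstdio>
//
//   enum Decision { Interleave, GatherScatter, Scalarize };
//
//   static Decision choose(unsigned IG, unsigned GS, unsigned SC) {
//     if (IG <= GS && IG < SC)
//       return Interleave;
//     if (GS < SC)
//       return GatherScatter;
//     return Scalarize;
//   }
//
//   int main() {
//     std::printf("%d\n", choose(8, 8, 10));   // Interleave (ties beat gather/scatter)
//     std::printf("%d\n", choose(12, 9, 10));  // GatherScatter
//     std::printf("%d\n", choose(12, 10, 10)); // Scalarize (gather/scatter not cheaper)
//   }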
6661
6662void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6663 assert(!VF.isScalar() &&
6664 "Trying to set a vectorization decision for a scalar VF");
6665
6666 for (BasicBlock *BB : TheLoop->blocks()) {
6667 // For each instruction in the old loop.
6668 for (Instruction &I : *BB) {
6669 CallInst *CI = dyn_cast<CallInst>(&I);
6670
6671 if (!CI)
6672 continue;
6673
6678
6679 Function *ScalarFunc = CI->getCalledFunction();
6680 Type *ScalarRetTy = CI->getType();
6681 SmallVector<Type *, 4> Tys, ScalarTys;
6682 bool MaskRequired = Legal->isMaskRequired(CI);
6683 for (auto &ArgOp : CI->args())
6684 ScalarTys.push_back(ArgOp->getType());
6685
6686 // Compute corresponding vector type for return value and arguments.
6687 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6688 for (Type *ScalarTy : ScalarTys)
6689 Tys.push_back(ToVectorTy(ScalarTy, VF));
6690
6691 // An in-loop reduction using an fmuladd intrinsic is a special case;
6692 // we don't want the normal cost for that intrinsic.
6694 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6697 std::nullopt, *RedCost);
6698 continue;
6699 }
6700
6701 // Estimate cost of scalarized vector call. The source operands are
6702 // assumed to be vectors, so we need to extract individual elements from
6703 // there, execute VF scalar calls, and then gather the result into the
6704 // vector return value.
6705 InstructionCost ScalarCallCost =
6706 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6707
6708 // Compute costs of unpacking argument values for the scalar calls and
6709 // packing the return values to a vector.
6710 InstructionCost ScalarizationCost =
6711 getScalarizationOverhead(CI, VF, CostKind);
6712
6713 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6714
6715 // Find the cost of vectorizing the call, if we can find a suitable
6716 // vector variant of the function.
6717 bool UsesMask = false;
6718 VFInfo FuncInfo;
6719 Function *VecFunc = nullptr;
6720 // Search through any available variants for one we can use at this VF.
6721 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6722 // Must match requested VF.
6723 if (Info.Shape.VF != VF)
6724 continue;
6725
6726 // Must take a mask argument if one is required
6727 if (MaskRequired && !Info.isMasked())
6728 continue;
6729
6730 // Check that all parameter kinds are supported
6731 bool ParamsOk = true;
6732 for (VFParameter Param : Info.Shape.Parameters) {
6733 switch (Param.ParamKind) {
6735 break;
6737 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6738 // Make sure the scalar parameter in the loop is invariant.
6739 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6740 TheLoop))
6741 ParamsOk = false;
6742 break;
6743 }
6745 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6746 // Find the stride for the scalar parameter in this loop and see if
6747 // it matches the stride for the variant.
6748 // TODO: do we need to figure out the cost of an extract to get the
6749 // first lane? Or do we hope that it will be folded away?
6750 ScalarEvolution *SE = PSE.getSE();
6751 const auto *SAR =
6752 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6753
6754 if (!SAR || SAR->getLoop() != TheLoop) {
6755 ParamsOk = false;
6756 break;
6757 }
6758
6759 const SCEVConstant *Step =
6760 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6761
6762 if (!Step ||
6763 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6764 ParamsOk = false;
6765
6766 break;
6767 }
6769 UsesMask = true;
6770 break;
6771 default:
6772 ParamsOk = false;
6773 break;
6774 }
6775 }
6776
6777 if (!ParamsOk)
6778 continue;
6779
6780 // Found a suitable candidate, stop here.
6781 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6782 FuncInfo = Info;
6783 break;
6784 }
6785
6786 // Add in the cost of synthesizing a mask if one wasn't required.
6787 InstructionCost MaskCost = 0;
6788 if (VecFunc && UsesMask && !MaskRequired)
6789 MaskCost = TTI.getShuffleCost(
6792 VecFunc->getFunctionType()->getContext()),
6793 VF));
6794
6795 if (TLI && VecFunc && !CI->isNoBuiltin())
6796 VectorCost =
6797 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6798
6799 // Find the cost of an intrinsic; some targets may have instructions that
6800 // perform the operation without needing an actual call.
6802 if (IID != Intrinsic::not_intrinsic)
6803 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6804
6805 InstructionCost Cost = ScalarCost;
6806 InstWidening Decision = CM_Scalarize;
6807
6808 if (VectorCost <= Cost) {
6809 Cost = VectorCost;
6810 Decision = CM_VectorCall;
6811 }
6812
6813 if (IntrinsicCost <= Cost) {
6814 Cost = IntrinsicCost;
6815 Decision = CM_IntrinsicCall;
6816 }
6817
6818 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6820 }
6821 }
6822}
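// Note the use of "<=" above: on a cost tie the vector-call variant is preferred
// over the scalarized call, and the intrinsic wins a tie against the vector
// call. A minimal standalone toy example with made-up costs:
//
//   #include <cstdio>
//
//   int main() {
//     unsigned ScalarCost = 20, VectorCost = 12, IntrinsicCost = 12;
//     unsigned Cost = ScalarCost;
//     const char *Decision = "Scalarize";
//     if (VectorCost <= Cost) { Cost = VectorCost; Decision = "VectorCall"; }
//     if (IntrinsicCost <= Cost) { Cost = IntrinsicCost; Decision = "IntrinsicCall"; }
//     std::printf("%s at cost %u\n", Decision, Cost); // IntrinsicCall at cost 12
//   }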
6823
6824InstructionCost
6825LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6826 Type *&VectorTy) {
6827 Type *RetTy = I->getType();
6829 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6830 auto SE = PSE.getSE();
6832
6833 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6834 ElementCount VF) -> bool {
6835 if (VF.isScalar())
6836 return true;
6837
6838 auto Scalarized = InstsToScalarize.find(VF);
6839 assert(Scalarized != InstsToScalarize.end() &&
6840 "VF not yet analyzed for scalarization profitability");
6841 return !Scalarized->second.count(I) &&
6842 llvm::all_of(I->users(), [&](User *U) {
6843 auto *UI = cast<Instruction>(U);
6844 return !Scalarized->second.count(UI);
6845 });
6846 };
6847 (void) hasSingleCopyAfterVectorization;
6848
6849 if (isScalarAfterVectorization(I, VF)) {
6850 // With the exception of GEPs and PHIs, after scalarization there should
6851 // only be one copy of the instruction generated in the loop. This is
6852 // because the VF is either 1, or any instructions that need scalarizing
6853 // have already been dealt with by the time we get here. As a result,
6854 // we don't have to multiply the instruction cost by VF.
6855 assert(I->getOpcode() == Instruction::GetElementPtr ||
6856 I->getOpcode() == Instruction::PHI ||
6857 (I->getOpcode() == Instruction::BitCast &&
6858 I->getType()->isPointerTy()) ||
6859 hasSingleCopyAfterVectorization(I, VF));
6860 VectorTy = RetTy;
6861 } else
6862 VectorTy = ToVectorTy(RetTy, VF);
6863
6864 // TODO: We need to estimate the cost of intrinsic calls.
6865 switch (I->getOpcode()) {
6866 case Instruction::GetElementPtr:
6867 // We mark this instruction as zero-cost because the cost of GEPs in
6868 // vectorized code depends on whether the corresponding memory instruction
6869 // is scalarized or not. Therefore, we handle GEPs with the memory
6870 // instruction cost.
6871 return 0;
6872 case Instruction::Br: {
6873 // In cases of scalarized and predicated instructions, there will be VF
6874 // predicated blocks in the vectorized loop. Each branch around these
6875 // blocks also requires an extract of its vector compare i1 element.
6876 bool ScalarPredicatedBB = false;
6877 BranchInst *BI = cast<BranchInst>(I);
6878 if (VF.isVector() && BI->isConditional() &&
6879 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6880 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6881 ScalarPredicatedBB = true;
6882
6883 if (ScalarPredicatedBB) {
6884 // Not possible to scalarize a scalable vector with predicated instructions.
6885 if (VF.isScalable())
6886 return InstructionCost::getInvalid();
6887 // Return cost for branches around scalarized and predicated blocks.
6888 auto *Vec_i1Ty =
6889 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6890 return (
6892 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6893 /*Insert*/ false, /*Extract*/ true, CostKind) +
6894 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6895 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6896 // The back-edge branch will remain, as will all scalar branches.
6897 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6898 else
6899 // This branch will be eliminated by if-conversion.
6900 return 0;
6901 // Note: We currently assume zero cost for an unconditional branch inside
6902 // a predicated block since it will become a fall-through, although we
6903 // may decide in the future to call TTI for all branches.
6904 }
6905 case Instruction::PHI: {
6906 auto *Phi = cast<PHINode>(I);
6907
6908 // First-order recurrences are replaced by vector shuffles inside the loop.
6909 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6911 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6913 cast<VectorType>(VectorTy), Mask, CostKind,
6914 VF.getKnownMinValue() - 1);
6915 }
6916
6917 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6918 // converted into select instructions. We require N - 1 selects per phi
6919 // node, where N is the number of incoming values.
6920 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6921 return (Phi->getNumIncomingValues() - 1) *
6923 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6924 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6926
6927 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6928 }
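// For reference, the fixed-order recurrence above is costed as a splice of the
// previous and current vector: for VF lanes the mask is VF-1, VF, ..., 2*VF-2.
// A minimal standalone toy example of that mask computation:
//
//   #include <cstdio>
//   #include <numeric>
//   #include <vector>
//
//   int main() {
//     int VF = 4;
//     std::vector<int> Mask(VF);
//     std::iota(Mask.begin(), Mask.end(), VF - 1); // {3, 4, 5, 6}
//     // Indices into concat(prev, cur): last lane of prev, first VF-1 lanes of cur.
//     for (int M : Mask)
//       std::printf("%d ", M);
//     std::printf("\n");
//   }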
6929 case Instruction::UDiv:
6930 case Instruction::SDiv:
6931 case Instruction::URem:
6932 case Instruction::SRem:
6933 if (VF.isVector() && isPredicatedInst(I)) {
6934 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6935 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6936 ScalarCost : SafeDivisorCost;
6937 }
6938 // We've proven all lanes safe to speculate, fall through.
6939 [[fallthrough]];
6940 case Instruction::Add:
6941 case Instruction::FAdd:
6942 case Instruction::Sub:
6943 case Instruction::FSub:
6944 case Instruction::Mul:
6945 case Instruction::FMul:
6946 case Instruction::FDiv:
6947 case Instruction::FRem:
6948 case Instruction::Shl:
6949 case Instruction::LShr:
6950 case Instruction::AShr:
6951 case Instruction::And:
6952 case Instruction::Or:
6953 case Instruction::Xor: {
6954 // If we're speculating on the stride being 1, the multiplication may
6955 // fold away. We can generalize this for all operations using the notion
6956 // of neutral elements. (TODO)
6957 if (I->getOpcode() == Instruction::Mul &&
6958 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6959 PSE.getSCEV(I->getOperand(1))->isOne()))
6960 return 0;
6961
6962 // Detect reduction patterns
6963 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6964 return *RedCost;
6965
6966 // Certain instructions can be cheaper to vectorize if they have a constant
6967 // second vector operand. One example of this are shifts on x86.
6968 Value *Op2 = I->getOperand(1);
6969 auto Op2Info = TTI.getOperandInfo(Op2);
6970 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6971 Legal->isInvariant(Op2))
6973
6974 SmallVector<const Value *, 4> Operands(I->operand_values());
6976 I->getOpcode(), VectorTy, CostKind,
6977 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6978 Op2Info, Operands, I, TLI);
6979 }
6980 case Instruction::FNeg: {
6982 I->getOpcode(), VectorTy, CostKind,
6983 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6984 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6985 I->getOperand(0), I);
6986 }
6987 case Instruction::Select: {
6988 SelectInst *SI = cast<SelectInst>(I);
6989 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6990 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6991
6992 const Value *Op0, *Op1;
6993 using namespace llvm::PatternMatch;
6994 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6995 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6996 // select x, y, false --> x & y
6997 // select x, true, y --> x | y
6998 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6999 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7000 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7001 Op1->getType()->getScalarSizeInBits() == 1);
7002
7005 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7006 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7007 }
7008
7009 Type *CondTy = SI->getCondition()->getType();
7010 if (!ScalarCond)
7011 CondTy = VectorType::get(CondTy, VF);
7012
7014 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7015 Pred = Cmp->getPredicate();
7016 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7017 CostKind, I);
7018 }
7019 case Instruction::ICmp:
7020 case Instruction::FCmp: {
7021 Type *ValTy = I->getOperand(0)->getType();
7022 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7023 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7024 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7025 VectorTy = ToVectorTy(ValTy, VF);
7026 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7027 cast<CmpInst>(I)->getPredicate(), CostKind,
7028 I);
7029 }
7030 case Instruction::Store:
7031 case Instruction::Load: {
7032 ElementCount Width = VF;
7033 if (Width.isVector()) {
7034 InstWidening Decision = getWideningDecision(I, Width);
7035 assert(Decision != CM_Unknown &&
7036 "CM decision should be taken at this point");
7039 if (Decision == CM_Scalarize)
7040 Width = ElementCount::getFixed(1);
7041 }
7042 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7043 return getMemoryInstructionCost(I, VF);
7044 }
7045 case Instruction::BitCast:
7046 if (I->getType()->isPointerTy())
7047 return 0;
7048 [[fallthrough]];
7049 case Instruction::ZExt:
7050 case Instruction::SExt:
7051 case Instruction::FPToUI:
7052 case Instruction::FPToSI:
7053 case Instruction::FPExt:
7054 case Instruction::PtrToInt:
7055 case Instruction::IntToPtr:
7056 case Instruction::SIToFP:
7057 case Instruction::UIToFP:
7058 case Instruction::Trunc:
7059 case Instruction::FPTrunc: {
7060 // Computes the CastContextHint from a Load/Store instruction.
7061 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7062 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7063 "Expected a load or a store!");
7064
7065 if (VF.isScalar() || !TheLoop->contains(I))
7067
7068 switch (getWideningDecision(I, VF)) {
7080 llvm_unreachable("Instr did not go through cost modelling?");
7083 llvm_unreachable_internal("Instr has invalid widening decision");
7084 }
7085
7086 llvm_unreachable("Unhandled case!");
7087 };
7088
7089 unsigned Opcode = I->getOpcode();
7091 // For Trunc, the context is the only user, which must be a StoreInst.
7092 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7093 if (I->hasOneUse())
7094 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7095 CCH = ComputeCCH(Store);
7096 }
7097 // For Z/Sext, the context is the operand, which must be a LoadInst.
7098 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7099 Opcode == Instruction::FPExt) {
7100 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7101 CCH = ComputeCCH(Load);
7102 }
7103
7104 // We optimize the truncation of induction variables having constant
7105 // integer steps. The cost of these truncations is the same as the scalar
7106 // operation.
7107 if (isOptimizableIVTruncate(I, VF)) {
7108 auto *Trunc = cast<TruncInst>(I);
7109 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7110 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7111 }
7112
7113 // Detect reduction patterns
7114 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7115 return *RedCost;
7116
7117 Type *SrcScalarTy = I->getOperand(0)->getType();
7118 Type *SrcVecTy =
7119 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7121 // This cast is going to be shrunk. This may remove the cast or it might
7122 // turn it into a slightly different cast. For example, if MinBW == 16,
7123 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7124 //
7125 // Calculate the modified src and dest types.
7126 Type *MinVecTy = VectorTy;
7127 if (Opcode == Instruction::Trunc) {
7128 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7129 VectorTy =
7130 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7131 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7132 // Leave SrcVecTy unchanged - we only shrink the destination element
7133 // type.
7134 VectorTy =
7135 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7136 }
7137 }
7138
7139 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7140 }
7141 case Instruction::Call:
7142 return getVectorCallCost(cast<CallInst>(I), VF);
7143 case Instruction::ExtractValue:
7145 case Instruction::Alloca:
7146 // We cannot easily widen alloca to a scalable alloca, as
7147 // the result would need to be a vector of pointers.
7148 if (VF.isScalable())
7149 return InstructionCost::getInvalid();
7150 [[fallthrough]];
7151 default:
7152 // This opcode is unknown. Assume that it is the same as 'mul'.
7153 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7154 } // end of switch.
7155}
7156
7157void LoopVectorizationCostModel::collectValuesToIgnore() {
7158 // Ignore ephemeral values.
7159 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7160
7161 // Find all stores to invariant variables. Since they are going to sink
7162 // outside the loop we do not need to calculate a cost for them.
7163 for (BasicBlock *BB : TheLoop->blocks())
7164 for (Instruction &I : *BB) {
7165 StoreInst *SI;
7166 if ((SI = dyn_cast<StoreInst>(&I)) &&
7167 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7168 ValuesToIgnore.insert(&I);
7169 }
7170
7171 // Ignore type-promoting instructions we identified during reduction
7172 // detection.
7173 for (const auto &Reduction : Legal->getReductionVars()) {
7174 const RecurrenceDescriptor &RedDes = Reduction.second;
7175 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7176 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7177 }
7178 // Ignore type-casting instructions we identified during induction
7179 // detection.
7180 for (const auto &Induction : Legal->getInductionVars()) {
7181 const InductionDescriptor &IndDes = Induction.second;
7182 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7183 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7184 }
7185}
7186
7187void LoopVectorizationCostModel::collectInLoopReductions() {
7188 for (const auto &Reduction : Legal->getReductionVars()) {
7189 PHINode *Phi = Reduction.first;
7190 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7191
7192 // We don't collect reductions that are type promoted (yet).
7193 if (RdxDesc.getRecurrenceType() != Phi->getType())
7194 continue;
7195
7196 // If the target would prefer this reduction to happen "in-loop", then we
7197 // want to record it as such.
7198 unsigned Opcode = RdxDesc.getOpcode();
7199 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7200 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7202 continue;
7203
7204 // Check that we can correctly put the reductions into the loop, by
7205 // finding the chain of operations that leads from the phi to the loop
7206 // exit value.
7207 SmallVector<Instruction *, 4> ReductionOperations =
7208 RdxDesc.getReductionOpChain(Phi, TheLoop);
7209 bool InLoop = !ReductionOperations.empty();
7210
7211 if (InLoop) {
7212 InLoopReductions.insert(Phi);
7213 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7214 Instruction *LastChain = Phi;
7215 for (auto *I : ReductionOperations) {
7216 InLoopReductionImmediateChains[I] = LastChain;
7217 LastChain = I;
7218 }
7219 }
7220 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7221 << " reduction for phi: " << *Phi << "\n");
7222 }
7223}
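// The immediate-chain map recorded above lets getReductionPatternCost walk from
// any operation in an in-loop reduction back to its header phi. A minimal
// standalone toy example of that walk (string keys stand in for instructions):
//
//   #include <cstdio>
//   #include <map>
//   #include <string>
//
//   int main() {
//     // Each chain element maps to the previous element; the phi is the root.
//     std::map<std::string, std::string> Chain = {
//         {"add1", "phi"}, {"add2", "add1"}, {"add3", "add2"}};
//     std::string Cur = "add3";
//     while (Cur != "phi")      // mirrors: while (!isa<PHINode>(ReductionPhi))
//       Cur = Chain.at(Cur);
//     std::printf("root of chain: %s\n", Cur.c_str()); // "phi"
//   }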
7224
7225VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7226 DebugLoc DL, const Twine &Name) {
7228 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7229 return tryInsertInstruction(
7230 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7231}
7232
7233// This function will select a scalable VF if the target supports scalable
7234// vectors and a fixed one otherwise.
7235// TODO: we could return a pair of values that specify the max VF and
7236// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7238// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7238// doesn't have a cost model that can choose which plan to execute if
7239// more than one is generated.
7242 unsigned WidestType;
7243 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7244
7249
7251 unsigned N = RegSize.getKnownMinValue() / WidestType;
7252 return ElementCount::get(N, RegSize.isScalable());
7253}
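// The computation above amounts to dividing the widest vector register width by
// the widest scalar type used in the loop. A minimal standalone toy example
// (example widths only, not queried from any target):
//
//   #include <cstdio>
//
//   int main() {
//     unsigned RegSizeInBits = 128;   // e.g. a 128-bit SIMD register
//     unsigned WidestTypeInBits = 32;
//     bool Scalable = false;
//     unsigned N = RegSizeInBits / WidestTypeInBits; // 4
//     std::printf("VF = %s%u\n", Scalable ? "vscale x " : "", N);
//   }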
7254
7257 ElementCount VF = UserVF;
7258 // Outer loop handling: They may require CFG and instruction level
7259 // transformations before even evaluating whether vectorization is profitable.
7260 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7261 // the vectorization pipeline.
7262 if (!OrigLoop->isInnermost()) {
7263 // If the user doesn't provide a vectorization factor, determine a
7264 // reasonable one.
7265 if (UserVF.isZero()) {
7266 VF = determineVPlanVF(TTI, CM);
7267 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7268
7269 // Make sure we have a VF > 1 for stress testing.
7270 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7271 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7272 << "overriding computed VF.\n");
7273 VF = ElementCount::getFixed(4);
7274 }
7275 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7277 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7278 << "not supported by the target.\n");
7280 "Scalable vectorization requested but not supported by the target",
7281 "the scalable user-specified vectorization width for outer-loop "
7282 "vectorization cannot be used because the target does not support "
7283 "scalable vectors.",
7284 "ScalableVFUnfeasible", ORE, OrigLoop);
7286 }
7287 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7289 "VF needs to be a power of two");
7290 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7291 << "VF " << VF << " to build VPlans.\n");
7292 buildVPlans(VF, VF);
7293
7294 // For VPlan build stress testing, we bail out after VPlan construction.
7297
7298 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7299 }
7300
7301 LLVM_DEBUG(
7302 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7303 "VPlan-native path.\n");
7305}
7306
7307std::optional<VectorizationFactor>
7309 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7312
7313 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7314 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7315 return std::nullopt;
7316
7317 // Invalidate interleave groups if all blocks of loop will be predicated.
7318 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7320 LLVM_DEBUG(
7321 dbgs()
7322 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7323 "which requires masked-interleaved support.\n");
7325 // Invalidating interleave groups also requires invalidating all decisions
7326 // based on them, which includes widening decisions and uniform and scalar
7327 // values.
7329 }
7330
7331 ElementCount MaxUserVF =
7332 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7333 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7334 if (!UserVF.isZero() && UserVFIsLegal) {
7336 "VF needs to be a power of two");
7337 // Collect the instructions (and their associated costs) that will be more
7338 // profitable to scalarize.
7340 if (CM.selectUserVectorizationFactor(UserVF)) {
7341 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7342 buildVPlansWithVPRecipes(UserVF, UserVF);
7343 if (!hasPlanWithVF(UserVF)) {
7344 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7345 << ".\n");
7346 return std::nullopt;
7347 }
7348
7350 return {{UserVF, 0, 0}};
7351 } else
7352 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7353 "InvalidCost", ORE, OrigLoop);
7354 }
7355
7356 // Populate the set of Vectorization Factor Candidates.
7357 ElementCountSet VFCandidates;
7358 for (auto VF = ElementCount::getFixed(1);
7359 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7360 VFCandidates.insert(VF);
7361 for (auto VF = ElementCount::getScalable(1);
7362 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7363 VFCandidates.insert(VF);
7364
7366 for (const auto &VF : VFCandidates) {
7367 // Collect Uniform and Scalar instructions after vectorization with VF.
7369
7370 // Collect the instructions (and their associated costs) that will be more
7371 // profitable to scalarize.
7372 if (VF.isVector())
7374 }
7375
7376 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7377 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7378
7380 if (!MaxFactors.hasVector())
7382
7383 // Select the optimal vectorization factor.
7384 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7385 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7386 if (!hasPlanWithVF(VF.Width)) {
7387 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7388 << ".\n");
7389 return std::nullopt;
7390 }
7391 return VF;
7392}
7393
7395 assert(count_if(VPlans,
7396 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7397 1 &&
7398 "Best VF has not a single VPlan.");
7399
7400 for (const VPlanPtr &Plan : VPlans) {
7401 if (Plan->hasVF(VF))
7402 return *Plan.get();
7403 }
7404 llvm_unreachable("No plan found!");
7405}
7406
7409 // Reserve first location for self reference to the LoopID metadata node.
7410 MDs.push_back(nullptr);
7411 bool IsUnrollMetadata = false;
7412 MDNode *LoopID = L->getLoopID();
7413 if (LoopID) {
7414 // First find existing loop unrolling disable metadata.
7415 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7416 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7417 if (MD) {
7418 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7419 IsUnrollMetadata =
7420 S && S->getString().starts_with("llvm.loop.unroll.disable");
7421 }
7422 MDs.push_back(LoopID->getOperand(i));
7423 }
7424 }
7425
7426 if (!IsUnrollMetadata) {
7427 // Add runtime unroll disable metadata.
7428 LLVMContext &Context = L->getHeader()->getContext();
7429 SmallVector<Metadata *, 1> DisableOperands;
7430 DisableOperands.push_back(
7431 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7432 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7433 MDs.push_back(DisableNode);
7434 MDNode *NewLoopID = MDNode::get(Context, MDs);
7435 // Set operand 0 to refer to the loop id itself.
7436 NewLoopID->replaceOperandWith(0, NewLoopID);
7437 L->setLoopID(NewLoopID);
7438 }
7439}
7440
7441// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7442// create a merge phi node for it and add it to \p ReductionResumeValues.
7444 VPInstruction *RedResult,
7446 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7447 if (!RedResult ||
7449 return;
7450
7451 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7452 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7453
7454 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7455 Value *FinalValue =
7456 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7457 auto *ResumePhi =
7458 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7459
7460 // TODO: bc.merge.rdx should not be created here, instead it should be
7461 // modeled in VPlan.
7462 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7463 // Create a phi node that merges control-flow from the backedge-taken check
7464 // block and the middle block.
7465 auto *BCBlockPhi =
7466 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7467 LoopScalarPreHeader->getTerminator()->getIterator());
7468
7469 // If we are fixing reductions in the epilogue loop then we should already
7470 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7471 // we carry over the incoming values correctly.
7472 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7473 if (Incoming == LoopMiddleBlock)
7474 BCBlockPhi->addIncoming(FinalValue, Incoming);
7475 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7476 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7477 Incoming);
7478 else
7479 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7480 }
7481
7482 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7483 // TODO: This fixup should instead be modeled in VPlan.
7484 // Fix the scalar loop reduction variable with the incoming reduction sum
7485 // from the vector body and from the backedge value.
7486 int IncomingEdgeBlockIdx =
7487 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7488 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7489 // Pick the other block.
7490 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7491 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7492 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7493 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7494
7495 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7496}
7497
7498std::pair<DenseMap<const SCEV *, Value *>,
7501 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7502 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7503 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7504 assert(BestVPlan.hasVF(BestVF) &&
7505 "Trying to execute plan with unsupported VF");
7506 assert(BestVPlan.hasUF(BestUF) &&
7507 "Trying to execute plan with unsupported UF");
7508 assert(
7509 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7510 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7511
7512 if (!IsEpilogueVectorization)
7513 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7514
7515 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7516 << ", UF=" << BestUF << '\n');
7517 BestVPlan.setName("Final VPlan");
7518 LLVM_DEBUG(BestVPlan.dump());
7519
7520 // Perform the actual loop transformation.
7521 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7522 OrigLoop->getHeader()->getContext());
7523
7524 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7525 // before making any changes to the CFG.
7526 if (!BestVPlan.getPreheader()->empty()) {
7527 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7529 BestVPlan.getPreheader()->execute(&State);
7530 }
7531 if (!ILV.getTripCount())
7532 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7533 else
7534 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7535 "count during epilogue vectorization");
7536
7537 // 1. Set up the skeleton for vectorization, including vector pre-header and
7538 // middle block. The vector loop is created during VPlan execution.
7539 Value *CanonicalIVStartValue;
7540 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7541 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7542 : State.ExpandedSCEVs);
7543
7544 // Only use noalias metadata when using memory checks guaranteeing no overlap
7545 // across all iterations.
7546 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7547 std::unique_ptr<LoopVersioning> LVer = nullptr;
7548 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7550
7551 // We currently don't use LoopVersioning for the actual loop cloning but we
7552 // still use it to add the noalias metadata.
7553 // TODO: Find a better way to re-use LoopVersioning functionality to add
7554 // metadata.
7555 LVer = std::make_unique<LoopVersioning>(
7556 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7557 PSE.getSE());
7558 State.LVer = &*LVer;
7560 }
7561
7563
7564 //===------------------------------------------------===//
7565 //
7566 // Notice: any optimization or new instruction that goes
7567 // into the code below should also be implemented in
7568 // the cost-model.
7569 //
7570 //===------------------------------------------------===//
7571
7572 // 2. Copy and widen instructions from the old loop into the new loop.
7573 BestVPlan.prepareToExecute(ILV.getTripCount(),
7574 ILV.getOrCreateVectorTripCount(nullptr),
7575 CanonicalIVStartValue, State);
7576
7577 BestVPlan.execute(&State);
7578
7579 // 2.5 Collect reduction resume values.
7581 auto *ExitVPBB =
7582 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7583 for (VPRecipeBase &R : *ExitVPBB) {
7584 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7585 ReductionResumeValues, State, OrigLoop,
7586 State.CFG.VPBB2IRBB[ExitVPBB]);
7587 }
7588
7589 // 2.6. Maintain Loop Hints
7590 // Keep all loop hints from the original loop on the vector loop (we'll
7591 // replace the vectorizer-specific hints below).
7592 MDNode *OrigLoopID = OrigLoop->getLoopID();
7593
7594 std::optional<MDNode *> VectorizedLoopID =
7597
7598 VPBasicBlock *HeaderVPBB =
7600 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7601 if (VectorizedLoopID)
7602 L->setLoopID(*VectorizedLoopID);
7603 else {
7604 // Keep all loop hints from the original loop on the vector loop (we'll
7605 // replace the vectorizer-specific hints below).
7606 if (MDNode *LID = OrigLoop->getLoopID())
7607 L->setLoopID(LID);
7608
7609 LoopVectorizeHints Hints(L, true, *ORE);
7610 Hints.setAlreadyVectorized();
7611 }
7613 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7614 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7616
7617 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7618 // predication, updating analyses.
7619 ILV.fixVectorizedLoop(State, BestVPlan);
7620
7622
7623 return {State.ExpandedSCEVs, ReductionResumeValues};
7624}
7625
7626#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7628 for (const auto &Plan : VPlans)
7630 Plan->printDOT(O);
7631 else
7632 Plan->print(O);
7633}
7634#endif
7635
7636//===--------------------------------------------------------------------===//
7637// EpilogueVectorizerMainLoop
7638//===--------------------------------------------------------------------===//
7639
7640/// This function is partially responsible for generating the control flow
7641/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7642std::pair<BasicBlock *, Value *>
7644 const SCEV2ValueTy &ExpandedSCEVs) {
7646
7647 // Generate the code to check the minimum iteration count of the vector
7648 // epilogue (see below).
7652
7653 // Generate the code to check any assumptions that we've made for SCEV
7654 // expressions.
7656
7657 // Generate the code that checks at runtime if arrays overlap. We put the
7658 // checks into a separate block to make the more common case of few elements
7659 // faster.
7661
7662 // Generate the iteration count check for the main loop, *after* the check
7663 // for the epilogue loop, so that the path-length is shorter for the case
7664 // that goes directly through the vector epilogue. The longer-path length for
7665 // the main loop is compensated for, by the gain from vectorizing the larger
7666 // trip count. Note: the branch will get updated later on when we vectorize
7667 // the epilogue.
7670
7671 // Generate the induction variable.
7673
7674 // Skip induction resume value creation here because they will be created in
7675 // the second pass for the scalar loop. The induction resume values for the
7676 // inductions in the epilogue loop are created before executing the plan for
7677 // the epilogue loop.
7678
7679 return {completeLoopSkeleton(), nullptr};
7680}
7681
7683 LLVM_DEBUG({
7684 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7685 << "Main Loop VF:" << EPI.MainLoopVF
7686 << ", Main Loop UF:" << EPI.MainLoopUF
7687 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7688 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7689 });
7690}
7691
7694 dbgs() << "intermediate fn:\n"
7695 << *OrigLoop->getHeader()->getParent() << "\n";
7696 });
7697}
7698
7699BasicBlock *
7701 bool ForEpilogue) {
7702 assert(Bypass && "Expected valid bypass basic block.");
7703 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7704 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7705 Value *Count = getTripCount();
7706 // Reuse existing vector loop preheader for TC checks.
7707 // Note that new preheader block is generated for vector loop.
7708 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7709 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7710
7711 // Generate code to check if the loop's trip count is less than VF * UF of the
7712 // main vector loop.
7713 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7714 : VF.isVector())
7717
7718 Value *CheckMinIters = Builder.CreateICmp(
7719 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7720 "min.iters.check");
7721
7722 if (!ForEpilogue)
7723 TCCheckBlock->setName("vector.main.loop.iter.check");
7724
7725 // Create new preheader for vector loop.
7726 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7727 DT, LI, nullptr, "vector.ph");
7728
7729 if (ForEpilogue) {
7730 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7731 DT->getNode(Bypass)->getIDom()) &&
7732 "TC check is expected to dominate Bypass");
7733
7734 // Update dominator for Bypass & LoopExit.
7735 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7736 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7737 // For loops with multiple exits, there's no edge from the middle block
7738 // to exit blocks (as the epilogue must run) and thus no need to update
7739 // the immediate dominator of the exit blocks.
7741
7742 LoopBypassBlocks.push_back(TCCheckBlock);
7743
7744 // Save the trip count so we don't have to regenerate it in the
7745 // vec.epilog.iter.check. This is safe to do because the trip count
7746 // generated here dominates the vector epilog iter check.
7747 EPI.TripCount = Count;
7748 }
7749
7750 BranchInst &BI =
7751 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7754 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7755
7756 return TCCheckBlock;
7757}
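// For example (illustrative values only): with a main loop of VF = 4 and
// UF = 2, createStepForVF above produces the constant 8, so the emitted
// check is conceptually
//   %min.iters.check = icmp ult i64 %count, 8
// branching to the bypass block when too few iterations remain. The
// predicate becomes ule instead of ult when a scalar epilogue is required,
// since at least one scalar iteration must be left to execute.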
7758
7759//===--------------------------------------------------------------------===//
7760// EpilogueVectorizerEpilogueLoop
7761//===--------------------------------------------------------------------===//
7762
7763/// This function is partially responsible for generating the control flow
7764/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7765std::pair<BasicBlock *, Value *>
7767 const SCEV2ValueTy &ExpandedSCEVs) {
7768 createVectorLoopSkeleton("vec.epilog.");
7769
7770 // Now, compare the remaining count and if there aren't enough iterations to
7771 // execute the vectorized epilogue skip to the scalar part.
7772 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7773 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7776 LI, nullptr, "vec.epilog.ph");
7778 VecEpilogueIterationCountCheck);
7779
7780 // Adjust the control flow taking the state info from the main loop
7781 // vectorization into account.
7783 "expected this to be saved from the previous pass.");
7785 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7786
7789
7791 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7792
7793 if (EPI.SCEVSafetyCheck)
7795 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7796 if (EPI.MemSafetyCheck)
7798 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7799
7801 VecEpilogueIterationCountCheck,
7802 VecEpilogueIterationCountCheck->getSinglePredecessor());
7803
7806 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7807 // If there is an epilogue which must run, there's no edge from the
7808 // middle block to exit blocks and thus no need to update the immediate
7809 // dominator of the exit blocks.
7812
7813 // Keep track of bypass blocks, as they feed start values to the induction and
7814 // reduction phis in the scalar loop preheader.
7815 if (EPI.SCEVSafetyCheck)
7817 if (EPI.MemSafetyCheck)
7820
7821 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7822 // reductions which merge control-flow from the latch block and the middle
7823 // block. Update the incoming values here and move the Phi into the preheader.
7824 SmallVector<PHINode *, 4> PhisInBlock;
7825 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7826 PhisInBlock.push_back(&Phi);
7827
7828 for (PHINode *Phi : PhisInBlock) {
7829 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7830 Phi->replaceIncomingBlockWith(
7831 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7832 VecEpilogueIterationCountCheck);
7833
7834 // If the phi doesn't have an incoming value from the
7835 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7836 // value and also those from other check blocks. This is needed for
7837 // reduction phis only.
7838 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7839 return EPI.EpilogueIterationCountCheck == IncB;
7840 }))
7841 continue;
7842 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7843 if (EPI.SCEVSafetyCheck)
7844 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7845 if (EPI.MemSafetyCheck)
7846 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7847 }
7848
7849 // Generate a resume induction for the vector epilogue and put it in the
7850 // vector epilogue preheader
7851 Type *IdxTy = Legal->getWidestInductionType();
7852 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7854 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7855 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7857
7858 // Generate induction resume values. These variables save the new starting
7859 // indexes for the scalar loop. They are used to test if there are any tail
7860 // iterations left once the vector loop has completed.
7861 // Note that when the vectorized epilogue is skipped due to iteration count
7862 // check, then the resume value for the induction variable comes from
7863 // the trip count of the main vector loop, hence passing the AdditionalBypass
7864 // argument.
7865 createInductionResumeValues(ExpandedSCEVs,
7866 {VecEpilogueIterationCountCheck,
7867 EPI.VectorTripCount} /* AdditionalBypass */);
7868
7869 return {completeLoopSkeleton(), EPResumeVal};
7870}
7871
7872BasicBlock *
7874 BasicBlock *Bypass, BasicBlock *Insert) {
7875
7876 assert(EPI.TripCount &&
7877 "Expected trip count to have been safed in the first pass.");
7878 assert(
7879 (!isa<Instruction>(EPI.TripCount) ||
7880 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7881 "saved trip count does not dominate insertion point.");
7882 Value *TC = EPI.TripCount;
7883 IRBuilder<> Builder(Insert->getTerminator());
7884 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7885
7886 // Generate code to check if the loop's trip count is less than VF * UF of the
7887 // vector epilogue loop.
7888 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7891
7892 Value *CheckMinIters =
7893 Builder.CreateICmp(P, Count,
7896 "min.epilog.iters.check");
7897
7898 BranchInst &BI =
7899 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7901 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7902 unsigned EpilogueLoopStep =
7904 // We assume the remaining `Count` is equally distributed in
7905 // [0, MainLoopStep), so the probability for
7906 // `Count < EpilogueLoopStep` should be
7907 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
7908 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7909 const uint32_t Weights[] = {EstimatedSkipCount,
7910 MainLoopStep - EstimatedSkipCount};
7911 setBranchWeights(BI, Weights);
7912 }
7913 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7914
7915 LoopBypassBlocks.push_back(Insert);
7916 return Insert;
7917}
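// Worked example for the branch weights above (illustrative values only):
// if the main loop used VF = 4 and UF = 2, the remaining Count lies in
// [0, 8); with an epilogue of VF = 2 and UF = 1 the epilogue step is 2, so
// EstimatedSkipCount = min(8, 2) = 2 and the weights are {2, 6}, i.e. the
// vector epilogue is expected to be skipped roughly 25% of the time.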
7918
7920 LLVM_DEBUG({
7921 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7922 << "Epilogue Loop VF:" << EPI.EpilogueVF
7923 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7924 });
7925}
7926
7929 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7930 });
7931}
7932
7934 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7935 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7936 bool PredicateAtRangeStart = Predicate(Range.Start);
7937
7938 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7939 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7940 Range.End = TmpVF;
7941 break;
7942 }
7943
7944 return PredicateAtRangeStart;
7945}
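// Example (hypothetical predicate): for Range = [4, 32) and a predicate that
// holds for VF = 4 and VF = 8 but not for VF = 16, the loop above clamps
// Range.End to 16 and returns true (the predicate's value at VF = 4); all
// VFs remaining in [4, 16) are then guaranteed to take the same decision.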
7946
7947/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7948/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7949/// of VF's starting at a given VF and extending it as much as possible. Each
7950/// vectorization decision can potentially shorten this sub-range during
7951/// buildVPlan().
7953 ElementCount MaxVF) {
7954 auto MaxVFTimes2 = MaxVF * 2;
7955 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7956 VFRange SubRange = {VF, MaxVFTimes2};
7957 VPlans.push_back(buildVPlan(SubRange));
7958 VF = SubRange.End;
7959 }
7960}
7961
7962iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7964 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7965 if (auto *I = dyn_cast<Instruction>(Op)) {
7966 if (auto *R = Ingredient2Recipe.lookup(I))
7967 return R->getVPSingleValue();
7968 }
7969 return Plan.getOrAddLiveIn(Op);
7970 };
7971 return map_range(Operands, Fn);
7972}
7973
7975 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7976
7977 // Look for cached value.
7978 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7979 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7980 if (ECEntryIt != EdgeMaskCache.end())
7981 return ECEntryIt->second;
7982
7983 VPValue *SrcMask = getBlockInMask(Src);
7984
7985 // The terminator has to be a branch inst!
7986 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7987 assert(BI && "Unexpected terminator found");
7988
7989 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7990 return EdgeMaskCache[Edge] = SrcMask;
7991
7992 // If source is an exiting block, we know the exit edge is dynamically dead
7993 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7994 // adding uses of an otherwise potentially dead instruction.
7995 if (OrigLoop->isLoopExiting(Src))
7996 return EdgeMaskCache[Edge] = SrcMask;
7997
7998 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7999 assert(EdgeMask && "No Edge Mask found for condition");
8000
8001 if (BI->getSuccessor(0) != Dst)
8002 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8003
8004 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8005 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8006 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8007 // The select version does not introduce new UB if SrcMask is false and
8008 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8009 VPValue *False = Plan.getOrAddLiveIn(
8011 EdgeMask =
8012 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8013 }
8014
8015 return EdgeMaskCache[Edge] = EdgeMask;
8016}
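// Illustrative IR for the masked edge case above: with a block-in mask
// %src.mask and branch condition %cond (negated if Dst is the false
// successor), the edge mask is materialized as
//   %edge.mask = select i1 %src.mask, i1 %cond, i1 false
// instead of
//   %edge.mask = and i1 %src.mask, %cond
// so lanes where %src.mask is false cannot be poisoned by %cond.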
8017
8019 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8020
8021 // Look for cached value.
8022 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8023 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8024 assert(ECEntryIt != EdgeMaskCache.end() &&
8025 "looking up mask for edge which has not been created");
8026 return ECEntryIt->second;
8027}
8028
8030 BasicBlock *Header = OrigLoop->getHeader();
8031
8032 // When not folding the tail, use nullptr to model all-true mask.
8033 if (!CM.foldTailByMasking()) {
8034 BlockMaskCache[Header] = nullptr;
8035 return;
8036 }
8037
8038 // Introduce the early-exit compare IV <= BTC to form header block mask.
8039 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8040 // constructing the desired canonical IV in the header block as its first
8041 // non-phi instruction.
8042
8043 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8044 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8045 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8046 HeaderVPBB->insert(IV, NewInsertionPoint);
8047
8048 VPBuilder::InsertPointGuard Guard(Builder);
8049 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8050 VPValue *BlockMask = nullptr;
8052 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8053 BlockMaskCache[Header] = BlockMask;
8054}
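// Small worked example (illustrative VF): for a loop with trip count 10 the
// backedge-taken count is 9. With VF = 4, the widened canonical IV of the
// third vector iteration is <8, 9, 10, 11>, so `icmp ule <8,9,10,11>, 9`
// yields <1, 1, 0, 0> and masks off the two excess lanes. Comparing against
// BTC avoids materializing TC = BTC + 1, which could wrap if BTC is the
// maximum value of the IV type.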
8055
8057 // Return the cached value.
8058 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8059 assert(BCEntryIt != BlockMaskCache.end() &&
8060 "Trying to access mask for block without one.");
8061 return BCEntryIt->second;
8062}
8063
8065 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8066 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8067 assert(OrigLoop->getHeader() != BB &&
8068 "Loop header must have cached block mask");
8069
8070 // All-one mask is modelled as no-mask following the convention for masked
8071 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8072 VPValue *BlockMask = nullptr;
8073 // This is the block mask. We OR all incoming edges.
8074 for (auto *Predecessor : predecessors(BB)) {
8075 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8076 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8077 BlockMaskCache[BB] = EdgeMask;
8078 return;
8079 }
8080
8081 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8082 BlockMask = EdgeMask;
8083 continue;
8084 }
8085
8086 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8087 }
8088
8089 BlockMaskCache[BB] = BlockMask;
8090}
8091
8093VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8094 VFRange &Range) {
8095 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8096 "Must be called with either a load or store");
8097
8098 auto willWiden = [&](ElementCount VF) -> bool {
8100 CM.getWideningDecision(I, VF);
8102 "CM decision should be taken at this point.");
8104 return true;
8105 if (CM.isScalarAfterVectorization(I, VF) ||
8106 CM.isProfitableToScalarize(I, VF))
8107 return false;
8109 };
8110
8112 return nullptr;
8113
8114 VPValue *Mask = nullptr;
8115 if (Legal->isMaskRequired(I))
8116 Mask = getBlockInMask(I->getParent());
8117
8118 // Determine if the pointer operand of the access is either consecutive or
8119 // reverse consecutive.
8121 CM.getWideningDecision(I, Range.Start);
8123 bool Consecutive =
8125
8126 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8127 if (Consecutive) {
8128 auto *GEP = dyn_cast<GetElementPtrInst>(
8129 Ptr->getUnderlyingValue()->stripPointerCasts());
8130 auto *VectorPtr = new VPVectorPointerRecipe(
8131 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8132 I->getDebugLoc());
8133 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8134 Ptr = VectorPtr;
8135 }
8136 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8137 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8138 I->getDebugLoc());
8139
8140 StoreInst *Store = cast<StoreInst>(I);
8141 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8142 Reverse, I->getDebugLoc());
8143}
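// Sketch of the generated code for a consecutive, masked load (illustrative
// VF of 4, actual widths come from the cost model):
//   %wide.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
//                  ptr %vec.ptr, i32 4, <4 x i1> %block.mask, <4 x i32> poison)
// An unmasked consecutive access becomes a plain wide load/store, and the
// Reverse case additionally reverses the lane order.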
8144
8145 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8146/// insert a recipe to expand the step for the induction recipe.
8149 VPValue *Start, const InductionDescriptor &IndDesc,
8150 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8151 VFRange &Range) {
8152 assert(IndDesc.getStartValue() ==
8153 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8154 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8155 "step must be loop invariant");
8156
8157 VPValue *Step =
8159 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8160 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8161 }
8162 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8163 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8164}
8165
8166VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8168
8169 // Check if this is an integer or fp induction. If so, build the recipe that
8170 // produces its scalar and vector values.
8171 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8172 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8173 *PSE.getSE(), *OrigLoop, Range);
8174
8175 // Check if this is a pointer induction. If so, build the recipe for it.
8176 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8177 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8178 *PSE.getSE());
8180 Phi, Operands[0], Step, *II,
8182 [&](ElementCount VF) {
8183 return CM.isScalarAfterVectorization(Phi, VF);
8184 },
8185 Range));
8186 }
8187 return nullptr;
8188}
8189
8190VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8192 // Optimize the special case where the source is a constant integer
8193 // induction variable. Notice that we can only optimize the 'trunc' case
8194 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8195 // (c) other casts depend on pointer size.
8196
8197 // Determine whether \p K is a truncation based on an induction variable that
8198 // can be optimized.
8199 auto isOptimizableIVTruncate =
8200 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8201 return [=](ElementCount VF) -> bool {
8202 return CM.isOptimizableIVTruncate(K, VF);
8203 };
8204 };
8205
8207 isOptimizableIVTruncate(I), Range)) {
8208
8209 auto *Phi = cast<PHINode>(I->getOperand(0));
8210 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8211 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8212 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8213 *OrigLoop, Range);
8214 }
8215 return nullptr;
8216}
8217
8218VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8220 unsigned NumIncoming = Phi->getNumIncomingValues();
8221
8222 // We know that all PHIs in non-header blocks are converted into selects, so
8223 // we don't have to worry about the insertion order and we can just use the
8224 // builder. At this point we generate the predication tree. There may be
8225 // duplications since this is a simple recursive scan, but future
8226 // optimizations will clean it up.
8227 // TODO: At the moment the first mask is always skipped, but it would be
8228 // better to skip the most expensive mask.
8229 SmallVector<VPValue *, 2> OperandsWithMask;
8230
8231 for (unsigned In = 0; In < NumIncoming; In++) {
8232 OperandsWithMask.push_back(Operands[In]);
8233 VPValue *EdgeMask =
8234 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8235 if (!EdgeMask) {
8236 assert(In == 0 && "Both null and non-null edge masks found");
8238 "Distinct incoming values with one having a full mask");
8239 break;
8240 }
8241 if (In == 0)
8242 continue;
8243 OperandsWithMask.push_back(EdgeMask);
8244 }
8245 return new VPBlendRecipe(Phi, OperandsWithMask);
8246}
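// Operand layout example for the blend built above: a phi with incoming
// values V0, V1, V2 yields a VPBlendRecipe with operands
//   (V0, V1, M1, V2, M2)
// where Mi is the edge mask of the i-th incoming block; the mask of the
// first incoming value is omitted, per the TODO above about always skipping
// the first mask.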
8247
8248VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8250 VFRange &Range) {
8252 [this, CI](ElementCount VF) {
8253 return CM.isScalarWithPredication(CI, VF);
8254 },
8255 Range);
8256
8257 if (IsPredicated)
8258 return nullptr;
8259
8261 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8262 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8263 ID == Intrinsic::pseudoprobe ||
8264 ID == Intrinsic::experimental_noalias_scope_decl))
8265 return nullptr;
8266
8267 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8268
8269 // Is it beneficial to perform an intrinsic call compared to a lib call?
8270 bool ShouldUseVectorIntrinsic =
8272 [&](ElementCount VF) -> bool {
8273 return CM.getCallWideningDecision(CI, VF).Kind ==
8275 },
8276 Range);
8277 if (ShouldUseVectorIntrinsic)
8278 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8279 CI->getDebugLoc());
8280
8281 Function *Variant = nullptr;
8282 std::optional<unsigned> MaskPos;
8283 // Is it better to call a vectorized version of the function than to scalarize
8284 // the call?
8285 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8286 [&](ElementCount VF) -> bool {
8287 // The following case may be scalarized depending on the VF.
8288 // The flag shows whether we can use a usual Call for the vectorized
8289 // version of the instruction.
8290
8291 // If we've found a variant at a previous VF, then stop looking. A
8292 // vectorized variant of a function expects input in a certain shape
8293 // -- basically the number of input registers, the number of lanes
8294 // per register, and whether there's a mask required.
8295 // We store a pointer to the variant in the VPWidenCallRecipe, so
8296 // once we have an appropriate variant it's only valid for that VF.
8297 // This will force a different vplan to be generated for each VF that
8298 // finds a valid variant.
8299 if (Variant)
8300 return false;
8302 CM.getCallWideningDecision(CI, VF);
8304 Variant = Decision.Variant;
8305 MaskPos = Decision.MaskPos;
8306 return true;
8307 }
8308
8309 return false;
8310 },
8311 Range);
8312 if (ShouldUseVectorCall) {
8313 if (MaskPos.has_value()) {
8314 // We have 2 cases that would require a mask:
8315 // 1) The block needs to be predicated, either due to a conditional
8316 // in the scalar loop or use of an active lane mask with
8317 // tail-folding, and we use the appropriate mask for the block.
8318 // 2) No mask is required for the block, but the only available
8319 // vector variant at this VF requires a mask, so we synthesize an
8320 // all-true mask.
8321 VPValue *Mask = nullptr;
8322 if (Legal->isMaskRequired(CI))
8323 Mask = getBlockInMask(CI->getParent());
8324 else
8326 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8327
8328 Ops.insert(Ops.begin() + *MaskPos, Mask);
8329 }
8330
8331 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8333 Variant);
8334 }
8335
8336 return nullptr;
8337}
8338
8339bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8340 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8341 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8342 // Instruction should be widened, unless it is scalar after vectorization,
8343 // scalarization is profitable or it is predicated.
8344 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8345 return CM.isScalarAfterVectorization(I, VF) ||
8346 CM.isProfitableToScalarize(I, VF) ||
8347 CM.isScalarWithPredication(I, VF);
8348 };
8350 Range);
8351}
8352
8353VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8355 VPBasicBlock *VPBB) {
8356 switch (I->getOpcode()) {
8357 default:
8358 return nullptr;
8359 case Instruction::SDiv:
8360 case Instruction::UDiv:
8361 case Instruction::SRem:
8362 case Instruction::URem: {
8363 // If not provably safe, use a select to form a safe divisor before widening the
8364 // div/rem operation itself. Otherwise fall through to general handling below.
8365 if (CM.isPredicatedInst(I)) {
8366 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8367 VPValue *Mask = getBlockInMask(I->getParent());
8368 VPValue *One =
8369 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8370 auto *SafeRHS =
8371 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8372 I->getDebugLoc());
8373 VPBB->appendRecipe(SafeRHS);
8374 Ops[1] = SafeRHS;
8375 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8376 }
8377 [[fallthrough]];
8378 }
8379 case Instruction::Add:
8380 case Instruction::And:
8381 case Instruction::AShr:
8382 case Instruction::FAdd:
8383 case Instruction::FCmp:
8384 case Instruction::FDiv:
8385 case Instruction::FMul:
8386 case Instruction::FNeg:
8387 case Instruction::FRem:
8388 case Instruction::FSub:
8389 case Instruction::ICmp:
8390 case Instruction::LShr:
8391 case Instruction::Mul:
8392 case Instruction::Or:
8393 case Instruction::Select:
8394 case Instruction::Shl:
8395 case Instruction::Sub:
8396 case Instruction::Xor:
8397 case Instruction::Freeze:
8398 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8399 };
8400}
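// Sketch of the safe-divisor transform above (illustrative VF of 4): a
// predicated division such as
//   %r = udiv i32 %x, %d    ; only reached when the lane's mask bit is set
// is widened as
//   %safe.d = select <4 x i1> %mask, <4 x i32> %wide.d,
//                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %r = udiv <4 x i32> %wide.x, %safe.d
// so masked-off lanes divide by 1 rather than by a possibly zero divisor.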
8401
8403 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8404 for (VPHeaderPHIRecipe *R : PhisToFix) {
8405 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8406 VPRecipeBase *IncR =
8407 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8408 R->addOperand(IncR->getVPSingleValue());
8409 }
8410}
8411
8413 VFRange &Range) {
8415 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8416 Range);
8417
8418 bool IsPredicated = CM.isPredicatedInst(I);
8419
8420 // Even if the instruction is not marked as uniform, there are certain
8421 // intrinsic calls that can be effectively treated as such, so we check for
8422 // them here. Conservatively, we only do this for scalable vectors, since
8423 // for fixed-width VFs we can always fall back on full scalarization.
8424 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8425 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8426 case Intrinsic::assume:
8427 case Intrinsic::lifetime_start:
8428 case Intrinsic::lifetime_end:
8429 // For scalable vectors if one of the operands is variant then we still
8430 // want to mark as uniform, which will generate one instruction for just
8431 // the first lane of the vector. We can't scalarize the call in the same
8432 // way as for fixed-width vectors because we don't know how many lanes
8433 // there are.
8434 //
8435 // The reasons for doing it this way for scalable vectors are:
8436 // 1. For the assume intrinsic generating the instruction for the first
8437 // lane is still better than not generating any at all. For
8438 // example, the input may be a splat across all lanes.
8439 // 2. For the lifetime start/end intrinsics the pointer operand only
8440 // does anything useful when the input comes from a stack object,
8441 // which suggests it should always be uniform. For non-stack objects
8442 // the effect is to poison the object, which still allows us to
8443 // remove the call.
8444 IsUniform = true;
8445 break;
8446 default:
8447 break;
8448 }
8449 }
8450 VPValue *BlockInMask = nullptr;
8451 if (!IsPredicated) {
8452 // Finalize the recipe for Instr, first if it is not predicated.
8453 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8454 } else {
8455 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8456 // Instructions marked for predication are replicated and a mask operand is
8457 // added initially. Masked replicate recipes will later be placed under an
8458 // if-then construct to prevent side-effects. Generate recipes to compute
8459 // the block mask for this region.
8460 BlockInMask = getBlockInMask(I->getParent());
8461 }
8462
8463 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8464 IsUniform, BlockInMask);
8465 return Recipe;
8466}
8467
8471 VFRange &Range, VPBasicBlock *VPBB) {
8472 // First, check for specific widening recipes that deal with inductions, Phi
8473 // nodes, calls and memory operations.
8474 VPRecipeBase *Recipe;
8475 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8476 if (Phi->getParent() != OrigLoop->getHeader())
8477 return tryToBlend(Phi, Operands);
8478
8479 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8480 return Recipe;
8481
8482 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8483 assert((Legal->isReductionVariable(Phi) ||
8484 Legal->isFixedOrderRecurrence(Phi)) &&
8485 "can only widen reductions and fixed-order recurrences here");
8486 VPValue *StartV = Operands[0];
8487 if (Legal->isReductionVariable(Phi)) {
8488 const RecurrenceDescriptor &RdxDesc =
8489 Legal->getReductionVars().find(Phi)->second;
8490 assert(RdxDesc.getRecurrenceStartValue() ==
8491 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8492 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8493 CM.isInLoopReduction(Phi),
8494 CM.useOrderedReductions(RdxDesc));
8495 } else {
8496 // TODO: Currently fixed-order recurrences are modeled as chains of
8497 // first-order recurrences. If there are no users of the intermediate
8498 // recurrences in the chain, the fixed order recurrence should be modeled
8499 // directly, enabling more efficient codegen.
8500 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8501 }
8502
8503 PhisToFix.push_back(PhiRecipe);
8504 return PhiRecipe;
8505 }
8506
8507 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8508 cast<TruncInst>(Instr), Operands, Range)))
8509 return Recipe;
8510
8511 // All widen recipes below deal only with VF > 1.
8513 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8514 return nullptr;
8515
8516 if (auto *CI = dyn_cast<CallInst>(Instr))
8517 return tryToWidenCall(CI, Operands, Range);
8518
8519 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8520 return tryToWidenMemory(Instr, Operands, Range);
8521
8522 if (!shouldWiden(Instr, Range))
8523 return nullptr;
8524
8525 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8526 return new VPWidenGEPRecipe(GEP,
8527 make_range(Operands.begin(), Operands.end()));
8528
8529 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8530 return new VPWidenSelectRecipe(
8531 *SI, make_range(Operands.begin(), Operands.end()));
8532 }
8533
8534 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8535 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8536 *CI);
8537 }
8538
8539 return tryToWiden(Instr, Operands, VPBB);
8540}
8541
8542void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8543 ElementCount MaxVF) {
8544 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8545
8546 auto MaxVFTimes2 = MaxVF * 2;
8547 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8548 VFRange SubRange = {VF, MaxVFTimes2};
8549 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8550 // Now optimize the initial VPlan.
8551 if (!Plan->hasVF(ElementCount::getFixed(1)))
8553 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8554 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8555 // TODO: try to put it close to addActiveLaneMask().
8556 if (CM.foldTailWithEVL())
8558 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8559 VPlans.push_back(std::move(Plan));
8560 }
8561 VF = SubRange.End;
8562 }
8563}
8564
8565// Add the necessary canonical IV and branch recipes required to control the
8566// loop.
8567static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8568 DebugLoc DL) {
8569 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8570 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8571
8572 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8573 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8574 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8575 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8576 Header->insert(CanonicalIVPHI, Header->begin());
8577
8578 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8579 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8580 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8581 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8582 "index.next");
8583 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8584
8585 // Add the BranchOnCount VPInstruction to the latch.
8587 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8588}
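// The recipes added above give the vector loop region roughly this shape
// (illustrative VPlan notation, not verbatim printer output):
//   vector.body:
//     EMIT vp<%iv> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
//     ...
//   vector.latch:
//     EMIT vp<%index.next> = add vp<%iv>, vp<%vf.x.uf>
//     EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>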
8589
8590// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8591// original exit block.
8592static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8593 VPRecipeBuilder &Builder, VPlan &Plan) {
8594 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8595 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8596 // Only handle single-exit loops with unique exit blocks for now.
8597 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8598 return;
8599
8600 // Introduce VPUsers modeling the exit values.
8601 for (PHINode &ExitPhi : ExitBB->phis()) {
8602 Value *IncomingValue =
8603 ExitPhi.getIncomingValueForBlock(ExitingBB);
8604 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8605 Plan.addLiveOut(&ExitPhi, V);
8606 }
8607}
8608
8610LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8611
8613
8614 // ---------------------------------------------------------------------------
8615 // Build initial VPlan: Scan the body of the loop in a topological order to
8616 // visit each basic block after having visited its predecessor basic blocks.
8617 // ---------------------------------------------------------------------------
8618
8619 // Create initial VPlan skeleton, having a basic block for the pre-header
8620 // which contains SCEV expansions that need to happen before the CFG is
8621 // modified; a basic block for the vector pre-header, followed by a region for
8622 // the vector loop, followed by the middle basic block. The skeleton vector
8623 // loop region contains a header and latch basic blocks.
8625 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8626 *PSE.getSE());
8627 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8628 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8629 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8630 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8631 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8632
8633 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8634 // so this function is better off being conservative, rather than splitting
8635 // it up into different VPlans.
8636 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8637 bool IVUpdateMayOverflow = false;
8638 for (ElementCount VF : Range)
8639 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8640
8642 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8643 // When not folding the tail, we know that the induction increment will not
8644 // overflow.
8645 bool HasNUW = Style == TailFoldingStyle::None;
8646 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8647
8648 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8649
8650 // ---------------------------------------------------------------------------
8651 // Pre-construction: record ingredients whose recipes we'll need to further
8652 // process after constructing the initial VPlan.
8653 // ---------------------------------------------------------------------------
8654
8655 // For each interleave group which is relevant for this (possibly trimmed)
8656 // Range, add it to the set of groups to be later applied to the VPlan and add
8657 // placeholders for its members' Recipes which we'll be replacing with a
8658 // single VPInterleaveRecipe.
8660 auto applyIG = [IG, this](ElementCount VF) -> bool {
8661 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8662 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8664 // For scalable vectors, the only interleave factor currently supported
8665 // is 2 since we require the (de)interleave2 intrinsics instead of
8666 // shufflevectors.
8667 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8668 "Unsupported interleave factor for scalable vectors");
8669 return Result;
8670 };
8671 if (!getDecisionAndClampRange(applyIG, Range))
8672 continue;
8673 InterleaveGroups.insert(IG);
8674 };
8675
8676 // ---------------------------------------------------------------------------
8677 // Construct recipes for the instructions in the loop
8678 // ---------------------------------------------------------------------------
8679
8680 // Scan the body of the loop in a topological order to visit each basic block
8681 // after having visited its predecessor basic blocks.
8682 LoopBlocksDFS DFS(OrigLoop);
8683 DFS.perform(LI);
8684
8685 VPBasicBlock *VPBB = HeaderVPBB;
8686 BasicBlock *HeaderBB = OrigLoop->getHeader();
8687 bool NeedsMasks =
8688 CM.foldTailByMasking() ||
8689 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8690 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8691 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8692 });
8693 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8694 // Relevant instructions from basic block BB will be grouped into VPRecipe
8695 // ingredients and fill a new VPBasicBlock.
8696 if (VPBB != HeaderVPBB)
8697 VPBB->setName(BB->getName());
8698 Builder.setInsertPoint(VPBB);
8699
8700 if (VPBB == HeaderVPBB)
8701 RecipeBuilder.createHeaderMask();
8702 else if (NeedsMasks)
8703 RecipeBuilder.createBlockInMask(BB);
8704
8705 // Introduce each ingredient into VPlan.
8706 // TODO: Model and preserve debug intrinsics in VPlan.
8707 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8708 Instruction *Instr = &I;
8710 auto *Phi = dyn_cast<PHINode>(Instr);
8711 if (Phi && Phi->getParent() == HeaderBB) {
8712 Operands.push_back(Plan->getOrAddLiveIn(
8713 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8714 } else {
8715 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8716 Operands = {OpRange.begin(), OpRange.end()};
8717 }
8718
8719 // Invariant stores inside the loop will be deleted and a single store
8720 // with the final reduction value will be added to the exit block.
8721 StoreInst *SI;
8722 if ((SI = dyn_cast<StoreInst>(&I)) &&
8723 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8724 continue;
8725
8726 VPRecipeBase *Recipe =
8727 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8728 if (!Recipe)
8729 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8730
8731 RecipeBuilder.setRecipe(Instr, Recipe);
8732 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8733 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8734 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8735 // recipes and need to be moved to the phi section of HeaderVPBB:
8736 // * tail-folding (non-phi recipes computing the header mask are
8737 // introduced earlier than regular header phi recipes, and should appear
8738 // after them)
8739 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8740
8741 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8742 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8743 "unexpected recipe needs moving");
8744 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8745 } else
8746 VPBB->appendRecipe(Recipe);
8747 }
8748
8750 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8751 }
8752
8753 // After here, VPBB should not be used.
8754 VPBB = nullptr;
8755
8756 if (CM.requiresScalarEpilogue(Range)) {
8757 // No edge from the middle block to the unique exit block has been inserted
8758 // and there is nothing to fix from the vector loop; phis should have incoming
8759 // values from the scalar loop only.
8760 } else
8761 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8762
8763 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8764 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8765 "entry block must be set to a VPRegionBlock having a non-empty entry "
8766 "VPBasicBlock");
8767 RecipeBuilder.fixHeaderPhis();
8768
8769 // ---------------------------------------------------------------------------
8770 // Transform initial VPlan: Apply previously taken decisions, in order, to
8771 // bring the VPlan to its final state.
8772 // ---------------------------------------------------------------------------
8773
8774 // Adjust the recipes for any inloop reductions.
8775 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8776
8777 // Interleave memory: for each Interleave Group we marked earlier as relevant
8778 // for this VPlan, replace the Recipes widening its memory instructions with a
8779 // single VPInterleaveRecipe at its insertion point.
8780 for (const auto *IG : InterleaveGroups) {
8781 auto *Recipe =
8782 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8783 SmallVector<VPValue *, 4> StoredValues;
8784 for (unsigned i = 0; i < IG->getFactor(); ++i)
8785 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8786 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8787 StoredValues.push_back(StoreR->getStoredValue());
8788 }
8789
8790 bool NeedsMaskForGaps =
8791 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8792 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8793 Recipe->getMask(), NeedsMaskForGaps);
8794 VPIG->insertBefore(Recipe);
8795 unsigned J = 0;
8796 for (unsigned i = 0; i < IG->getFactor(); ++i)
8797 if (Instruction *Member = IG->getMember(i)) {
8798 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8799 if (!Member->getType()->isVoidTy()) {
8800 VPValue *OriginalV = MemberR->getVPSingleValue();
8801 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8802 J++;
8803 }
8804 MemberR->eraseFromParent();
8805 }
8806 }
8807
8808 for (ElementCount VF : Range)
8809 Plan->addVF(VF);
8810 Plan->setName("Initial VPlan");
8811
8812 // Replace VPValues for known constant strides guaranteed by predicated scalar
8813 // evolution.
8814 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8815 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8816 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8817 // Only handle constant strides for now.
8818 if (!ScevStride)
8819 continue;
8820 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8821
8822 auto *ConstVPV = Plan->getOrAddLiveIn(CI);
8823 // The versioned value may not be used in the loop directly, so just add a
8824 // new live-in in those cases.
8825 Plan->getOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8826 }
8827
8829 return Legal->blockNeedsPredication(BB);
8830 });
8831
8832 // Sink users of fixed-order recurrence past the recipe defining the previous
8833 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8835 return nullptr;
8836
8837 if (useActiveLaneMask(Style)) {
8838 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8839 // TailFoldingStyle is visible there.
8840 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8841 bool WithoutRuntimeCheck =
8843 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8844 WithoutRuntimeCheck);
8845 }
8846 return Plan;
8847}
8848
8849VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8850 // Outer loop handling: outer loops may require CFG and instruction level
8851 // transformations before even evaluating whether vectorization is profitable.
8852 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8853 // the vectorization pipeline.
8854 assert(!OrigLoop->isInnermost());
8855 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8856
8857 // Create new empty VPlan
8858 auto Plan = VPlan::createInitialVPlan(
8859 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8860 *PSE.getSE());
8861
8862 // Build hierarchical CFG
8863 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8864 HCFGBuilder.buildHierarchicalCFG();
8865
8866 for (ElementCount VF : Range)
8867 Plan->addVF(VF);
8868
8870 Plan,
8871 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8872 *PSE.getSE(), *TLI);
8873
8874 // Remove the existing terminator of the exiting block of the top-most region.
8875 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8876 auto *Term =
8877 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8878 Term->eraseFromParent();
8879
8880 // Tail folding is not supported for outer loops, so the induction increment
8881 // is guaranteed to not wrap.
8882 bool HasNUW = true;
8883 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8884 DebugLoc());
8885 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8886 return Plan;
8887}
8888
8889// Adjust the recipes for reductions. For in-loop reductions the chain of
8890// instructions leading from the loop exit instr to the phi need to be converted
8891// to reductions, with one operand being vector and the other being the scalar
8892// reduction chain. For other reductions, a select is introduced between the phi
8893// and live-out recipes when folding the tail.
8894//
8895// A ComputeReductionResult recipe is added to the middle block, also for
8896// in-loop reductions which compute their result in-loop, because generating
8897// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8898void LoopVectorizationPlanner::adjustRecipesForReductions(
8899 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8900 ElementCount MinVF) {
8901 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8902 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8903 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8904 // sunk outside of the loop keep the same order as they had in the
8905 // original loop.
8906 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8907 for (VPRecipeBase &R : Header->phis()) {
8908 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8909 ReductionPHIList.emplace_back(ReductionPhi);
8910 }
8911 bool HasIntermediateStore = false;
8912 stable_sort(ReductionPHIList,
8913 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8914 const VPReductionPHIRecipe *R2) {
8915 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8916 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8917 HasIntermediateStore |= IS1 || IS2;
8918
8919 // If neither of the recipes has an intermediate store, keep the
8920 // order the same.
8921 if (!IS1 && !IS2)
8922 return false;
8923
8924 // If only one of the recipes has an intermediate store, then
8925 // move it towards the beginning of the list.
8926 if (IS1 && !IS2)
8927 return true;
8928
8929 if (!IS1 && IS2)
8930 return false;
8931
8932 // If both recipes have an intermediate store, then the recipe
8933 // with the later store should be processed earlier. So it
8934 // should go to the beginning of the list.
8935 return DT->dominates(IS2, IS1);
8936 });
8937
8938 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8939 for (VPRecipeBase *R : ReductionPHIList)
8940 R->moveBefore(*Header, Header->getFirstNonPhi());
8941
8942 for (VPRecipeBase &R : Header->phis()) {
8943 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8944 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8945 continue;
8946
8947 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8948 RecurKind Kind = RdxDesc.getRecurrenceKind();
8950 "AnyOf reductions are not allowed for in-loop reductions");
8951
8952 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8954 Worklist.insert(PhiR);
8955 for (unsigned I = 0; I != Worklist.size(); ++I) {
8956 VPSingleDefRecipe *Cur = Worklist[I];
8957 for (VPUser *U : Cur->users()) {
8958 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8959 if (!UserRecipe) {
8960 assert(isa<VPLiveOut>(U) &&
8961 "U must either be a VPSingleDef or VPLiveOut");
8962 continue;
8963 }
8964 Worklist.insert(UserRecipe);
8965 }
8966 }
8967
8968 // Visit operation "Links" along the reduction chain top-down starting from
8969 // the phi until LoopExitValue. We keep track of the previous item
8970 // (PreviousLink) to tell which of the two operands of a Link will remain
8971 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8972 // the select instruction. Blend recipes of in-loop reduction phis will
8973 // get folded to their non-phi operand, as the reduction recipe handles the
8974 // condition directly.
8975 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8976 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8977 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8978
8979 // Index of the first operand which holds a non-mask vector operand.
8980 unsigned IndexOfFirstOperand;
8981 // Recognize a call to the llvm.fmuladd intrinsic.
8982 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8983 VPValue *VecOp;
8984 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8985 if (IsFMulAdd) {
8986 assert(
8988 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8989 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8990 isa<VPWidenCallRecipe>(CurrentLink)) &&
8991 CurrentLink->getOperand(2) == PreviousLink &&
8992 "expected a call where the previous link is the added operand");
8993
8994 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8995 // need to create an fmul recipe (multiplying the first two operands of
8996 // the fmuladd together) to use as the vector operand for the fadd
8997 // reduction.
8998 VPInstruction *FMulRecipe = new VPInstruction(
8999 Instruction::FMul,
9000 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9001 CurrentLinkI->getFastMathFlags());
9002 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9003 VecOp = FMulRecipe;
9004 } else {
9005 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9006 if (PhiR->isInLoop() && Blend) {
9007 assert(Blend->getNumIncomingValues() == 2 &&
9008 "Blend must have 2 incoming values");
9009 if (Blend->getIncomingValue(0) == PhiR)
9010 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9011 else {
9012 assert(Blend->getIncomingValue(1) == PhiR &&
9013 "PhiR must be an operand of the blend");
9014 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9015 }
9016 continue;
9017 }
9018
9020 if (isa<VPWidenRecipe>(CurrentLink)) {
9021 assert(isa<CmpInst>(CurrentLinkI) &&
9022 "need to have the compare of the select");
9023 continue;
9024 }
9025 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9026 "must be a select recipe");
9027 IndexOfFirstOperand = 1;
9028 } else {
9029 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9030 "Expected to replace a VPWidenSC");
9031 IndexOfFirstOperand = 0;
9032 }
9033 // Note that for non-commutable operands (cmp-selects), the semantics of
9034 // the cmp-select are captured in the recurrence kind.
9035 unsigned VecOpId =
9036 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9037 ? IndexOfFirstOperand + 1
9038 : IndexOfFirstOperand;
9039 VecOp = CurrentLink->getOperand(VecOpId);
9040 assert(VecOp != PreviousLink &&
9041 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9042 (VecOpId - IndexOfFirstOperand)) ==
9043 PreviousLink &&
9044 "PreviousLink must be the operand other than VecOp");
9045 }
9046
9047 BasicBlock *BB = CurrentLinkI->getParent();
9048 VPValue *CondOp = nullptr;
9050 CondOp = RecipeBuilder.getBlockInMask(BB);
9051
9052 VPReductionRecipe *RedRecipe =
9053 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9054 CondOp, CM.useOrderedReductions(RdxDesc));
9055 // Append the recipe to the end of the VPBasicBlock because we need to
9056 // ensure that it comes after all of its inputs, including CondOp.
9057 // Note that this transformation may leave over dead recipes (including
9058 // CurrentLink), which will be cleaned by a later VPlan transform.
9059 LinkVPBB->appendRecipe(RedRecipe);
9060 CurrentLink->replaceAllUsesWith(RedRecipe);
9061 PreviousLink = RedRecipe;
9062 }
9063 }
9064 Builder.setInsertPoint(&*LatchVPBB->begin());
9065 for (VPRecipeBase &R :
9066 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9067 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9068 if (!PhiR)
9069 continue;
9070
9071 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9072 // If tail is folded by masking, introduce selects between the phi
9073 // and the live-out instruction of each reduction, at the beginning of the
9074 // dedicated latch block.
9075 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9076 auto *NewExitingVPV = PhiR->getBackedgeValue();
9077 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9078 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9079 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9080 "reduction recipe must be defined before latch");
9081 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9082 std::optional<FastMathFlags> FMFs =
9083 PhiTy->isFloatingPointTy()
9084 ? std::make_optional(RdxDesc.getFastMathFlags())
9085 : std::nullopt;
9086 NewExitingVPV =
9087 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9088 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9089 return isa<VPInstruction>(&U) &&
9090 cast<VPInstruction>(&U)->getOpcode() ==
9092 });
9095 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9097 PhiR->setOperand(1, NewExitingVPV);
9098 }
9099
9100 // If the vector reduction can be performed in a smaller type, we truncate
9101 // then extend the loop exit value to enable InstCombine to evaluate the
9102 // entire expression in the smaller type.
9103 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9104 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9105 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9106 Type *RdxTy = RdxDesc.getRecurrenceType();
9107 auto *Trunc =
9108 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9109 auto *Extnd =
9110 RdxDesc.isSigned()
9111 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9112 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9113
9114 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9115 Extnd->insertAfter(Trunc);
9116 if (PhiR->getOperand(1) == NewExitingVPV)
9117 PhiR->setOperand(1, Extnd->getVPSingleValue());
9118 NewExitingVPV = Extnd;
9119 }
9120
9121 // We want code in the middle block to appear to execute on the location of
9122 // the scalar loop's latch terminator because: (a) it is all compiler
9123 // generated, (b) these instructions are always executed after evaluating
9124 // the latch conditional branch, and (c) other passes may add new
9125 // predecessors which terminate on this line. This is the easiest way to
9126 // ensure we don't accidentally cause an extra step back into the loop while
9127 // debugging.
9128 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9129
9130 // TODO: At the moment ComputeReductionResult also drives creation of the
9131 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9132 // even for in-loop reductions, until the reduction resume value handling is
9133 // also modeled in VPlan.
9134 auto *FinalReductionResult = new VPInstruction(
9135 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9136 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9137 ->appendRecipe(FinalReductionResult);
9138 OrigExitingVPV->replaceUsesWithIf(
9139 FinalReductionResult,
9140 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9141 }
9142
9144}
9145
9146#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 9147 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
 9148                                VPSlotTracker &SlotTracker) const {
9149 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9150 IG->getInsertPos()->printAsOperand(O, false);
9151 O << ", ";
 9152   getAddr()->printAsOperand(O, SlotTracker);
 9153   VPValue *Mask = getMask();
9154 if (Mask) {
9155 O << ", ";
9156 Mask->printAsOperand(O, SlotTracker);
9157 }
9158
9159 unsigned OpIdx = 0;
9160 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9161 if (!IG->getMember(i))
9162 continue;
9163 if (getNumStoreOperands() > 0) {
9164 O << "\n" << Indent << " store ";
9165 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9166 O << " to index " << i;
9167 } else {
9168 O << "\n" << Indent << " ";
 9169       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
 9170       O << " = load from index " << i;
9171 }
9172 ++OpIdx;
9173 }
9174}
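// Example (added; an illustrative rendering only, exact operand names depend
// on the VPlan being printed): for a two-member load group the code above
// produces output roughly of the form
//   INTERLEAVE-GROUP with factor 2 at %l0, ir<%gep>
//     vp<%5> = load from index 0
//     vp<%6> = load from index 1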
9175#endif
9176
 9177 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 9178   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
 9179          "Not a pointer induction according to InductionDescriptor!");
9180 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9181 "Unexpected type.");
 9182   assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
 9183          "Recipe should have been replaced");
9184
9185 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9186 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9187 Type *PhiType = IndDesc.getStep()->getType();
9188
9189 // Build a pointer phi
9190 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9191 Type *ScStValueType = ScalarStartValue->getType();
9192 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9193 CanonicalIV->getIterator());
9194
9195 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9196 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9197
9198 // A pointer induction, performed by using a gep
9199 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9200
9201 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9202 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9203 Value *NumUnrolledElems =
9204 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9205 Value *InductionGEP = GetElementPtrInst::Create(
9206 State.Builder.getInt8Ty(), NewPointerPhi,
9207 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9208 InductionLoc);
9209 // Add induction update using an incorrect block temporarily. The phi node
9210 // will be fixed after VPlan execution. Note that at this point the latch
9211 // block cannot be used, as it does not exist yet.
9212 // TODO: Model increment value in VPlan, by turning the recipe into a
9213 // multi-def and a subclass of VPHeaderPHIRecipe.
9214 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9215
9216 // Create UF many actual address geps that use the pointer
9217 // phi as base and a vectorized version of the step value
9218 // (<step*0, ..., step*N>) as offset.
9219 for (unsigned Part = 0; Part < State.UF; ++Part) {
9220 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9221 Value *StartOffsetScalar =
9222 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9223 Value *StartOffset =
9224 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9225 // Create a vector of consecutive numbers from zero to VF.
9226 StartOffset = State.Builder.CreateAdd(
9227 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9228
9229 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9230 "scalar step must be the same across all parts");
9231 Value *GEP = State.Builder.CreateGEP(
9232 State.Builder.getInt8Ty(), NewPointerPhi,
9233 State.Builder.CreateMul(
9234 StartOffset,
9235 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9236 "vector.gep"));
9237 State.set(this, GEP, Part);
9238 }
9239}
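// Sketch of the IR shape this produces (added for illustration; VF=4, UF=1,
// placeholder names, byte step %step):
//   vector.body:
//     %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, %vector.body ]
//     %off         = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, splat(%step)
//     %vector.gep  = getelementptr i8, ptr %pointer.phi, <4 x i64> %off
//     ...
//     %ptr.ind     = getelementptr i8, ptr %pointer.phi, i64 (%step * 4)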
9240
 9241 void VPDerivedIVRecipe::execute(VPTransformState &State) {
 9242   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9243
9244 // Fast-math-flags propagate from the original induction instruction.
 9245   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
 9246   if (FPBinOp)
9247 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9248
9249 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9250 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9251 Value *DerivedIV = emitTransformedIndex(
9252 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9253 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9254 DerivedIV->setName("offset.idx");
9255 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9256
9257 State.set(this, DerivedIV, VPIteration(0, 0));
9258}
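// Note (added for illustration): for a plain integer induction with start
// %start and step %step, the transformed index computed above is essentially
//   %offset.idx = %start + %canonical.iv * %step
// floating-point inductions instead use the recorded binary operator and its
// fast-math flags.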
9259
 9260 void VPInterleaveRecipe::execute(VPTransformState &State) {
 9261   assert(!State.Instance && "Interleave group being replicated.");
9262 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
 9263                                       getStoredValues(), getMask(),
 9264                                       NeedsMaskForGaps);
9265}
9266
 9267 void VPReplicateRecipe::execute(VPTransformState &State) {
 9268   Instruction *UI = getUnderlyingInstr();
 9269   if (State.Instance) { // Generate a single instance.
9270 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9271 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9272 // Insert scalar instance packing it into a vector.
9273 if (State.VF.isVector() && shouldPack()) {
9274 // If we're constructing lane 0, initialize to start from poison.
9275 if (State.Instance->Lane.isFirstLane()) {
9276 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9277 Value *Poison = PoisonValue::get(
9278 VectorType::get(UI->getType(), State.VF));
9279 State.set(this, Poison, State.Instance->Part);
9280 }
9281 State.packScalarIntoVectorValue(this, *State.Instance);
9282 }
9283 return;
9284 }
9285
9286 if (IsUniform) {
9287 // If the recipe is uniform across all parts (instead of just per VF), only
9288 // generate a single instance.
9289 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9290 all_of(operands(), [](VPValue *Op) {
9291 return Op->isDefinedOutsideVectorRegions();
9292 })) {
9293 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9294 if (user_begin() != user_end()) {
9295 for (unsigned Part = 1; Part < State.UF; ++Part)
9296 State.set(this, State.get(this, VPIteration(0, 0)),
9297 VPIteration(Part, 0));
9298 }
9299 return;
9300 }
9301
9302 // Uniform within VL means we need to generate lane 0 only for each
9303 // unrolled copy.
9304 for (unsigned Part = 0; Part < State.UF; ++Part)
9305 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9306 return;
9307 }
9308
9309 // A store of a loop varying value to a uniform address only needs the last
9310 // copy of the store.
9311 if (isa<StoreInst>(UI) &&
 9312       vputils::isUniformAfterVectorization(getOperand(1))) {
 9313     auto Lane = VPLane::getLastLaneForVF(State.VF);
9314 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9315 State);
9316 return;
9317 }
9318
9319 // Generate scalar instances for all VF lanes of all UF parts.
9320 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9321 const unsigned EndLane = State.VF.getKnownMinValue();
9322 for (unsigned Part = 0; Part < State.UF; ++Part)
9323 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9324 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9325}
9326
 9327 void VPWidenLoadRecipe::execute(VPTransformState &State) {
 9328   auto *LI = cast<LoadInst>(&Ingredient);
9329
9330 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9331 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9332 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9333 bool CreateGather = !isConsecutive();
9334
9335 auto &Builder = State.Builder;
9337 for (unsigned Part = 0; Part < State.UF; ++Part) {
9338 Value *NewLI;
9339 Value *Mask = nullptr;
9340 if (auto *VPMask = getMask()) {
9341 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9342 // of a null all-one mask is a null mask.
9343 Mask = State.get(VPMask, Part);
9344 if (isReverse())
9345 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9346 }
9347
9348 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9349 if (CreateGather) {
9350 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9351 "wide.masked.gather");
9352 } else if (Mask) {
9353 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9354 PoisonValue::get(DataTy),
9355 "wide.masked.load");
9356 } else {
9357 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9358 }
9359 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9360 State.addMetadata(NewLI, LI);
9361 if (Reverse)
9362 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9363 State.set(this, NewLI, Part);
9364 }
9365}
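// Sketch of the emitted IR (added for illustration; VF = 4 x i32, placeholder
// names): depending on consecutiveness and masking, each part becomes one of
//   %wide.load          = load <4 x i32>, ptr %addr, align 4
//   %wide.masked.load   = call <4 x i32> @llvm.masked.load.v4i32.p0(
//                             ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)
//   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(
//                             <4 x ptr> %addrs, i32 4, <4 x i1> %mask,
//                             <4 x i32> poison)
// with an extra vector reverse for reverse accesses.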
9366
 9367 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
 9368   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9369 "explicit vector length.");
9370 // FIXME: Support reverse loading after vp_reverse is added.
9371 assert(!isReverse() && "Reverse loads are not implemented yet.");
9372
9373 auto *LI = cast<LoadInst>(&Ingredient);
9374
9375 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9376 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9377 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9378 bool CreateGather = !isConsecutive();
9379
9380 auto &Builder = State.Builder;
9382 CallInst *NewLI;
9383 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9384 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9385 Value *Mask = getMask()
9386 ? State.get(getMask(), 0)
9387 : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9388 if (CreateGather) {
9389 NewLI =
9390 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9391 nullptr, "wide.masked.gather");
9392 } else {
9393 VectorBuilder VBuilder(Builder);
9394 VBuilder.setEVL(EVL).setMask(Mask);
9395 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9396 Instruction::Load, DataTy, Addr, "vp.op.load"));
9397 }
9398 NewLI->addParamAttr(
9399 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9400 State.addMetadata(NewLI, LI);
9401 State.set(this, NewLI, 0);
9402}
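// Sketch of the emitted IR (added for illustration; scalable VF, placeholder
// names): the consecutive case lowers to the vector-predication load
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//                     ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)
// and the non-consecutive case to @llvm.vp.gather, with the explicit vector
// length %evl bounding the number of active lanes in this iteration.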
9403
 9404 void VPWidenStoreRecipe::execute(VPTransformState &State) {
 9405   auto *SI = cast<StoreInst>(&Ingredient);
9406
9407 VPValue *StoredVPValue = getStoredValue();
9408 bool CreateScatter = !isConsecutive();
9409 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9410
9411 auto &Builder = State.Builder;
9413
9414 for (unsigned Part = 0; Part < State.UF; ++Part) {
9415 Instruction *NewSI = nullptr;
9416 Value *Mask = nullptr;
9417 if (auto *VPMask = getMask()) {
9418 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9419 // of a null all-one mask is a null mask.
9420 Mask = State.get(VPMask, Part);
9421 if (isReverse())
9422 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9423 }
9424
9425 Value *StoredVal = State.get(StoredVPValue, Part);
9426 if (isReverse()) {
9427 // If we store to reverse consecutive memory locations, then we need
9428 // to reverse the order of elements in the stored value.
9429 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9430 // We don't want to update the value in the map as it might be used in
9431 // another expression. So don't call resetVectorValue(StoredVal).
9432 }
9433 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9434 if (CreateScatter)
9435 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9436 else if (Mask)
9437 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9438 else
9439 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9440 State.addMetadata(NewSI, SI);
9441 }
9442}
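// Sketch of the emitted IR (added for illustration; VF = 4 x i32, placeholder
// names): mirroring the load case, each part becomes one of
//   store <4 x i32> %val, ptr %addr, align 4
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4,
//                                         <4 x i1> %mask)
//   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %addrs,
//                                             i32 4, <4 x i1> %mask)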
9443
 9444 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
 9445   assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9446 "explicit vector length.");
 9447   // FIXME: Support reverse stores after vp_reverse is added.
 9448   assert(!isReverse() && "Reverse stores are not implemented yet.");
9449
9450 auto *SI = cast<StoreInst>(&Ingredient);
9451
9452 VPValue *StoredValue = getStoredValue();
9453 bool CreateScatter = !isConsecutive();
9454 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9455
9456 auto &Builder = State.Builder;
9458
9459 CallInst *NewSI = nullptr;
9460 Value *StoredVal = State.get(StoredValue, 0);
9461 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9462 // FIXME: Support reverse store after vp_reverse is added.
9463 Value *Mask = getMask()
9464 ? State.get(getMask(), 0)
9465 : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9466 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9467 if (CreateScatter) {
9468 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9469 Intrinsic::vp_scatter,
9470 {StoredVal, Addr, Mask, EVL});
9471 } else {
9472 VectorBuilder VBuilder(Builder);
9473 VBuilder.setEVL(EVL).setMask(Mask);
9474 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9475 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9476 {StoredVal, Addr}));
9477 }
9478 NewSI->addParamAttr(
9479 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9480 State.addMetadata(NewSI, SI);
9481}
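// Sketch of the emitted IR (added for illustration; scalable VF, placeholder
// names): the EVL-based path emits
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
//                                       <vscale x 4 x i1> %mask, i32 %evl)
// or @llvm.vp.scatter for non-consecutive addresses.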
9482
9483// Determine how to lower the scalar epilogue, which depends on 1) optimising
9484// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9485// predication, and 4) a TTI hook that analyses whether the loop is suitable
9486// for predication.
 9487 static ScalarEpilogueLowering getScalarEpilogueLowering(
 9488     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
 9489     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
 9490     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
 9491   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9492 // don't look at hints or options, and don't request a scalar epilogue.
9493 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9494 // LoopAccessInfo (due to code dependency and not being able to reliably get
9495 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9496 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9497 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9498 // back to the old way and vectorize with versioning when forced. See D81345.)
9499 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
 9500                                                       PGSOQueryType::IRPass) &&
 9501                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
 9502     return CM_ScalarEpilogueNotAllowedOptSize;
 9503 
9504 // 2) If set, obey the directives
9505 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
 9506     switch (PreferPredicateOverEpilogue) {
 9507     case PreferPredicateTy::ScalarEpilogue:
 9508       return CM_ScalarEpilogueAllowed;
 9509     case PreferPredicateTy::PredicateElseScalarEpilogue:
 9510       return CM_ScalarEpilogueNotNeededUsePredicate;
 9511     case PreferPredicateTy::PredicateOrDontVectorize:
 9512       return CM_ScalarEpilogueNotAllowedUsePredicate;
 9513     };
9514 }
9515
9516 // 3) If set, obey the hints
9517 switch (Hints.getPredicate()) {
 9518   case LoopVectorizeHints::FK_Enabled:
 9519     return CM_ScalarEpilogueNotNeededUsePredicate;
 9520   case LoopVectorizeHints::FK_Disabled:
 9521     return CM_ScalarEpilogueAllowed;
 9522   };
9523
9524 // 4) if the TTI hook indicates this is profitable, request predication.
 9525   TailFoldingInfo TFI(TLI, &LVL, IAI);
 9526   if (TTI->preferPredicateOverEpilogue(&TFI))
 9527     return CM_ScalarEpilogueNotNeededUsePredicate;
 9528 
 9529   return CM_ScalarEpilogueAllowed;
 9530 }
9531
9532// Process the loop in the VPlan-native vectorization path. This path builds
9533// VPlan upfront in the vectorization pipeline, which allows to apply
9534// VPlan-to-VPlan transformations from the very beginning without modifying the
9535// input LLVM IR.
 9536 static bool processLoopInVPlanNativePath(
 9537     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
 9538     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
 9539     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
 9540     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
 9541     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
 9542     LoopVectorizationRequirements &Requirements) {
9543
9544 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9545 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9546 return false;
9547 }
9548 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9549 Function *F = L->getHeader()->getParent();
9550 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9551
 9552   ScalarEpilogueLowering SEL =
 9553       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9554
9555 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9556 &Hints, IAI);
9557 // Use the planner for outer loop vectorization.
9558 // TODO: CM is not used at this point inside the planner. Turn CM into an
9559 // optional argument if we don't need it in the future.
9560 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9561 ORE);
9562
9563 // Get user vectorization factor.
9564 ElementCount UserVF = Hints.getWidth();
9565
9567
9568 // Plan how to best vectorize, return the best VF and its cost.
9569 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9570
9571 // If we are stress testing VPlan builds, do not attempt to generate vector
9572 // code. Masked vector code generation support will follow soon.
9573 // Also, do not attempt to vectorize if no vector code will be produced.
 9574   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
 9575     return false;
9576
9577 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9578
9579 {
9580 bool AddBranchWeights =
9581 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9582 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9583 F->getParent()->getDataLayout(), AddBranchWeights);
9584 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9585 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9586 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9587 << L->getHeader()->getParent()->getName() << "\"\n");
9588 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9589 }
9590
9591 reportVectorization(ORE, L, VF, 1);
9592
9593 // Mark the loop as already vectorized to avoid vectorizing again.
9594 Hints.setAlreadyVectorized();
9595 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9596 return true;
9597}
9598
9599// Emit a remark if there are stores to floats that required a floating point
9600// extension. If the vectorized loop was generated with floating point there
9601// will be a performance penalty from the conversion overhead and the change in
9602// the vector width.
 9603 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
 9604   SmallVector<Instruction *, 4> Worklist;
 9605   for (BasicBlock *BB : L->getBlocks()) {
9606 for (Instruction &Inst : *BB) {
9607 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9608 if (S->getValueOperand()->getType()->isFloatTy())
9609 Worklist.push_back(S);
9610 }
9611 }
9612 }
9613
9614 // Traverse the floating point stores upwards searching, for floating point
9615 // conversions.
 9616   SmallPtrSet<const Instruction *, 4> Visited;
 9617   SmallPtrSet<const Instruction *, 4> EmittedRemark;
 9618   while (!Worklist.empty()) {
9619 auto *I = Worklist.pop_back_val();
9620 if (!L->contains(I))
9621 continue;
9622 if (!Visited.insert(I).second)
9623 continue;
9624
9625 // Emit a remark if the floating point store required a floating
9626 // point conversion.
9627 // TODO: More work could be done to identify the root cause such as a
9628 // constant or a function return type and point the user to it.
9629 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9630 ORE->emit([&]() {
9631 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9632 I->getDebugLoc(), L->getHeader())
9633 << "floating point conversion changes vector width. "
9634 << "Mixed floating point precision requires an up/down "
9635 << "cast that will negatively impact performance.";
9636 });
9637
9638 for (Use &Op : I->operands())
9639 if (auto *OpI = dyn_cast<Instruction>(Op))
9640 Worklist.push_back(OpI);
9641 }
9642}
9643
9644static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
 9645                                        VectorizationFactor &VF,
 9646                                        std::optional<unsigned> VScale, Loop *L,
9647 ScalarEvolution &SE,
 9648                                        ScalarEpilogueLowering SEL) {
 9649   InstructionCost CheckCost = Checks.getCost();
9650 if (!CheckCost.isValid())
9651 return false;
9652
9653 // When interleaving only scalar and vector cost will be equal, which in turn
9654 // would lead to a divide by 0. Fall back to hard threshold.
9655 if (VF.Width.isScalar()) {
9656 if (CheckCost > VectorizeMemoryCheckThreshold) {
9657 LLVM_DEBUG(
9658 dbgs()
9659 << "LV: Interleaving only is not profitable due to runtime checks\n");
9660 return false;
9661 }
9662 return true;
9663 }
9664
9665 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9666 uint64_t ScalarC = *VF.ScalarCost.getValue();
9667 if (ScalarC == 0)
9668 return true;
9669
9670 // First, compute the minimum iteration count required so that the vector
9671 // loop outperforms the scalar loop.
9672 // The total cost of the scalar loop is
9673 // ScalarC * TC
9674 // where
9675 // * TC is the actual trip count of the loop.
9676 // * ScalarC is the cost of a single scalar iteration.
9677 //
9678 // The total cost of the vector loop is
9679 // RtC + VecC * (TC / VF) + EpiC
9680 // where
9681 // * RtC is the cost of the generated runtime checks
9682 // * VecC is the cost of a single vector iteration.
9683 // * TC is the actual trip count of the loop
9684 // * VF is the vectorization factor
9685 // * EpiCost is the cost of the generated epilogue, including the cost
9686 // of the remaining scalar operations.
9687 //
9688 // Vectorization is profitable once the total vector cost is less than the
9689 // total scalar cost:
9690 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9691 //
9692 // Now we can compute the minimum required trip count TC as
9693 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9694 //
9695 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9696 // the computations are performed on doubles, not integers and the result
9697 // is rounded up, hence we get an upper estimate of the TC.
9698 unsigned IntVF = VF.Width.getKnownMinValue();
9699 if (VF.Width.isScalable()) {
9700 unsigned AssumedMinimumVscale = 1;
9701 if (VScale)
9702 AssumedMinimumVscale = *VScale;
9703 IntVF *= AssumedMinimumVscale;
9704 }
9705 uint64_t RtC = *CheckCost.getValue();
9706 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9707 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9708
9709 // Second, compute a minimum iteration count so that the cost of the
9710 // runtime checks is only a fraction of the total scalar loop cost. This
9711 // adds a loop-dependent bound on the overhead incurred if the runtime
9712 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9713 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9714 // cost, compute
9715 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9716 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
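  // Worked example (added; the numbers are purely illustrative): with
  // ScalarC = 4, VecC = 8, IntVF = 4 and RtC = 24,
  //   MinTC1 = ceil(24 * 4 / (4 * 4 - 8)) = ceil(96 / 8) = 12
  //   MinTC2 = ceil(24 * 10 / 4)          = 60
  // so the runtime checks only pay off once the loop runs for at least
  // max(12, 60) = 60 iterations.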
9717
9718 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9719 // epilogue is allowed, choose the next closest multiple of VF. This should
9720 // partly compensate for ignoring the epilogue cost.
9721 uint64_t MinTC = std::max(MinTC1, MinTC2);
9722 if (SEL == CM_ScalarEpilogueAllowed)
9723 MinTC = alignTo(MinTC, IntVF);
 9724   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
 9725 
9726 LLVM_DEBUG(
9727 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9728 << VF.MinProfitableTripCount << "\n");
9729
9730 // Skip vectorization if the expected trip count is less than the minimum
9731 // required trip count.
9732 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
 9733     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
 9734                                 VF.MinProfitableTripCount)) {
 9735       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9736 "trip count < minimum profitable VF ("
9737 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9738 << ")\n");
9739
9740 return false;
9741 }
9742 }
9743 return true;
9744}
9745
 9746 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
 9747     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
 9748                                !EnableLoopInterleaving),
 9749       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
 9750                               !EnableLoopVectorization) {}
 9751 
 9752 bool LoopVectorizePass::processLoop(Loop *L) {
9753 assert((EnableVPlanNativePath || L->isInnermost()) &&
9754 "VPlan-native path is not enabled. Only process inner loops.");
9755
9756#ifndef NDEBUG
9757 const std::string DebugLocStr = getDebugLocString(L);
9758#endif /* NDEBUG */
9759
9760 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9761 << L->getHeader()->getParent()->getName() << "' from "
9762 << DebugLocStr << "\n");
9763
9764 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9765
9766 LLVM_DEBUG(
9767 dbgs() << "LV: Loop hints:"
9768 << " force="
 9769              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
 9770                      ? "disabled"
 9771                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
 9772                             ? "enabled"
9773 : "?"))
9774 << " width=" << Hints.getWidth()
9775 << " interleave=" << Hints.getInterleave() << "\n");
9776
9777 // Function containing loop
9778 Function *F = L->getHeader()->getParent();
9779
9780 // Looking at the diagnostic output is the only way to determine if a loop
9781 // was vectorized (other than looking at the IR or machine code), so it
9782 // is important to generate an optimization remark for each loop. Most of
9783 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9784 // generated as OptimizationRemark and OptimizationRemarkMissed are
9785 // less verbose reporting vectorized loops and unvectorized loops that may
9786 // benefit from vectorization, respectively.
9787
9788 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9789 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9790 return false;
9791 }
9792
9793 PredicatedScalarEvolution PSE(*SE, *L);
9794
9795 // Check if it is legal to vectorize the loop.
9796 LoopVectorizationRequirements Requirements;
9797 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9798 &Requirements, &Hints, DB, AC, BFI, PSI);
 9799   if (!LVL.canVectorize(EnableVPlanNativePath)) {
 9800     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9801 Hints.emitRemarkWithHints();
9802 return false;
9803 }
9804
9805 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9806 // here. They may require CFG and instruction level transformations before
9807 // even evaluating whether vectorization is profitable. Since we cannot modify
9808 // the incoming IR, we need to build VPlan upfront in the vectorization
9809 // pipeline.
9810 if (!L->isInnermost())
9811 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9812 ORE, BFI, PSI, Hints, Requirements);
9813
9814 assert(L->isInnermost() && "Inner loop expected.");
9815
9816 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9817 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9818
9819 // If an override option has been passed in for interleaved accesses, use it.
9820 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9821 UseInterleaved = EnableInterleavedMemAccesses;
9822
9823 // Analyze interleaved memory accesses.
9824 if (UseInterleaved)
 9825     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
 9826 
9827 // Check the function attributes and profiles to find out if this function
9828 // should be optimized for size.
 9829   ScalarEpilogueLowering SEL =
 9830       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9831
9832 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9833 // count by optimizing for size, to minimize overheads.
9834 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9835 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9836 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9837 << "This loop is worth vectorizing only if no scalar "
9838 << "iteration overheads are incurred.");
 9839     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
 9840       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9841 else {
9842 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9843 LLVM_DEBUG(dbgs() << "\n");
9844 // Predicate tail-folded loops are efficient even when the loop
9845 // iteration count is low. However, setting the epilogue policy to
9846 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9847 // with runtime checks. It's more effective to let
9848 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9849 // for the loop.
 9850         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
 9851           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
 9852       } else {
9853 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9854 "small to consider vectorizing.\n");
 9855         reportVectorizationFailure(
 9856             "The trip count is below the minimal threshold value.",
9857 "loop trip count is too low, avoiding vectorization",
9858 "LowTripCount", ORE, L);
9859 Hints.emitRemarkWithHints();
9860 return false;
9861 }
9862 }
9863 }
9864
9865 // Check the function attributes to see if implicit floats or vectors are
9866 // allowed.
9867 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
 9868     reportVectorizationFailure(
 9869         "Can't vectorize when the NoImplicitFloat attribute is used",
9870 "loop not vectorized due to NoImplicitFloat attribute",
9871 "NoImplicitFloat", ORE, L);
9872 Hints.emitRemarkWithHints();
9873 return false;
9874 }
9875
9876 // Check if the target supports potentially unsafe FP vectorization.
9877 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9878 // for the target we're vectorizing for, to make sure none of the
9879 // additional fp-math flags can help.
9880 if (Hints.isPotentiallyUnsafe() &&
 9881       TTI->isFPVectorizationPotentiallyUnsafe()) {
 9882     reportVectorizationFailure(
 9883         "Potentially unsafe FP op prevents vectorization",
9884 "loop not vectorized due to unsafe FP support.",
9885 "UnsafeFP", ORE, L);
9886 Hints.emitRemarkWithHints();
9887 return false;
9888 }
9889
9890 bool AllowOrderedReductions;
9891 // If the flag is set, use that instead and override the TTI behaviour.
9892 if (ForceOrderedReductions.getNumOccurrences() > 0)
9893 AllowOrderedReductions = ForceOrderedReductions;
9894 else
9895 AllowOrderedReductions = TTI->enableOrderedReductions();
9896 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9897 ORE->emit([&]() {
9898 auto *ExactFPMathInst = Requirements.getExactFPInst();
9899 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9900 ExactFPMathInst->getDebugLoc(),
9901 ExactFPMathInst->getParent())
9902 << "loop not vectorized: cannot prove it is safe to reorder "
9903 "floating-point operations";
9904 });
9905 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9906 "reorder floating-point operations\n");
9907 Hints.emitRemarkWithHints();
9908 return false;
9909 }
9910
9911 // Use the cost model.
9912 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9913 F, &Hints, IAI);
9914 // Use the planner for vectorization.
9915 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9916 ORE);
9917
9918 // Get user vectorization factor and interleave count.
9919 ElementCount UserVF = Hints.getWidth();
9920 unsigned UserIC = Hints.getInterleave();
9921
9922 // Plan how to best vectorize, return the best VF and its cost.
9923 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9924
 9925   VectorizationFactor VF = VectorizationFactor::Disabled();
 9926   unsigned IC = 1;
9927
9928 bool AddBranchWeights =
9929 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9930 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9931 F->getParent()->getDataLayout(), AddBranchWeights);
9932 if (MaybeVF) {
9933 VF = *MaybeVF;
9934 // Select the interleave count.
9935 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9936
9937 unsigned SelectedIC = std::max(IC, UserIC);
9938 // Optimistically generate runtime checks if they are needed. Drop them if
9939 // they turn out to not be profitable.
9940 if (VF.Width.isVector() || SelectedIC > 1)
9941 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9942
9943 // Check if it is profitable to vectorize with runtime checks.
9944 bool ForceVectorization =
 9945         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
 9946     if (!ForceVectorization &&
 9947         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
 9948                                     *PSE.getSE(), SEL)) {
9949 ORE->emit([&]() {
 9950         return OptimizationRemarkAnalysisAliasing(
 9951             DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9952 L->getHeader())
9953 << "loop not vectorized: cannot prove it is safe to reorder "
9954 "memory operations";
9955 });
9956 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9957 Hints.emitRemarkWithHints();
9958 return false;
9959 }
9960 }
9961
9962 // Identify the diagnostic messages that should be produced.
9963 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9964 bool VectorizeLoop = true, InterleaveLoop = true;
9965 if (VF.Width.isScalar()) {
9966 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9967 VecDiagMsg = std::make_pair(
9968 "VectorizationNotBeneficial",
9969 "the cost-model indicates that vectorization is not beneficial");
9970 VectorizeLoop = false;
9971 }
9972
9973 if (!MaybeVF && UserIC > 1) {
9974 // Tell the user interleaving was avoided up-front, despite being explicitly
9975 // requested.
9976 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9977 "interleaving should be avoided up front\n");
9978 IntDiagMsg = std::make_pair(
9979 "InterleavingAvoided",
9980 "Ignoring UserIC, because interleaving was avoided up front");
9981 InterleaveLoop = false;
9982 } else if (IC == 1 && UserIC <= 1) {
9983 // Tell the user interleaving is not beneficial.
9984 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9985 IntDiagMsg = std::make_pair(
9986 "InterleavingNotBeneficial",
9987 "the cost-model indicates that interleaving is not beneficial");
9988 InterleaveLoop = false;
9989 if (UserIC == 1) {
9990 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9991 IntDiagMsg.second +=
9992 " and is explicitly disabled or interleave count is set to 1";
9993 }
9994 } else if (IC > 1 && UserIC == 1) {
 9995     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9996 LLVM_DEBUG(
9997 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9998 IntDiagMsg = std::make_pair(
9999 "InterleavingBeneficialButDisabled",
10000 "the cost-model indicates that interleaving is beneficial "
10001 "but is explicitly disabled or interleave count is set to 1");
10002 InterleaveLoop = false;
10003 }
10004
10005 // Override IC if user provided an interleave count.
10006 IC = UserIC > 0 ? UserIC : IC;
10007
10008 // Emit diagnostic messages, if any.
10009 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10010 if (!VectorizeLoop && !InterleaveLoop) {
 10011     // Do not vectorize or interleave the loop.
10012 ORE->emit([&]() {
10013 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10014 L->getStartLoc(), L->getHeader())
10015 << VecDiagMsg.second;
10016 });
10017 ORE->emit([&]() {
10018 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10019 L->getStartLoc(), L->getHeader())
10020 << IntDiagMsg.second;
10021 });
10022 return false;
10023 } else if (!VectorizeLoop && InterleaveLoop) {
10024 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10025 ORE->emit([&]() {
10026 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10027 L->getStartLoc(), L->getHeader())
10028 << VecDiagMsg.second;
10029 });
10030 } else if (VectorizeLoop && !InterleaveLoop) {
10031 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10032 << ") in " << DebugLocStr << '\n');
10033 ORE->emit([&]() {
10034 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10035 L->getStartLoc(), L->getHeader())
10036 << IntDiagMsg.second;
10037 });
10038 } else if (VectorizeLoop && InterleaveLoop) {
10039 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10040 << ") in " << DebugLocStr << '\n');
10041 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10042 }
10043
10044 bool DisableRuntimeUnroll = false;
10045 MDNode *OrigLoopID = L->getLoopID();
10046 {
10047 using namespace ore;
10048 if (!VectorizeLoop) {
10049 assert(IC > 1 && "interleave count should not be 1 or 0");
10050 // If we decided that it is not legal to vectorize the loop, then
10051 // interleave it.
10052 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10053 &CM, BFI, PSI, Checks);
10054
10055 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10056 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10057
10058 ORE->emit([&]() {
10059 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10060 L->getHeader())
10061 << "interleaved loop (interleaved count: "
10062 << NV("InterleaveCount", IC) << ")";
10063 });
10064 } else {
10065 // If we decided that it is *legal* to vectorize the loop, then do it.
10066
10067 // Consider vectorizing the epilogue too if it's profitable.
10068 VectorizationFactor EpilogueVF =
 10069           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
 10070       if (EpilogueVF.Width.isVector()) {
10071
10072 // The first pass vectorizes the main loop and creates a scalar epilogue
10073 // to be vectorized by executing the plan (potentially with a different
10074 // factor) again shortly afterwards.
10075 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10076 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10077 EPI, &LVL, &CM, BFI, PSI, Checks);
10078
10079 std::unique_ptr<VPlan> BestMainPlan(
 10080             LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
 10081         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10082 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10083 ++LoopsVectorized;
10084
10085 // Second pass vectorizes the epilogue and adjusts the control flow
10086 // edges from the first pass.
10087 EPI.MainLoopVF = EPI.EpilogueVF;
10088 EPI.MainLoopUF = EPI.EpilogueUF;
10089 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10090 ORE, EPI, &LVL, &CM, BFI, PSI,
10091 Checks);
10092
10093 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10094 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10095 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10096 Header->setName("vec.epilog.vector.body");
10097
10098 // Re-use the trip count and steps expanded for the main loop, as
10099 // skeleton creation needs it as a value that dominates both the scalar
10100 // and vector epilogue loops
10101 // TODO: This is a workaround needed for epilogue vectorization and it
10102 // should be removed once induction resume value creation is done
10103 // directly in VPlan.
10104 EpilogILV.setTripCount(MainILV.getTripCount());
10105 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10106 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10107 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10108 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10109 ExpandR->replaceAllUsesWith(ExpandedVal);
10110 if (BestEpiPlan.getTripCount() == ExpandR)
10111 BestEpiPlan.resetTripCount(ExpandedVal);
10112 ExpandR->eraseFromParent();
10113 }
10114
10115 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10116 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10117 // before vectorizing the epilogue loop.
10118 for (VPRecipeBase &R : Header->phis()) {
10119 if (isa<VPCanonicalIVPHIRecipe>(&R))
10120 continue;
10121
10122 Value *ResumeV = nullptr;
10123 // TODO: Move setting of resume values to prepareToExecute.
10124 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10125 ResumeV = ReductionResumeValues
10126 .find(&ReductionPhi->getRecurrenceDescriptor())
10127 ->second;
10128 } else {
10129 // Create induction resume values for both widened pointer and
10130 // integer/fp inductions and update the start value of the induction
10131 // recipes to use the resume value.
10132 PHINode *IndPhi = nullptr;
10133 const InductionDescriptor *ID;
10134 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10135 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10136 ID = &Ind->getInductionDescriptor();
10137 } else {
10138 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10139 IndPhi = WidenInd->getPHINode();
10140 ID = &WidenInd->getInductionDescriptor();
10141 }
10142
10143 ResumeV = MainILV.createInductionResumeValue(
10144 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
 10145               {EPI.MainLoopIterationCountCheck});
 10146         }
10147 assert(ResumeV && "Must have a resume value");
10148 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10149 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10150 }
10151
10152 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10153 DT, true, &ExpandedSCEVs);
10154 ++LoopsEpilogueVectorized;
10155
10156 if (!MainILV.areSafetyChecksAdded())
10157 DisableRuntimeUnroll = true;
10158 } else {
10159 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10160 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10161 PSI, Checks);
10162
10163 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10164 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10165 ++LoopsVectorized;
10166
10167 // Add metadata to disable runtime unrolling a scalar loop when there
10168 // are no runtime checks about strides and memory. A scalar loop that is
10169 // rarely used is not worth unrolling.
10170 if (!LB.areSafetyChecksAdded())
10171 DisableRuntimeUnroll = true;
10172 }
10173 // Report the vectorization decision.
10174 reportVectorization(ORE, L, VF, IC);
10175 }
10176
 10177     if (ORE->allowExtraAnalysis(LV_NAME))
 10178       checkMixedPrecision(L, ORE);
 10179   }
10180
10181 std::optional<MDNode *> RemainderLoopID =
 10182       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
 10183                                       LLVMLoopVectorizeFollowupEpilogue});
 10184   if (RemainderLoopID) {
10185 L->setLoopID(*RemainderLoopID);
10186 } else {
10187 if (DisableRuntimeUnroll)
 10188       AddRuntimeUnrollDisableMetaData(L);
 10189 
10190 // Mark the loop as already vectorized to avoid vectorizing again.
10191 Hints.setAlreadyVectorized();
10192 }
10193
10194 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10195 return true;
10196}
10197
 10198 LoopVectorizeResult LoopVectorizePass::runImpl(
 10199     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
 10200     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
 10201     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
 10202     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
 10203   SE = &SE_;
10204 LI = &LI_;
10205 TTI = &TTI_;
10206 DT = &DT_;
10207 BFI = BFI_;
10208 TLI = TLI_;
10209 AC = &AC_;
10210 LAIs = &LAIs_;
10211 DB = &DB_;
10212 ORE = &ORE_;
10213 PSI = PSI_;
10214
10215 // Don't attempt if
10216 // 1. the target claims to have no vector registers, and
10217 // 2. interleaving won't help ILP.
10218 //
10219 // The second condition is necessary because, even if the target has no
10220 // vector registers, loop vectorization may still enable scalar
10221 // interleaving.
 10222   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
 10223       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
 10224     return LoopVectorizeResult(false, false);
10225
10226 bool Changed = false, CFGChanged = false;
10227
10228 // The vectorizer requires loops to be in simplified form.
10229 // Since simplification may add new inner loops, it has to run before the
10230 // legality and profitability checks. This means running the loop vectorizer
10231 // will simplify all loops, regardless of whether anything end up being
10232 // vectorized.
10233 for (const auto &L : *LI)
10234 Changed |= CFGChanged |=
10235 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10236
10237 // Build up a worklist of inner-loops to vectorize. This is necessary as
10238 // the act of vectorizing or partially unrolling a loop creates new loops
10239 // and can invalidate iterators across the loops.
10240 SmallVector<Loop *, 8> Worklist;
10241
10242 for (Loop *L : *LI)
10243 collectSupportedLoops(*L, LI, ORE, Worklist);
10244
10245 LoopsAnalyzed += Worklist.size();
10246
10247 // Now walk the identified inner loops.
10248 while (!Worklist.empty()) {
10249 Loop *L = Worklist.pop_back_val();
10250
10251 // For the inner loops we actually process, form LCSSA to simplify the
10252 // transform.
10253 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10254
10255 Changed |= CFGChanged |= processLoop(L);
10256
10257 if (Changed) {
10258 LAIs->clear();
10259
10260#ifndef NDEBUG
10261 if (VerifySCEV)
10262 SE->verify();
10263#endif
10264 }
10265 }
10266
10267 // Process each loop nest in the function.
10268 return LoopVectorizeResult(Changed, CFGChanged);
10269}
10270
 10271 PreservedAnalyses LoopVectorizePass::run(Function &F,
 10272                                          FunctionAnalysisManager &AM) {
 10273     auto &LI = AM.getResult<LoopAnalysis>(F);
10274 // There are no loops in the function. Return before computing other expensive
10275 // analyses.
10276 if (LI.empty())
10277 return PreservedAnalyses::all();
 10278     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
 10279     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10280 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10281 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10282 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10283 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
 10284     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 10285 
 10286     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10287 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
 10288     ProfileSummaryInfo *PSI =
 10289         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10290 BlockFrequencyInfo *BFI = nullptr;
10291 if (PSI && PSI->hasProfileSummary())
 10292       BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
 10293     LoopVectorizeResult Result =
10294 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10295 if (!Result.MadeAnyChange)
10296 return PreservedAnalyses::all();
 10297     PreservedAnalyses PA;
 10298 
10299 if (isAssignmentTrackingEnabled(*F.getParent())) {
10300 for (auto &BB : F)
 10301         RemoveRedundantDbgInstrs(&BB);
 10302     }
10303
10304 // We currently do not preserve loopinfo/dominator analyses with outer loop
10305 // vectorization. Until this is addressed, mark these analyses as preserved
10306 // only for non-VPlan-native path.
10307 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10308 if (!EnableVPlanNativePath) {
10309 PA.preserve<LoopAnalysis>();
 10310     PA.preserve<DominatorTreeAnalysis>();
 10311     PA.preserve<ScalarEvolutionAnalysis>();
 10312   }
10313
10314 if (Result.MadeCFGChange) {
10315 // Making CFG changes likely means a loop got vectorized. Indicate that
10316 // extra simplification passes should be run.
 10317     // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
 10318     // be run if runtime checks have been added.
 10319     AM.getResult<ShouldRunExtraVectorPasses>(F);
 10320     PA.preserve<ShouldRunExtraVectorPasses>();
 10321   } else {
 10322     PA.preserveSet<CFGAnalyses>();
 10323   }
10324 return PA;
10325}
10326
 10327 void LoopVectorizePass::printPipeline(
 10328     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10329 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10330 OS, MapClassName2PassName);
10331
10332 OS << '<';
10333 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10334 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10335 OS << '>';
10336}
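// Example (added; illustrative): with the default pass options this prints a
// pipeline fragment roughly of the form
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// where the pass-name prefix comes from the PassInfoMixin::printPipeline call
// above.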
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock)
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::string getDebugLocString(const Loop *L)
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform common subexpression elimination (CSE) of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
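The reciprocal is used to discount the cost of blocks that only execute under a predicate. A rough sketch of that scaling, assuming a hypothetical helper discountPredicatedBlock and an already computed InstructionCost BlockCost:

    // If a predicated block executes with probability 1/getReciprocalPredBlockProb(),
    // its contribution to the loop cost is divided by that reciprocal.
    static InstructionCost discountPredicatedBlock(InstructionCost BlockCost) {
      return BlockCost / getReciprocalPredBlockProb();
    }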
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
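A typical use of getResult from a new-pass-manager function pass, assuming a FunctionAnalysisManager AM and a Function F are in scope:

    LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
    ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
    DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);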
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:411
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:499
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
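These BasicBlock accessors compose into the usual block-traversal pattern. A small sketch, assuming BB is a BasicBlock* (LLVM headers omitted):

    for (PHINode &Phi : BB->phis())
      (void)Phi;                                    // header/merge phis come first
    for (Instruction &I : make_range(BB->getFirstNonPHIIt(), BB->end()))
      (void)I;                                      // then the non-phi instructions
    if (BasicBlock *Pred = BB->getSinglePredecessor())
      (void)Pred;                                   // unique predecessor, if any
    Instruction *Term = BB->getTerminator();        // null only if the block is malformed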
BinaryOps getOpcode() const
Definition: InstrTypes.h:513
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
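The DenseMap interface above is used throughout the vectorizer for caching per-instruction decisions. A generic sketch, assuming I is an Instruction* already in scope:

    DenseMap<Instruction *, unsigned> Cache;
    Cache.insert({I, 42});                   // no-op if I is already a key
    if (Cache.contains(I)) {
      unsigned Cached = Cache.lookup(I);     // value, or a default-constructed 0 if absent
      (void)Cached;
    }
    auto It = Cache.find(I);
    if (It != Cache.end())
      It->second += 1;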
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
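The vectorizer keeps the dominator tree up to date while it stitches new blocks into the CFG. A condensed sketch, assuming DT (DominatorTree*), NewBB, IDomBB, and OldBB are in scope:

    DT->addNewBlock(NewBB, IDomBB);               // NewBB is immediately dominated by IDomBB
    DT->changeImmediateDominator(OldBB, NewBB);   // re-parent OldBB under NewBB
    assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
           "dominator tree left in an inconsistent state");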
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:311
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:302
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
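ElementCount is how the vectorizer describes fixed versus scalable vectorization factors. A small sketch:

    ElementCount FixedVF = ElementCount::getFixed(4);      // e.g. <4 x i32>
    ElementCount ScalVF  = ElementCount::getScalable(2);   // e.g. <vscale x 2 x i32>
    assert(FixedVF.isVector() && !FixedVF.isScalable());
    ElementCount One = ElementCount::getFixed(1);
    assert(One.isScalar());                                 // a VF of one means "don't widen"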
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:319
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1170
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2205
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2241
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
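These IRBuilder hooks are the ones the vectorizer leans on when emitting masked memory operations. An illustrative fragment, assuming Builder (an IRBuilder<>&), VecTy, Ptr, Mask, and Other have been set up by the caller and the alignment is known to be 16:

    // Load a vector under a mask (poison in the inactive lanes),
    // combine it, and store the result back under the same mask.
    Value *Loaded = Builder.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask,
                                             PoisonValue::get(VecTy), "wide.load");
    Value *Sum = Builder.CreateAdd(Loaded, Other, "wide.add");
    Builder.CreateMaskedStore(Sum, Ptr, Align(16), Mask);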
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
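Consumers of an InductionDescriptor usually dispatch on its kind. A sketch, assuming ID is an InductionDescriptor in scope:

    switch (ID.getKind()) {
    case InductionDescriptor::IK_IntInduction:
    case InductionDescriptor::IK_PtrInduction: {
      Value *Start = ID.getStartValue();
      const SCEV *Step = ID.getStep();
      (void)Start; (void)Step;        // widen as start + lane * step
      break;
    }
    case InductionDescriptor::IK_FpInduction:
      break;                          // FP inductions also need the recorded binop and FMF
    case InductionDescriptor::IK_NoInduction:
      llvm_unreachable("not an induction");
    }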
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created for it.
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:444
uint32_t getFactor() const
Definition: VectorUtils.h:460
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:514
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:521
bool isReverse() const
Definition: VectorUtils.h:459
InstTy * getInsertPos() const
Definition: VectorUtils.h:530
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the group's instructions to NewInst.
Align getAlign() const
Definition: VectorUtils.h:461
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:586
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:631
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:642
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:623
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:606
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:636
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
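Putting the two interleaving classes together, the usual query shape is roughly the following, assuming IAI is an InterleavedAccessInfo and I is a memory-access Instruction*:

    IAI.analyzeInterleaving(/*EnableMaskedInterleavedGroup=*/false);
    if (const InterleaveGroup<Instruction> *Group = IAI.getInterleaveGroup(I)) {
      for (unsigned Idx = 0, E = Group->getFactor(); Idx != E; ++Idx)
        if (Instruction *Member = Group->getMember(Idx))
          (void)Member;                       // a gap in the group leaves the slot null
      Instruction *InsertAt = Group->getInsertPos();
      (void)InsertAt;                         // wide load/store is emitted here
    }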
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the exit (successor) blocks of this loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1222
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
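The vectorizer walks loop bodies in reverse post-order via these helpers. A sketch, assuming L is a Loop* and LI a LoopInfo*:

    LoopBlocksDFS DFS(L);
    DFS.perform(LI);
    for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
      (void)BB;            // each block is visited before its in-loop successors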
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for two cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
A memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
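The cost model is consulted per (instruction, VF) pair. A hedged sketch of the query pattern, assuming CM is a LoopVectorizationCostModel, I an Instruction*, BB its parent block, and VF an ElementCount:

    CM.collectUniformsAndScalars(VF);
    bool StaysScalar = CM.isUniformAfterVectorization(I, VF) ||
                       CM.isScalarAfterVectorization(I, VF);
    bool NeedsMask = CM.foldTailByMasking() &&
                     CM.blockNeedsPredicationForAnyReason(BB);
    InstructionCost WideCost = CM.getWideningCost(I, VF);
    (void)StaysScalar; (void)NeedsMask; (void)WideCost;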
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
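Legality is queried before any planning or costing happens. A condensed, illustrative sketch (classifyPhi is a hypothetical helper; LVL is a LoopVectorizationLegality&, Phi a header PHINode*):

    static bool classifyPhi(LoopVectorizationLegality &LVL, PHINode *Phi) {
      if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
        return false;                                    // structural/legal bailout
      if (LVL.isInductionPhi(Phi)) {
        if (const InductionDescriptor *ID = LVL.getIntOrFpInductionDescriptor(Phi))
          (void)ID;                                      // widen as an induction
      } else if (LVL.isReductionVariable(Phi)) {
        // handled via the matching RecurrenceDescriptor in getReductionVars()
      } else if (LVL.isFixedOrderRecurrence(Phi)) {
        // needs the first-order-recurrence splice in the middle block
      }
      return true;
    }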
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
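The planner entry points above chain together roughly as follows. This is a heavily abbreviated sketch: LVP, LB, DT, UserVF, and UserIC are assumed to be in scope, the interleave count is fixed at 1 for brevity, and all error handling is omitted:

    if (std::optional<VectorizationFactor> VF = LVP.plan(UserVF, UserIC)) {
      unsigned IC = 1;                                  // interleave count chosen elsewhere
      VPlan &BestPlan = LVP.getBestPlanFor(VF->Width);
      LVP.executePlan(VF->Width, IC, BestPlan, LB, DT,
                      /*IsEpilogueVectorization=*/false);
    }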
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When vectorization-enabling loop hints are provided, we allow the vectorizer to change the order of operations that
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:66
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:501
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
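These metadata helpers are what routines such as AddRuntimeUnrollDisableMetaData build on: the loop ID is a distinct, self-referential MDNode whose remaining operands are property nodes. A simplified sketch (disableRuntimeUnroll is hypothetical and, unlike the real helper, does not preserve any existing loop-ID operands):

    static void disableRuntimeUnroll(Loop *L) {
      LLVMContext &Ctx = L->getHeader()->getContext();
      SmallVector<Metadata *, 2> Ops;
      Ops.push_back(nullptr);                           // slot 0: reserved for the self-reference
      Metadata *DisableOps[] = {
          MDString::get(Ctx, "llvm.loop.unroll.runtime.disable")};
      Ops.push_back(MDNode::get(Ctx, DisableOps));
      MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
      LoopID->replaceOperandWith(0, LoopID);            // make the node self-referential
      L->setLoopID(LoopID);
    }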
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:191
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
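createLVAnalysis feeds this remark interface; the emission side looks roughly like the fragment below (the remark name and message text are illustrative, and ORE/TheLoop are assumed in scope):

    ORE->emit([&]() {
      return OptimizationRemarkAnalysis("loop-vectorize", "IllustrativeRemark",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "loop not vectorized: illustrative explanation text";
    });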
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:756
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
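createInductionResumeValue relies on exactly this PHINode interface. A stripped-down sketch (makeResumePhi and all parameter names are hypothetical; ScalarPH is the scalar preheader):

    static PHINode *makeResumePhi(Type *Ty, Value *StartVal, Value *EndVal,
                                  BasicBlock *ScalarPH, BasicBlock *BypassBlock,
                                  BasicBlock *MiddleBlock) {
      PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2,
                                        "bc.resume.val", ScalarPH->begin());
      Resume->addIncoming(StartVal, BypassBlock);   // bypass path: original start value
      Resume->addIncoming(EndVal, MiddleBlock);     // vector path: value after the vector loop
      return Resume;
    }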
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
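PredicatedScalarEvolution wraps SCEV queries so that extra runtime predicates can be accumulated as needed. Typical use, assuming PSE and a pointer value Ptr are in scope:

    const SCEV *BTC = PSE.getBackedgeTakenCount();   // may only hold under the predicate
    const SCEV *PtrExpr = PSE.getSCEV(Ptr);
    ScalarEvolution *SE = PSE.getSE();
    bool NeedsRTChecks = !PSE.getPredicate().isAlwaysTrue();
    (void)BTC; (void)PtrExpr; (void)SE; (void)NeedsRTChecks;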
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129
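A typical return from a function pass that changed the IR but kept LoopInfo, the dominator tree, and SCEV up to date looks like this sketch (Changed is assumed to be the pass's local flag):

    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();
    return PA;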
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
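Trip-count reasoning such as isIndvarOverflowCheckKnownFalse is built from these ScalarEvolution queries. A sketch under stated assumptions (hasKnownSmallTripCount is a hypothetical helper; WidenedStep stands for VF * UF):

    static bool hasKnownSmallTripCount(ScalarEvolution &SE, const Loop *L,
                                       unsigned WidenedStep) {
      // Exact constant trip count, or 0 if it is unknown / not constant.
      if (unsigned TC = SE.getSmallConstantTripCount(L))
        return TC < WidenedStep;          // not even one full widened iteration
      // Fall back to the constant upper bound, which may also be 0 (unknown).
      unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
      return MaxTC != 0 && MaxTC < WidenedStep;
    }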
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
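As a hedged illustration (Root is a placeholder, not a name from this pass), SetVector is commonly used as a deduplicating worklist:
  #include "llvm/ADT/SetVector.h"
  #include "llvm/IR/Instruction.h"
  // Sketch: insertion order is preserved and duplicate insertions are ignored.
  llvm::SetVector<llvm::Instruction *> Worklist;
  Worklist.insert(Root);                       // returns false if Root was already present
  while (!Worklist.empty()) {
    llvm::Instruction *I = Worklist.pop_back_val();
    for (llvm::Value *Op : I->operands())
      if (auto *OpI = llvm::dyn_cast<llvm::Instruction>(Op))
        Worklist.insert(OpI);                  // re-inserting an element is a no-op
  }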
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:693
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
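A minimal usage sketch (V is a placeholder pointer) of the insert/count/erase interface listed above:
  #include "llvm/ADT/SmallPtrSet.h"
  // Sketch: a visited set with inline storage for up to 8 pointers before heap allocation.
  llvm::SmallPtrSet<const llvm::Value *, 8> Visited;
  if (Visited.insert(V).second) {
    // V was not seen before; the bool half of the returned pair signals a fresh insertion.
  }
  bool Seen = Visited.count(V) == 1;           // count() is 0 or 1 for a set
  Visited.erase(V);                            // true if V was present and removed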
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
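For completeness, a tiny sketch of the SmallVector interface referenced above (the element type is arbitrary):
  #include "llvm/ADT/SmallVector.h"
  // Sketch: up to 4 elements live in inline storage; growth falls back to the heap.
  llvm::SmallVector<int, 4> Vals;
  Vals.push_back(1);
  Vals.emplace_back(2);                        // constructs the element in place
  bool HasTwo = Vals.size() == 2 && !Vals.empty();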
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
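A hedged sketch of how a cost model typically consults these hooks; TTI and VecTy are assumed to be a valid TargetTransformInfo and vector Type, and the alignment and address space below are illustrative:
  #include "llvm/Analysis/TargetTransformInfo.h"
  // Sketch only: TTI and VecTy are placeholders; constants are illustrative.
  llvm::Align Alignment(16);
  if (TTI.isLegalMaskedLoad(VecTy, Alignment)) {
    llvm::InstructionCost C = TTI.getMaskedMemoryOpCost(
        llvm::Instruction::Load, VecTy, Alignment, /*AddressSpace=*/0,
        llvm::TargetTransformInfo::TCK_RecipThroughput);
    // Compare C against the cost of an equivalent scalarized sequence before widening.
  }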
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:331
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2815
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2883
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:443
iterator end()
Definition: VPlan.h:2846
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2844
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:2893
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2874
bool empty() const
Definition: VPlan.h:2855
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1933
VPRegionBlock * getParent()
Definition: VPlan.h:489
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:175
void setName(const Twine &newName)
Definition: VPlan.h:482
VPlan * getPlan()
Definition: VPlan.cpp:148
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:153
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:524
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3369
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
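A hedged sketch of the VPBuilder interface listed above (VPB, A, B, X and Y are placeholder VPBasicBlock*/VPValue* names, not taken from this file):
  // Sketch only: VPB is a VPBasicBlock*; A, B, X and Y are existing VPValue*s.
  VPBuilder Builder;
  Builder.setInsertPoint(VPB);                 // new VPInstructions are appended to VPB
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_ULE, A, B);
  VPValue *NotCmp = Builder.createNot(Cmp);
  VPValue *Sel = Builder.createSelect(NotCmp, X, Y);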
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2548
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:423
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:401
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:413
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2753
VPValue * getStartValue() const
Definition: VPlan.h:2752
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1620
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1664
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1653
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1159
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:1165
unsigned getOpcode() const
Definition: VPlan.h:1259
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:1990
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2031
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2037
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2044
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2064
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:169
static VPLane getFirstLane()
Definition: VPlan.h:167
A value that is used outside the VPlan.
Definition: VPlan.h:669
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:709
VPBasicBlock * getParent()
Definition: VPlan.h:734
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:800
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1082
A recipe for handling reduction phis.
Definition: VPlan.h:1874
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1928
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1920
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2079
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:2948
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3019
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2127
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:826
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:888
This class can be used to assign names to VPValues.
Definition: VPlanValue.h:454
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:203
operand_range operands()
Definition: VPlanValue.h:278
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:258
unsigned getNumOperands() const
Definition: VPlanValue.h:252
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:253
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:247
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:77
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1302
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1270
user_iterator user_begin()
Definition: VPlanValue.h:129
unsigned getNumUsers() const
Definition: VPlanValue.h:112
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:173
user_iterator user_end()
Definition: VPlanValue.h:131
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:168
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1274
user_range users()
Definition: VPlanValue.h:133
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1564
A recipe for widening Call instructions.
Definition: VPlan.h:1449
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2673
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1360
A recipe for handling GEP instructions.
Definition: VPlan.h:1522
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1677
A common base class for widening memory operations.
Definition: VPlan.h:2284
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2292
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2331
Instruction & Ingredient
Definition: VPlan.h:2286
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2345
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2338
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2335
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1802
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1841
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1838
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a vector-typed copy of its ingredient.
Definition: VPlan.h:1328
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3049
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:783
VPBasicBlock * getEntry()
Definition: VPlan.h:3142
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3167
void setName(const Twine &newName)
Definition: VPlan.h:3198
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3170
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3146
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3160
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3252
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:993
VPBasicBlock * getPreheader()
Definition: VPlan.h:3271
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3233
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
Definition: VPlan.cpp:769
bool hasVF(ElementCount VF)
Definition: VPlan.h:3180
bool hasUF(unsigned UF) const
Definition: VPlan.h:3187
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3153
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3202
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:990
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:825
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3241
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3257
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3261
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1074
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
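A brief sketch of the Value use-list API above (Old and New are placeholder Value*s of matching type):
  #include "llvm/IR/Value.h"
  // Sketch only: Old and New are placeholders with the same llvm::Type.
  for (llvm::User *U : Old->users()) {
    // Inspect each user of Old before rewriting, if needed.
  }
  New->setName(Old->getName());
  Old->replaceAllUsesWith(New);                // every use of Old now points at New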
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:77
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:73
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
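A minimal sketch combining the Type and VectorType queries above (Ctx is a placeholder LLVMContext):
  #include "llvm/IR/DerivedTypes.h"
  // Sketch only: Ctx is an existing LLVMContext.
  llvm::Type *I32 = llvm::Type::getIntNTy(Ctx, 32);
  if (llvm::VectorType::isValidElementType(I32)) {
    auto *VecTy = llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4)); // <4 x i32>
    bool OK = VecTy->isVectorTy() && VecTy->getScalarType()->isIntegerTy();
  }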
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:217
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:243
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:210
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:224
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
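A hedged example of the PatternMatch combinators above, in the spirit of the extended-multiply patterns the cost model recognizes (V is a placeholder Value*):
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm::PatternMatch;
  // Sketch only: V is a placeholder llvm::Value*.
  llvm::Value *X = nullptr, *Y = nullptr;
  if (match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(X)), m_ZExtOrSExt(m_Value(Y)))))) {
    // V is a single-use multiply of two zero- or sign-extended values (X and Y).
  }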
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1459
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3593
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:456
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1820
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7062
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
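A hedged sketch of the range helpers above as they are commonly used over a basic block (BB is a placeholder BasicBlock*):
  #include "llvm/ADT/STLExtras.h"
  // Sketch only: BB is a placeholder llvm::BasicBlock*.
  bool AllSpeculatable = llvm::all_of(*BB, [](const llvm::Instruction &I) {
    return llvm::isSafeToSpeculativelyExecute(&I);
  });
  for (const auto &En : llvm::enumerate(*BB)) {
    // En.index() is the position, En.value() the llvm::Instruction&.
  }
  for (llvm::Instruction &I : llvm::make_early_inc_range(*BB)) {
    // I may be erased here without invalidating the traversal.
  }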
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:134
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2433
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
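A hedged illustration of the index vectors these mask helpers build (parameters are illustrative; the expected contents are noted in the comments):
  #include "llvm/Analysis/VectorUtils.h"
  llvm::SmallVector<int, 16> Strided = llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  //   <0, 2, 4, 6>            - every second element, used when de-interleaving
  llvm::SmallVector<int, 16> Replicated = llvm::createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
  //   <0, 0, 1, 1, 2, 2>      - each of VF elements repeated ReplicationFactor times
  llvm::SmallVector<int, 16> Interleaved = llvm::createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  //   <0, 4, 1, 5, 2, 6, 3, 7> - interleaves the lanes of NumVecs concatenated vectors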
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1628
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
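A hedged sketch of how these VF helpers are typically used (B is an IRBuilder positioned where the values are needed, IdxTy an integer type, VF/UF the chosen factors; all names are placeholders):
  // Sketch only: B, IdxTy, VF and UF are placeholders.
  llvm::Value *RuntimeVF = llvm::getRuntimeVF(B, IdxTy, VF);              // VF, or vscale * VF.getKnownMinValue()
  llvm::Value *IVStep = llvm::createStepForVF(B, IdxTy, VF, /*Step=*/UF); // VF * UF, the per-iteration IV bump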
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1880
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has branch weight metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
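A brief sketch of these accessors (I is a placeholder Instruction* known to be a load or a store):
  #include "llvm/IR/Instructions.h"
  // Sketch only: I must be a LoadInst or StoreInst; the helpers abstract over both.
  llvm::Type *AccessTy = llvm::getLoadStoreType(I);
  const llvm::Value *Ptr = llvm::getLoadStorePointerOperand(I);
  llvm::Align Alignment = llvm::getLoadStoreAlignment(I);
  unsigned AddrSpace = llvm::getLoadStoreAddressSpace(I);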
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:74
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:85
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unrolling for loops that were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:87
ElementCount End
Definition: VPlan.h:92
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1847
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:219
bool isFirstIteration() const
Definition: VPlan.h:231
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:365
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:373
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:369
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:348
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:247
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:409
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:412
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:361
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:405
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:393
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:288
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:248
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:389
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:395
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:392
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2411
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2360
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1488
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2486
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2489
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2434
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2451
static void addExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.