1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
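//
// Conceptually (a sketch, not the exact IR produced), with VF = 4 a loop like
//
//   for (i = 0; i < n; i += 1)
//     A[i] = B[i] + C[i];
//
// becomes a loop whose body operates on 4-element vectors and whose induction
// variable advances by 4, with the remaining n % 4 iterations handled by a
// scalar epilogue (or by a predicated, tail-folded vector body).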
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
82#include "llvm/Analysis/CFG.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/ValueHandle.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
133#include "llvm/Support/Debug.h"
146#include <algorithm>
147#include <cassert>
148#include <cmath>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <map>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168/// @{
169/// Metadata attribute names
170const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171const char LLVMLoopVectorizeFollowupVectorized[] =
172 "llvm.loop.vectorize.followup_vectorized";
173const char LLVMLoopVectorizeFollowupEpilogue[] =
174 "llvm.loop.vectorize.followup_epilogue";
175/// @}
176
177STATISTIC(LoopsVectorized, "Number of loops vectorized");
178STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180
182 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
183 cl::desc("Enable vectorization of epilogue loops."));
184
186 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
187 cl::desc("When epilogue vectorization is enabled, and a value greater than "
188 "1 is specified, forces the given VF for all applicable epilogue "
189 "loops."));
190
192 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
193 cl::desc("Only loops with vectorization factor equal to or larger than "
194 "the specified value are considered for epilogue vectorization."));
195
196/// Loops with a known constant trip count below this number are vectorized only
197/// if no scalar iteration overheads are incurred.
199 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
200 cl::desc("Loops with a constant trip count that is smaller than this "
201 "value are vectorized only if no scalar iteration overheads "
202 "are incurred."));
203
205 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
206 cl::desc("The maximum allowed number of runtime memory checks"));
207
208// Option prefer-predicate-over-epilogue indicates that an epilogue is
209// undesired, that predication is preferred, and lists the available options.
210// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
211// vector body and predicate the instructions accordingly. If tail-folding
212// fails, there are different fallback strategies depending on these values:
214 enum Option {
218 };
219} // namespace PreferPredicateTy
220
222 "prefer-predicate-over-epilogue",
225 cl::desc("Tail-folding and predication preferences over creating a scalar "
226 "epilogue loop."),
228 "scalar-epilogue",
229 "Don't tail-predicate loops, create scalar epilogue"),
231 "predicate-else-scalar-epilogue",
232 "prefer tail-folding, create scalar epilogue if tail "
233 "folding fails."),
235 "predicate-dont-vectorize",
236 "prefers tail-folding, don't attempt vectorization if "
237 "tail-folding fails.")));
238
240 "force-tail-folding-style", cl::desc("Force the tail folding style"),
241 cl::init(TailFoldingStyle::None),
243 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
245 TailFoldingStyle::Data, "data",
246 "Create lane mask for data only, using active.lane.mask intrinsic"),
247 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
248 "data-without-lane-mask",
249 "Create lane mask with compare/stepvector"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
251 "Create lane mask using active.lane.mask intrinsic, and use "
252 "it for both data and control flow"),
253 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
254 "data-and-control-without-rt-check",
255 "Similar to data-and-control, but remove the runtime check"),
256 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
257 "Use predicated EVL instructions for tail folding. If EVL "
258 "is unsupported, fallback to data-without-lane-mask.")));
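// As a rough sketch of what tail folding means here: instead of emitting a
// scalar remainder loop, the vector body also executes the final partial
// iteration, with inactive lanes disabled by a lane mask (e.g. one produced by
// the llvm.get.active.lane.mask intrinsic, where lane j is active iff IV + j
// is below the trip count) or, for the EVL style, by an explicit vector length.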
259
261 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
262 cl::desc("Maximize bandwidth when selecting vectorization factor which "
263 "will be determined by the smallest type in loop."));
264
266 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
267 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
268
269/// An interleave-group may need masking if it resides in a block that needs
270/// predication, or in order to mask away gaps.
272 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
273 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
274
276 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's number of scalar registers."));
278
280 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281 cl::desc("A flag that overrides the target's number of vector registers."));
282
284 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285 cl::desc("A flag that overrides the target's max interleave factor for "
286 "scalar loops."));
287
289 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290 cl::desc("A flag that overrides the target's max interleave factor for "
291 "vectorized loops."));
292
294 "force-target-instruction-cost", cl::init(0), cl::Hidden,
295 cl::desc("A flag that overrides the target's expected cost for "
296 "an instruction to a single constant value. Mostly "
297 "useful for getting consistent testing."));
298
300 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301 cl::desc(
302 "Pretend that scalable vectors are supported, even if the target does "
303 "not support them. This flag should only be used for testing."));
304
306 "small-loop-cost", cl::init(20), cl::Hidden,
307 cl::desc(
308 "The cost of a loop that is considered 'small' by the interleaver."));
309
311 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312 cl::desc("Enable the use of the block frequency analysis to access PGO "
313 "heuristics minimizing code growth in cold regions and being more "
314 "aggressive in hot regions."));
315
316// Runtime interleave loops for load/store throughput.
318 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319 cl::desc(
320 "Enable runtime interleaving until load/store ports are saturated"));
321
322/// The number of stores in a loop that are allowed to need predication.
324 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
325 cl::desc("Max number of stores to be predicated behind an if."));
326
328 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
329 cl::desc("Count the induction variable only once when interleaving"));
330
332 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
333 cl::desc("Enable if predication of stores during vectorization."));
334
336 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
337 cl::desc("The maximum interleave count to use when interleaving a scalar "
338 "reduction in a nested loop."));
339
340static cl::opt<bool>
341 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
343 cl::desc("Prefer in-loop vector reductions, "
344 "overriding the targets preference."));
345
347 "force-ordered-reductions", cl::init(false), cl::Hidden,
348 cl::desc("Enable the vectorisation of loops with in-order (strict) "
349 "FP reductions"));
350
352 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
353 cl::desc(
354 "Prefer predicating a reduction operation over an after loop select."));
355
356namespace llvm {
358 "enable-vplan-native-path", cl::Hidden,
359 cl::desc("Enable VPlan-native vectorization path with "
360 "support for outer loop vectorization."));
361}
362
363// This flag enables the stress testing of the VPlan H-CFG construction in the
364// VPlan-native vectorization path. It must be used in conjunction with
365// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
366// verification of the H-CFGs built.
368 "vplan-build-stress-test", cl::init(false), cl::Hidden,
369 cl::desc(
370 "Build VPlan for every supported loop nest in the function and bail "
371 "out right after the build (stress test the VPlan H-CFG construction "
372 "in the VPlan-native vectorization path)."));
373
375 "interleave-loops", cl::init(true), cl::Hidden,
376 cl::desc("Enable loop interleaving in Loop vectorization passes"));
378 "vectorize-loops", cl::init(true), cl::Hidden,
379 cl::desc("Run the Loop vectorization passes"));
380
382 "vplan-print-in-dot-format", cl::Hidden,
383 cl::desc("Use dot format instead of plain text when dumping VPlans"));
384
386 "force-widen-divrem-via-safe-divisor", cl::Hidden,
387 cl::desc(
388 "Override cost based safe divisor widening for div/rem instructions"));
389
391 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
393 cl::desc("Try wider VFs if they enable the use of vector variants"));
394
395// Likelihood of bypassing the vectorized loop because assumptions about SCEV
396// variables not overflowing do not hold. See `emitSCEVChecks`.
397static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
398// Likelihood of bypassing the vectorized loop because pointers overlap. See
399// `emitMemRuntimeChecks`.
400static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
401// Likelihood of bypassing the vectorized loop because there are zero trips left
402// after prolog. See `emitIterationCountCheck`.
403static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
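// When attached to the corresponding bypass branches, each {1, 127} pair
// encodes an assumed ~1/128 probability of taking the bypass edge, i.e. the
// checks are expected to pass and the vector loop to be reached.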
404
405/// A helper function that returns true if the given type is irregular. The
406/// type is irregular if its allocated size doesn't equal the store size of an
407/// element of the corresponding vector type.
408static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
409 // Determine if an array of N elements of type Ty is "bitcast compatible"
410 // with a <N x Ty> vector.
411 // This is only true if there is no padding between the array elements.
412 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
413}
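// For example, under a typical data layout i32 is regular (32-bit type size,
// 32-bit alloc size), whereas x86_fp80 is irregular (80 bits of data padded to
// a 96- or 128-bit alloc size), so a vector of N x86_fp80 values is not
// bitcast-compatible with an array of N such values.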
414
415/// A helper function that returns the reciprocal of the block probability of
416/// predicated blocks. If we return X, we are assuming the predicated block
417/// will execute once for every X iterations of the loop header.
418///
419/// TODO: We should use actual block probability here, if available. Currently,
420/// we always assume predicated blocks have a 50% chance of executing.
421static unsigned getReciprocalPredBlockProb() { return 2; }
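// Example: under this assumption, a cost C computed for instructions in a
// predicated block is discounted to C / getReciprocalPredBlockProb() == C / 2
// when it is accumulated into the per-iteration loop cost.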
422
423/// Returns "best known" trip count for the specified loop \p L as defined by
424/// the following procedure:
425/// 1) Returns exact trip count if it is known.
426/// 2) Returns expected trip count according to profile data if any.
427/// 3) Returns upper bound estimate if it is known.
428/// 4) Returns std::nullopt if all of the above failed.
429static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
430 Loop *L) {
431 // Check if exact trip count is known.
432 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
433 return ExpectedTC;
434
435 // Check if there is an expected trip count available from profile data.
437 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
438 return *EstimatedTC;
439
440 // Check if upper bound estimate is known.
441 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
442 return ExpectedTC;
443
444 return std::nullopt;
445}
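// A typical use of the result (a sketch of the policy, see the
// vectorizer-min-trip-count option above): loops whose best-known trip count
// is below that threshold are only vectorized when no scalar iteration
// overheads (runtime checks, scalar remainder) would be incurred.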
446
447/// Return a vector containing interleaved elements from multiple
448/// smaller input vectors.
450 const Twine &Name) {
451 unsigned Factor = Vals.size();
452 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
453
454 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
455#ifndef NDEBUG
456 for (Value *Val : Vals)
457 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
458#endif
459
460 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
461 // must use intrinsics to interleave.
462 if (VecTy->isScalableTy()) {
463 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
464 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
465 Vals,
466 /*FMFSource=*/nullptr, Name);
467 }
468
469 // Fixed length. Start by concatenating all vectors into a wide vector.
470 Value *WideVec = concatenateVectors(Builder, Vals);
471
472 // Interleave the elements into the wide vector.
473 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
474 return Builder.CreateShuffleVector(
475 WideVec, createInterleaveMask(NumElts, Factor), Name);
476}
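// For example, interleaving two fixed vectors A = <a0, a1, a2, a3> and
// B = <b0, b1, b2, b3> (Factor = 2) yields
// <a0, b0, a1, b1, a2, b2, a3, b3>.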
477
478namespace {
479// Forward declare GeneratedRTChecks.
480class GeneratedRTChecks;
481
482using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
483} // namespace
484
485namespace llvm {
486
488
489/// InnerLoopVectorizer vectorizes loops which contain only one basic
490/// block to a specified vectorization factor (VF).
491/// This class performs the widening of scalars into vectors, or multiple
492/// scalars. This class also implements the following features:
493/// * It inserts an epilogue loop for handling loops that don't have iteration
494/// counts that are known to be a multiple of the vectorization factor.
495/// * It handles the code generation for reduction variables.
496/// * Scalarization (implementation using scalars) of un-vectorizable
497/// instructions.
498/// InnerLoopVectorizer does not perform any vectorization-legality
499/// checks, and relies on the caller to check for the different legality
500/// aspects. The InnerLoopVectorizer relies on the
501/// LoopVectorizationLegality class to provide information about the induction
502/// and reduction variables that were found, for a given vectorization factor.
504public:
507 const TargetLibraryInfo *TLI,
511 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
513 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
514 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
515 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
516 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
518 // Query this against the original loop and save it here because the profile
519 // of the original loop header may change as the transformation happens.
522
524 this->MinProfitableTripCount = VecWidth;
525 else
526 this->MinProfitableTripCount = MinProfitableTripCount;
527 }
528
529 virtual ~InnerLoopVectorizer() = default;
530
531 /// Create a new empty loop that will contain vectorized instructions later
532 /// on, while the old loop will be used as the scalar remainder. Control flow
533 /// is generated around the vectorized (and scalar epilogue) loops consisting
534 /// of various checks and bypasses. Return the pre-header block of the new
535 /// loop and the start value for the canonical induction, if it is != 0. The
536 /// latter is the case when vectorizing the epilogue loop. In the case of
537 /// epilogue vectorization, this function is overridden to handle the more
538 /// complex control flow around the loops. \p ExpandedSCEVs is used to
539 /// look up SCEV expansions for expressions needed during skeleton creation.
540 virtual std::pair<BasicBlock *, Value *>
541 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
542
543 /// Fix the vectorized code, taking care of header phis, live-outs, and more.
544 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
545
546 // Return true if any runtime check is added.
548
549 /// A helper function to scalarize a single Instruction in the innermost loop.
550 /// Generates a sequence of scalar instances for each lane between \p MinLane
551 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
552 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
553 /// Instr's operands.
554 void scalarizeInstruction(const Instruction *Instr,
555 VPReplicateRecipe *RepRecipe,
556 const VPIteration &Instance,
557 VPTransformState &State);
558
559 /// Try to vectorize interleaved access group \p Group with the base address
560 /// given in \p Addr, optionally masking the vector operations if \p
561 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
562 /// values in the vectorized loop.
564 ArrayRef<VPValue *> VPDefs,
566 ArrayRef<VPValue *> StoredValues,
567 VPValue *BlockInMask, bool NeedsMaskForGaps);
568
569 /// Fix the non-induction PHIs in \p Plan.
570 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
571
572 /// Create a new phi node for the induction variable \p OrigPhi to resume
573 /// iteration count in the scalar epilogue, from where the vectorized loop
574 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
575 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
576 /// and the resume values can come from an additional bypass block, the \p
577 /// AdditionalBypass pair provides information about the bypass block and the
578 /// end value on the edge from bypass to this loop.
580 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
581 ArrayRef<BasicBlock *> BypassBlocks,
582 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
583
584 /// Returns the original loop trip count.
585 Value *getTripCount() const { return TripCount; }
586
587 /// Used to set the trip count after ILV's construction and after the
588 /// preheader block has been executed. Note that this always holds the trip
589 /// count of the original loop for both main loop and epilogue vectorization.
590 void setTripCount(Value *TC) { TripCount = TC; }
591
592protected:
594
595 /// A small list of PHINodes.
597
598 /// A type for scalarized values in the new loop. Each value from the
599 /// original loop, when scalarized, is represented by UF x VF scalar values
600 /// in the new unrolled loop, where UF is the unroll factor and VF is the
601 /// vectorization factor.
603
604 /// Set up the values of the IVs correctly when exiting the vector loop.
605 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
606 Value *VectorTripCount, Value *EndValue,
607 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
608 VPlan &Plan, VPTransformState &State);
609
610 /// Create the phi node for the resume value of first order recurrences in the
611 /// scalar preheader and update the users in the scalar loop.
613
614 /// Iteratively sink the scalarized operands of a predicated instruction into
615 /// the block that was created for it.
616 void sinkScalarOperands(Instruction *PredInst);
617
618 /// Returns (and creates if needed) the trip count of the widened loop.
620
621 /// Returns a bitcasted value to the requested vector type.
622 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624 const DataLayout &DL);
625
626 /// Emit a bypass check to see if the vector trip count is zero, including if
627 /// it overflows.
629
630 /// Emit a bypass check to see if all of the SCEV assumptions we've
631 /// had to make are correct. Returns the block containing the checks or
632 /// nullptr if no checks have been added.
634
635 /// Emit bypass checks to check any memory assumptions we may have made.
636 /// Returns the block containing the checks or nullptr if no checks have been
637 /// added.
639
640 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
641 /// vector loop preheader, middle block and scalar preheader.
643
644 /// Create new phi nodes for the induction variables to resume iteration count
645 /// in the scalar epilogue, from where the vectorized loop left off.
646 /// In cases where the loop skeleton is more complicated (e.g. epilogue
647 /// vectorization) and the resume values can come from an additional bypass
648 /// block, the \p AdditionalBypass pair provides information about the bypass
649 /// block and the end value on the edge from bypass to this loop.
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
653
654 /// Complete the loop skeleton by adding debug MDs, creating appropriate
655 /// conditional branches in the middle block, preparing the builder and
656 /// running the verifier. Return the preheader of the completed vector loop.
658
659 /// Allow subclasses to override and print debug traces before/after vplan
660 /// execution, when trace information is requested.
661 virtual void printDebugTracesAtStart(){};
662 virtual void printDebugTracesAtEnd(){};
663
664 /// The original loop.
666
667 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
668 /// dynamic knowledge to simplify SCEV expressions and converts them to a
669 /// more usable form.
671
672 /// Loop Info.
674
675 /// Dominator Tree.
677
678 /// Target Library Info.
680
681 /// Target Transform Info.
683
684 /// Assumption Cache.
686
687 /// Interface to emit optimization remarks.
689
690 /// The vectorization SIMD factor to use. Each vector will have this many
691 /// vector elements.
693
695
696 /// The vectorization unroll factor to use. Each scalar is vectorized to this
697 /// many different vector instructions.
698 unsigned UF;
699
700 /// The builder that we use
702
703 // --- Vectorization state ---
704
705 /// The vector-loop preheader.
707
708 /// The scalar-loop preheader.
710
711 /// Middle Block between the vector and the scalar.
713
714 /// The unique ExitBlock of the scalar loop if one exists. Note that
715 /// there can be multiple exiting edges reaching this block.
717
718 /// The scalar loop body.
720
721 /// A list of all bypass blocks. The first block is the entry of the loop.
723
724 /// Store instructions that were predicated.
726
727 /// Trip count of the original loop.
728 Value *TripCount = nullptr;
729
730 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
732
733 /// The legality analysis.
735
736 /// The profitability analysis.
738
739 // Record whether runtime checks are added.
740 bool AddedSafetyChecks = false;
741
742 // Holds the end values for each induction variable. We save the end values
743 // so we can later fix-up the external users of the induction variables.
745
746 /// BFI and PSI are used to check for profile guided size optimizations.
749
750 // Whether this loop should be optimized for size based on profile-guided
751 // size optimizations.
753
754 /// Structure to hold information about generated runtime checks, responsible
755 /// for cleaning the checks, if vectorization turns out unprofitable.
756 GeneratedRTChecks &RTChecks;
757
758 // Holds the resume values for reductions in the loops, used to set the
759 // correct start value of reduction PHIs when vectorizing the epilogue.
762};
763
765public:
768 const TargetLibraryInfo *TLI,
770 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
773 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
775 ElementCount::getFixed(1),
776 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
777 BFI, PSI, Check) {}
778};
779
780/// Encapsulate information regarding vectorization of a loop and its epilogue.
781/// This information is meant to be updated and used across two stages of
782/// epilogue vectorization.
785 unsigned MainLoopUF = 0;
787 unsigned EpilogueUF = 0;
792 Value *TripCount = nullptr;
794
796 ElementCount EVF, unsigned EUF)
797 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798 assert(EUF == 1 &&
799 "A high UF for the epilogue loop is likely not beneficial.");
800 }
801};
802
803/// An extension of the inner loop vectorizer that creates a skeleton for a
804/// vectorized loop that has its epilogue (residual) also vectorized.
805/// The idea is to run the vplan on a given loop twice, first to set up the
806/// skeleton and vectorize the main loop, and second to complete the skeleton
807/// from the first step and vectorize the epilogue. This is achieved by
808/// deriving two concrete strategy classes from this base class and invoking
809/// them in succession from the loop vectorizer planner.
811public:
819 GeneratedRTChecks &Checks)
821 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
822 CM, BFI, PSI, Checks),
823 EPI(EPI) {}
824
825 // Override this function to handle the more complex control flow around the
826 // three loops.
827 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
828 const SCEV2ValueTy &ExpandedSCEVs) final {
829 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
830 }
831
832 /// The interface for creating a vectorized skeleton using one of two
833 /// different strategies, each corresponding to one execution of the vplan
834 /// as described above.
835 virtual std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
837
838 /// Holds and updates state information required to vectorize the main loop
839 /// and its epilogue in two separate passes. This setup helps us avoid
840 /// regenerating and recomputing runtime safety checks. It also helps us to
841 /// shorten the iteration-count-check path length for the cases where the
842 /// iteration count of the loop is so small that the main vector loop is
843 /// completely skipped.
845};
846
847/// A specialized derived class of inner loop vectorizer that performs
848/// vectorization of *main* loops in the process of vectorizing loops and their
849/// epilogues.
851public:
859 GeneratedRTChecks &Check)
861 EPI, LVL, CM, BFI, PSI, Check) {}
862 /// Implements the interface for creating a vectorized skeleton using the
863 /// *main loop* strategy (i.e. the first pass of vplan execution).
864 std::pair<BasicBlock *, Value *>
865 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
866
867protected:
868 /// Emits an iteration count bypass check once for the main loop (when \p
869 /// ForEpilogue is false) and once for the epilogue loop (when \p
870 /// ForEpilogue is true).
871 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872 void printDebugTracesAtStart() override;
873 void printDebugTracesAtEnd() override;
874};
875
876// A specialized derived class of inner loop vectorizer that performs
877// vectorization of *epilogue* loops in the process of vectorizing loops and
878// their epilogues.
880public:
888 GeneratedRTChecks &Checks)
890 EPI, LVL, CM, BFI, PSI, Checks) {
892 }
893 /// Implements the interface for creating a vectorized skeleton using the
894 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
895 std::pair<BasicBlock *, Value *>
896 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897
898protected:
899 /// Emits an iteration count bypass check after the main vector loop has
900 /// finished to see if there are any iterations left to execute by either
901 /// the vector epilogue or the scalar epilogue.
903 BasicBlock *Bypass,
904 BasicBlock *Insert);
905 void printDebugTracesAtStart() override;
906 void printDebugTracesAtEnd() override;
907};
908} // end namespace llvm
909
910/// Look for a meaningful debug location on the instruction or its
911/// operands.
913 if (!I)
914 return DebugLoc();
915
917 if (I->getDebugLoc() != Empty)
918 return I->getDebugLoc();
919
920 for (Use &Op : I->operands()) {
921 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
922 if (OpInst->getDebugLoc() != Empty)
923 return OpInst->getDebugLoc();
924 }
925
926 return I->getDebugLoc();
927}
928
929/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
930/// is passed, the message relates to that particular instruction.
931#ifndef NDEBUG
932static void debugVectorizationMessage(const StringRef Prefix,
933 const StringRef DebugMsg,
934 Instruction *I) {
935 dbgs() << "LV: " << Prefix << DebugMsg;
936 if (I != nullptr)
937 dbgs() << " " << *I;
938 else
939 dbgs() << '.';
940 dbgs() << '\n';
941}
942#endif
943
944/// Create an analysis remark that explains why vectorization failed
945///
946/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
947/// RemarkName is the identifier for the remark. If \p I is passed it is an
948/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
949/// the location of the remark. \return the remark object that can be
950/// streamed to.
952 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
953 Value *CodeRegion = TheLoop->getHeader();
954 DebugLoc DL = TheLoop->getStartLoc();
955
956 if (I) {
957 CodeRegion = I->getParent();
958 // If there is no debug location attached to the instruction, fall back to
959 // using the loop's.
960 if (I->getDebugLoc())
961 DL = I->getDebugLoc();
962 }
963
964 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
965}
966
967namespace llvm {
968
969/// Return a value for Step multiplied by VF.
971 int64_t Step) {
972 assert(Ty->isIntegerTy() && "Expected an integer step");
973 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
974}
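// For example (a sketch of the intended semantics): for a fixed VF of 4 and
// Step = 2 this returns the constant 8, while for a scalable VF of
// <vscale x 4> it returns the runtime value vscale * 8.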
975
976/// Return the runtime value for VF.
978 return B.CreateElementCount(Ty, VF);
979}
980
982 Loop *OrigLoop) {
983 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
984 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
985
986 ScalarEvolution &SE = *PSE.getSE();
987 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
988}
989
991 const StringRef OREMsg, const StringRef ORETag,
992 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
993 Instruction *I) {
994 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
995 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
996 ORE->emit(
997 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
998 << "loop not vectorized: " << OREMsg);
999}
1000
1001void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1002 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1003 Instruction *I) {
1005 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1006 ORE->emit(
1007 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1008 << Msg);
1009}
1010
1011/// Report successful vectorization of the loop. In case an outer loop is
1012/// vectorized, prepend "outer" to the vectorization remark.
1014 VectorizationFactor VF, unsigned IC) {
1016 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1017 nullptr));
1018 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1019 ORE->emit([&]() {
1020 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1021 TheLoop->getHeader())
1022 << "vectorized " << LoopType << "loop (vectorization width: "
1023 << ore::NV("VectorizationFactor", VF.Width)
1024 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1025 });
1026}
1027
1028} // end namespace llvm
1029
1030namespace llvm {
1031
1032// Loop vectorization cost-model hints how the scalar epilogue loop should be
1033// lowered.
1035
1036 // The default: allowing scalar epilogues.
1038
1039 // Vectorization with OptForSize: don't allow epilogues.
1041
1042 // A special case of vectorisation with OptForSize: loops with a very small
1043 // trip count are considered for vectorization under OptForSize, thereby
1044 // making sure the cost of their loop body is dominant, free of runtime
1045 // guards and scalar iteration overheads.
1047
1048 // Loop hint predicate indicating an epilogue is undesired.
1050
1051 // Directive indicating we must either tail fold or not vectorize
1054
1055using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1056
1057/// LoopVectorizationCostModel - estimates the expected speedups due to
1058/// vectorization.
1059/// In many cases vectorization is not profitable. This can happen for
1060/// a number of reasons. In this class we mainly attempt to predict the
1061/// expected speedup/slowdowns due to the supported instruction set. We use the
1062/// TargetTransformInfo to query the different backends for the cost of
1063/// different operations.
1065public:
1069 const TargetTransformInfo &TTI,
1075 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1076 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1077 Hints(Hints), InterleaveInfo(IAI) {}
1078
1079 /// \return An upper bound for the vectorization factors (both fixed and
1080 /// scalable). If the factors are 0, vectorization and interleaving should be
1081 /// avoided up front.
1082 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1083
1084 /// \return True if runtime checks are required for vectorization, and false
1085 /// otherwise.
1086 bool runtimeChecksRequired();
1087
1088 /// Setup cost-based decisions for user vectorization factor.
1089 /// \return true if the UserVF is a feasible VF to be chosen.
1093 return expectedCost(UserVF).first.isValid();
1094 }
1095
1096 /// \return The size (in bits) of the smallest and widest types in the code
1097 /// that needs to be vectorized. We ignore values that remain scalar such as
1098 /// 64 bit loop indices.
1099 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1100
1101 /// \return The desired interleave count.
1102 /// If interleave count has been specified by metadata it will be returned.
1103 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1104 /// are the selected vectorization factor and the cost of the selected VF.
1105 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1106
1107 /// Memory access instruction may be vectorized in more than one way.
1108 /// Form of instruction after vectorization depends on cost.
1109 /// This function takes cost-based decisions for Load/Store instructions
1110 /// and collects them in a map. This decisions map is used for building
1111 /// the lists of loop-uniform and loop-scalar instructions.
1112 /// The calculated cost is saved with widening decision in order to
1113 /// avoid redundant calculations.
1115
1116 /// A call may be vectorized in different ways depending on whether we have
1117 /// vectorized variants available and whether the target supports masking.
1118 /// This function analyzes all calls in the function at the supplied VF,
1119 /// makes a decision based on the costs of available options, and stores that
1120 /// decision in a map for use in planning and plan execution.
1122
1123 /// A struct that represents some properties of the register usage
1124 /// of a loop.
1126 /// Holds the number of loop invariant values that are used in the loop.
1127 /// The key is ClassID of target-provided register class.
1129 /// Holds the maximum number of concurrent live intervals in the loop.
1130 /// The key is ClassID of target-provided register class.
1132 };
1133
1134 /// \return Returns information about the register usages of the loop for the
1135 /// given vectorization factors.
1138
1139 /// Collect values we want to ignore in the cost model.
1140 void collectValuesToIgnore();
1141
1142 /// Collect all element types in the loop for which widening is needed.
1144
1145 /// Split reductions into those that happen in the loop, and those that happen
1146 /// outside. In loop reductions are collected into InLoopReductions.
1148
1149 /// Returns true if we should use strict in-order reductions for the given
1150 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1151 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1152 /// of FP operations.
1153 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1154 return !Hints->allowReordering() && RdxDesc.isOrdered();
1155 }
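  // When this returns true, a floating-point reduction is emitted as a strict,
  // in-order reduction (e.g. chaining llvm.vector.reduce.fadd with the running
  // scalar accumulator each iteration) rather than being reassociated into
  // parallel partial sums; this preserves the scalar loop's rounding behavior.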
1156
1157 /// \returns The smallest bitwidth each instruction can be represented with.
1158 /// The vector equivalents of these instructions should be truncated to this
1159 /// type.
1161 return MinBWs;
1162 }
1163
1164 /// \returns True if it is more profitable to scalarize instruction \p I for
1165 /// vectorization factor \p VF.
1167 assert(VF.isVector() &&
1168 "Profitable to scalarize relevant only for VF > 1.");
1169 assert(
1170 TheLoop->isInnermost() &&
1171 "cost-model should not be used for outer loops (in VPlan-native path)");
1172
1173 auto Scalars = InstsToScalarize.find(VF);
1174 assert(Scalars != InstsToScalarize.end() &&
1175 "VF not yet analyzed for scalarization profitability");
1176 return Scalars->second.contains(I);
1177 }
1178
1179 /// Returns true if \p I is known to be uniform after vectorization.
1181 assert(
1182 TheLoop->isInnermost() &&
1183 "cost-model should not be used for outer loops (in VPlan-native path)");
1184 // Pseudo probe needs to be duplicated for each unrolled iteration and
1185 // vector lane so that profiled loop trip count can be accurately
1186 // accumulated instead of being undercounted.
1187 if (isa<PseudoProbeInst>(I))
1188 return false;
1189
1190 if (VF.isScalar())
1191 return true;
1192
1193 auto UniformsPerVF = Uniforms.find(VF);
1194 assert(UniformsPerVF != Uniforms.end() &&
1195 "VF not yet analyzed for uniformity");
1196 return UniformsPerVF->second.count(I);
1197 }
1198
1199 /// Returns true if \p I is known to be scalar after vectorization.
1201 assert(
1202 TheLoop->isInnermost() &&
1203 "cost-model should not be used for outer loops (in VPlan-native path)");
1204 if (VF.isScalar())
1205 return true;
1206
1207 auto ScalarsPerVF = Scalars.find(VF);
1208 assert(ScalarsPerVF != Scalars.end() &&
1209 "Scalar values are not calculated for VF");
1210 return ScalarsPerVF->second.count(I);
1211 }
1212
1213 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1214 /// for vectorization factor \p VF.
1216 return VF.isVector() && MinBWs.contains(I) &&
1217 !isProfitableToScalarize(I, VF) &&
1219 }
1220
1221 /// Decision that was taken during cost calculation for memory instruction.
1224 CM_Widen, // For consecutive accesses with stride +1.
1225 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1232
1233 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1234 /// instruction \p I and vector width \p VF.
1237 assert(VF.isVector() && "Expected VF >=2");
1238 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1239 }
1240
1241 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1242 /// interleaving group \p Grp and vector width \p VF.
1246 assert(VF.isVector() && "Expected VF >=2");
1247 /// Broadcast this decision to all instructions inside the group.
1248 /// But the cost will be assigned to one instruction only.
1249 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1250 if (auto *I = Grp->getMember(i)) {
1251 if (Grp->getInsertPos() == I)
1252 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1253 else
1254 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1255 }
1256 }
1257 }
1258
1259 /// Return the cost model decision for the given instruction \p I and vector
1260 /// width \p VF. Return CM_Unknown if this instruction did not pass
1261 /// through the cost modeling.
1263 assert(VF.isVector() && "Expected VF to be a vector VF");
1264 assert(
1265 TheLoop->isInnermost() &&
1266 "cost-model should not be used for outer loops (in VPlan-native path)");
1267
1268 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1269 auto Itr = WideningDecisions.find(InstOnVF);
1270 if (Itr == WideningDecisions.end())
1271 return CM_Unknown;
1272 return Itr->second.first;
1273 }
1274
1275 /// Return the vectorization cost for the given instruction \p I and vector
1276 /// width \p VF.
1278 assert(VF.isVector() && "Expected VF >=2");
1279 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1280 assert(WideningDecisions.contains(InstOnVF) &&
1281 "The cost is not calculated");
1282 return WideningDecisions[InstOnVF].second;
1283 }
1284
1289 std::optional<unsigned> MaskPos;
1291 };
1292
1294 Function *Variant, Intrinsic::ID IID,
1295 std::optional<unsigned> MaskPos,
1297 assert(!VF.isScalar() && "Expected vector VF");
1298 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1299 MaskPos, Cost};
1300 }
1301
1303 ElementCount VF) const {
1304 assert(!VF.isScalar() && "Expected vector VF");
1305 return CallWideningDecisions.at(std::make_pair(CI, VF));
1306 }
1307
1308 /// Return True if instruction \p I is an optimizable truncate whose operand
1309 /// is an induction variable. Such a truncate will be removed by adding a new
1310 /// induction variable with the destination type.
1312 // If the instruction is not a truncate, return false.
1313 auto *Trunc = dyn_cast<TruncInst>(I);
1314 if (!Trunc)
1315 return false;
1316
1317 // Get the source and destination types of the truncate.
1318 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1319 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1320
1321 // If the truncate is free for the given types, return false. Replacing a
1322 // free truncate with an induction variable would add an induction variable
1323 // update instruction to each iteration of the loop. We exclude from this
1324 // check the primary induction variable since it will need an update
1325 // instruction regardless.
1326 Value *Op = Trunc->getOperand(0);
1327 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1328 return false;
1329
1330 // If the truncated value is not an induction variable, return false.
1331 return Legal->isInductionPhi(Op);
1332 }
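  // Example of the pattern this matches (a sketch): for a loop with an i64
  // induction variable %iv, an in-loop "trunc i64 %iv to i32" can be replaced
  // by a new i32 induction variable, so the truncate itself need not be
  // widened.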
1333
1334 /// Collects the instructions to scalarize for each predicated instruction in
1335 /// the loop.
1337
1338 /// Collect Uniform and Scalar values for the given \p VF.
1339 /// The sets depend on CM decision for Load/Store instructions
1340 /// that may be vectorized as interleave, gather-scatter or scalarized.
1341 /// Also make a decision on what to do about call instructions in the loop
1342 /// at that VF -- scalarize, call a known vector routine, or call a
1343 /// vector intrinsic.
1345 // Do the analysis once.
1346 if (VF.isScalar() || Uniforms.contains(VF))
1347 return;
1350 collectLoopUniforms(VF);
1351 collectLoopScalars(VF);
1352 }
1353
1354 /// Returns true if the target machine supports masked store operation
1355 /// for the given \p DataType and kind of access to \p Ptr.
1356 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1357 return Legal->isConsecutivePtr(DataType, Ptr) &&
1358 TTI.isLegalMaskedStore(DataType, Alignment);
1359 }
1360
1361 /// Returns true if the target machine supports masked load operation
1362 /// for the given \p DataType and kind of access to \p Ptr.
1363 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1364 return Legal->isConsecutivePtr(DataType, Ptr) &&
1365 TTI.isLegalMaskedLoad(DataType, Alignment);
1366 }
1367
1368 /// Returns true if the target machine can represent \p V as a masked gather
1369 /// or scatter operation.
1371 bool LI = isa<LoadInst>(V);
1372 bool SI = isa<StoreInst>(V);
1373 if (!LI && !SI)
1374 return false;
1375 auto *Ty = getLoadStoreType(V);
1377 if (VF.isVector())
1378 Ty = VectorType::get(Ty, VF);
1379 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1380 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1381 }
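  // For example, an indirect access such as "A[Idx[i]]" has no consecutive
  // pointer, so it can only be widened if the target can lower it as a masked
  // gather (load) or masked scatter (store) of the vector type built here.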
1382
1383 /// Returns true if the target machine supports all of the reduction
1384 /// variables found for the given VF.
1386 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1387 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1388 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1389 }));
1390 }
1391
1392 /// Given costs for both strategies, return true if the scalar predication
1393 /// lowering should be used for div/rem. This incorporates an override
1394 /// option so it is not simply a cost comparison.
1396 InstructionCost SafeDivisorCost) const {
1397 switch (ForceSafeDivisor) {
1398 case cl::BOU_UNSET:
1399 return ScalarCost < SafeDivisorCost;
1400 case cl::BOU_TRUE:
1401 return false;
1402 case cl::BOU_FALSE:
1403 return true;
1404 };
1405 llvm_unreachable("impossible case value");
1406 }
1407
1408 /// Returns true if \p I is an instruction which requires predication and
1409 /// for which our chosen predication strategy is scalarization (i.e. we
1410 /// don't have an alternate strategy such as masking available).
1411 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1413
1414 /// Returns true if \p I is an instruction that needs to be predicated
1415 /// at runtime. The result is independent of the predication mechanism.
1416 /// Superset of instructions that return true for isScalarWithPredication.
1417 bool isPredicatedInst(Instruction *I) const;
1418
1419 /// Return the costs for our two available strategies for lowering a
1420 /// div/rem operation which requires speculating at least one lane.
1421 /// First result is for scalarization (will be invalid for scalable
1422 /// vectors); second is for the safe-divisor strategy.
1423 std::pair<InstructionCost, InstructionCost>
1425 ElementCount VF) const;
1426
1427 /// Returns true if \p I is a memory instruction with consecutive memory
1428 /// access that can be widened.
1430
1431 /// Returns true if \p I is a memory instruction in an interleaved-group
1432 /// of memory accesses that can be vectorized with wide vector loads/stores
1433 /// and shuffles.
1435
1436 /// Check if \p Instr belongs to any interleaved access group.
1438 return InterleaveInfo.isInterleaved(Instr);
1439 }
1440
1441 /// Get the interleaved access group that \p Instr belongs to.
1444 return InterleaveInfo.getInterleaveGroup(Instr);
1445 }
1446
1447 /// Returns true if we're required to use a scalar epilogue for at least
1448 /// the final iteration of the original loop.
1449 bool requiresScalarEpilogue(bool IsVectorizing) const {
1450 if (!isScalarEpilogueAllowed()) {
1451 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1452 return false;
1453 }
1454 // If we might exit from anywhere but the latch, must run the exiting
1455 // iteration in scalar form.
1457 LLVM_DEBUG(
1458 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1459 return true;
1460 }
1461 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1462 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1463 "interleaved group requires scalar epilogue\n");
1464 return true;
1465 }
1466 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1467 return false;
1468 }
1469
1470 /// Returns true if we're required to use a scalar epilogue for at least
1471 /// the final iteration of the original loop for all VFs in \p Range.
1472 /// A scalar epilogue must either be required for all VFs in \p Range or for
1473 /// none.
1475 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1476 return requiresScalarEpilogue(VF.isVector());
1477 };
1478 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1479 assert(
1480 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1481 "all VFs in range must agree on whether a scalar epilogue is required");
1482 return IsRequired;
1483 }
1484
1485 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1486 /// loop hint annotation.
1488 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1489 }
1490
1491 /// Returns the TailFoldingStyle that is best for the current loop.
1492 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1493 if (!ChosenTailFoldingStyle)
1495 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1496 : ChosenTailFoldingStyle->second;
1497 }
1498
1499 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1500 /// overflow or not.
1501 /// \param IsScalableVF true if scalable vector factors are enabled.
1502 /// \param UserIC User specific interleave count.
1503 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1504 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1506 ChosenTailFoldingStyle =
1508 return;
1509 }
1510
1511 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1512 ChosenTailFoldingStyle = std::make_pair(
1513 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1514 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1515 return;
1516 }
1517
1518 // Set styles when forced.
1519 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1520 ForceTailFoldingStyle.getValue());
1522 return;
1523 // Override forced styles if needed.
1524 // FIXME: use actual opcode/data type for analysis here.
1525 // FIXME: Investigate opportunity for fixed vector factor.
1526 bool EVLIsLegal =
1527 IsScalableVF && UserIC <= 1 &&
1528 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1530 // FIXME: implement support for max safe dependency distance.
1532 // FIXME: remove this once reductions are supported.
1534 if (!EVLIsLegal) {
1535 // If for some reason EVL mode is unsupported, fallback to
1536 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1537 // in a generic way.
1538 ChosenTailFoldingStyle =
1541 LLVM_DEBUG(
1542 dbgs()
1543 << "LV: Preference for VP intrinsics indicated. Will "
1544 "not try to generate VP Intrinsics "
1545 << (UserIC > 1
1546 ? "since interleave count specified is greater than 1.\n"
1547 : "due to non-interleaving reasons.\n"));
1548 }
1549 }
1550
1551 /// Returns true if all loop blocks should be masked to fold tail loop.
1552 bool foldTailByMasking() const {
1553 // TODO: check if it is possible to check for None style independent of
1554 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1556 }
1557
1558 /// Returns true if the instructions in this block require predication
1559 /// for any reason, e.g. because tail folding now requires a predicate
1560 /// or because the block in the original loop was predicated.
1563 }
1564
1565 /// Returns true if VP intrinsics with explicit vector length support should
1566 /// be generated in the tail folded loop.
1567 bool foldTailWithEVL() const {
1569 }
1570
1571 /// Returns true if the Phi is part of an inloop reduction.
1572 bool isInLoopReduction(PHINode *Phi) const {
1573 return InLoopReductions.contains(Phi);
1574 }
1575
1576 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1577 /// with factor VF. Return the cost of the instruction, including
1578 /// scalarization overhead if it's needed.
1580
1581 /// Estimate cost of a call instruction CI if it were vectorized with factor
1582 /// VF. Return the cost of the instruction, including scalarization overhead
1583 /// if it's needed.
1585
1586 /// Invalidates decisions already taken by the cost model.
1588 WideningDecisions.clear();
1589 CallWideningDecisions.clear();
1590 Uniforms.clear();
1591 Scalars.clear();
1592 }
1593
1594 /// The vectorization cost is a combination of the cost itself and a boolean
1595 /// indicating whether any of the contributing operations will actually
1596 /// operate on vector values after type legalization in the backend. If this
1597 /// latter value is false, then all operations will be scalarized (i.e. no
1598 /// vectorization has actually taken place).
1599 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1600
1601 /// Returns the expected execution cost. The unit of the cost does
1602 /// not matter because we use the 'cost' units to compare different
1603 /// vector widths. The cost that is returned is *not* normalized by
1604 /// the factor width. If \p Invalid is not nullptr, this function
1605 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1606 /// each instruction that has an Invalid cost for the given VF.
1610
1611 bool hasPredStores() const { return NumPredStores > 0; }
1612
1613 /// Returns true if epilogue vectorization is considered profitable, and
1614 /// false otherwise.
1615 /// \p VF is the vectorization factor chosen for the original loop.
1617
1618private:
1619 unsigned NumPredStores = 0;
1620
1621 /// \return An upper bound for the vectorization factors for both
1622 /// fixed and scalable vectorization, where the minimum-known number of
1623 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1624 /// disabled or unsupported, then the scalable part will be equal to
1625 /// ElementCount::getScalable(0).
1626 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1627 ElementCount UserVF,
1628 bool FoldTailByMasking);
1629
1630 /// \return the maximized element count based on the targets vector
1631 /// registers and the loop trip-count, but limited to a maximum safe VF.
1632 /// This is a helper function of computeFeasibleMaxVF.
1633 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1634 unsigned SmallestType,
1635 unsigned WidestType,
1636 ElementCount MaxSafeVF,
1637 bool FoldTailByMasking);
1638
1639 /// \return the maximum legal scalable VF, based on the safe max number
1640 /// of elements.
1641 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1642
1643 /// Returns the execution time cost of an instruction for a given vector
1644 /// width. Vector width of one means scalar.
1645 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1646
1647 /// The cost-computation logic from getInstructionCost which provides
1648 /// the vector type as an output parameter.
1649 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1650 Type *&VectorTy);
1651
1652 /// Return the cost of instructions in an inloop reduction pattern, if I is
1653 /// part of that pattern.
1654 std::optional<InstructionCost>
1655 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1657
1658 /// Calculate vectorization cost of memory instruction \p I.
1659 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1660
1661  /// The cost computation for a scalarized memory instruction.
1662 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1663
1664  /// The cost computation for an interleave group of memory instructions.
1665 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost computation for Gather/Scatter instruction.
1668 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost computation for widening instruction \p I with consecutive
1671 /// memory access.
1672 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1673
1674 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1675 /// Load: scalar load + broadcast.
1676 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1677 /// element)
1678 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1679
1680 /// Estimate the overhead of scalarizing an instruction. This is a
1681 /// convenience wrapper for the type-based getScalarizationOverhead API.
1682 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1684
1685 /// Returns true if an artificially high cost for emulated masked memrefs
1686 /// should be used.
1687 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1688
1689 /// Map of scalar integer values to the smallest bitwidth they can be legally
1690 /// represented as. The vector equivalents of these values should be truncated
1691 /// to this type.
1693
1694 /// A type representing the costs for instructions if they were to be
1695 /// scalarized rather than vectorized. The entries are Instruction-Cost
1696 /// pairs.
1697 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1698
1699  /// A set containing all BasicBlocks that are known to be present after
1700  /// vectorization as predicated blocks.
1702 PredicatedBBsAfterVectorization;
1703
1704 /// Records whether it is allowed to have the original scalar loop execute at
1705 /// least once. This may be needed as a fallback loop in case runtime
1706 /// aliasing/dependence checks fail, or to handle the tail/remainder
1707 /// iterations when the trip count is unknown or doesn't divide by the VF,
1708 /// or as a peel-loop to handle gaps in interleave-groups.
1709 /// Under optsize and when the trip count is very small we don't allow any
1710 /// iterations to execute in the scalar loop.
1711 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1712
1713  /// Controls the finally chosen tail folding style. The first element is used
1714  /// if the IV update may overflow; the second element is used if it does not.
1715 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1716 ChosenTailFoldingStyle;
1717
1718 /// A map holding scalar costs for different vectorization factors. The
1719 /// presence of a cost for an instruction in the mapping indicates that the
1720 /// instruction will be scalarized when vectorizing with the associated
1721 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1723
1724 /// Holds the instructions known to be uniform after vectorization.
1725 /// The data is collected per VF.
1727
1728 /// Holds the instructions known to be scalar after vectorization.
1729 /// The data is collected per VF.
1731
1732 /// Holds the instructions (address computations) that are forced to be
1733 /// scalarized.
1735
1736 /// PHINodes of the reductions that should be expanded in-loop.
1737 SmallPtrSet<PHINode *, 4> InLoopReductions;
1738
1739 /// A Map of inloop reduction operations and their immediate chain operand.
1740 /// FIXME: This can be removed once reductions can be costed correctly in
1741 /// VPlan. This was added to allow quick lookup of the inloop operations.
1742 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1743
1744 /// Returns the expected difference in cost from scalarizing the expression
1745 /// feeding a predicated instruction \p PredInst. The instructions to
1746 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1747 /// non-negative return value implies the expression will be scalarized.
1748 /// Currently, only single-use chains are considered for scalarization.
1749 InstructionCost computePredInstDiscount(Instruction *PredInst,
1750 ScalarCostsTy &ScalarCosts,
1751 ElementCount VF);
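  // E.g., assuming %t = add i32 %x, %y is used only by a predicated
  // %d = udiv i32 %t, %z, the discount roughly compares the cost of keeping %t
  // vectorized (plus extracting its lanes for the scalarized, predicated %d)
  // against computing %t as VF scalar adds; a non-negative result means the
  // whole single-use chain is scalarized together with %d.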
1752
1753 /// Collect the instructions that are uniform after vectorization. An
1754 /// instruction is uniform if we represent it with a single scalar value in
1755 /// the vectorized loop corresponding to each vector iteration. Examples of
1756 /// uniform instructions include pointer operands of consecutive or
1757 /// interleaved memory accesses. Note that although uniformity implies an
1758 /// instruction will be scalar, the reverse is not true. In general, a
1759 /// scalarized instruction will be represented by VF scalar values in the
1760 /// vectorized loop, each corresponding to an iteration of the original
1761 /// scalar loop.
1762 void collectLoopUniforms(ElementCount VF);
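  // E.g., assuming VF = 4 and a consecutive access A[i], the address
  // computation feeding the load stays uniform while the load itself widens:
  //   %gep  = getelementptr inbounds i32, ptr %A, i64 %index ; one scalar GEP
  //   %wide = load <4 x i32>, ptr %gep                       ; one wide load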
1763
1764 /// Collect the instructions that are scalar after vectorization. An
1765 /// instruction is scalar if it is known to be uniform or will be scalarized
1766 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1767 /// to the list if they are used by a load/store instruction that is marked as
1768 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1769 /// VF values in the vectorized loop, each corresponding to an iteration of
1770 /// the original scalar loop.
1771 void collectLoopScalars(ElementCount VF);
1772
1773 /// Keeps cost model vectorization decision and cost for instructions.
1774 /// Right now it is used for memory instructions only.
1776 std::pair<InstWidening, InstructionCost>>;
1777
1778 DecisionList WideningDecisions;
1779
1780 using CallDecisionList =
1781 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1782
1783 CallDecisionList CallWideningDecisions;
1784
1785 /// Returns true if \p V is expected to be vectorized and it needs to be
1786 /// extracted.
1787 bool needsExtract(Value *V, ElementCount VF) const {
1788 Instruction *I = dyn_cast<Instruction>(V);
1789 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1791 return false;
1792
1793 // Assume we can vectorize V (and hence we need extraction) if the
1794 // scalars are not computed yet. This can happen, because it is called
1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796 // the scalars are collected. That should be a safe assumption in most
1797 // cases, because we check if the operands have vectorizable types
1798 // beforehand in LoopVectorizationLegality.
1799 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1800 };
1801
1802 /// Returns a range containing only operands needing to be extracted.
1803 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1804 ElementCount VF) const {
1806 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1807 }
1808
1809public:
1810 /// The loop that we evaluate.
1812
1813 /// Predicated scalar evolution analysis.
1815
1816 /// Loop Info analysis.
1818
1819 /// Vectorization legality.
1821
1822 /// Vector target information.
1824
1825 /// Target Library Info.
1827
1828 /// Demanded bits analysis.
1830
1831 /// Assumption cache.
1833
1834 /// Interface to emit optimization remarks.
1836
1838
1839 /// Loop Vectorize Hint.
1841
1842 /// The interleave access information contains groups of interleaved accesses
1843  /// that have the same stride and are close to each other.
1845
1846 /// Values to ignore in the cost model.
1848
1849 /// Values to ignore in the cost model when VF > 1.
1851
1852 /// All element types found in the loop.
1854};
1855} // end namespace llvm
1856
1857namespace {
1858/// Helper struct to manage generating runtime checks for vectorization.
1859///
1860/// The runtime checks are created up-front in temporary blocks, to allow better
1861/// cost estimation, and are un-linked from the existing IR. After deciding to
1862/// vectorize, the checks are moved back. If the decision is not to vectorize,
1863/// the temporary blocks are removed completely.
1864class GeneratedRTChecks {
1865 /// Basic block which contains the generated SCEV checks, if any.
1866 BasicBlock *SCEVCheckBlock = nullptr;
1867
1868 /// The value representing the result of the generated SCEV checks. If it is
1869 /// nullptr, either no SCEV checks have been generated or they have been used.
1870 Value *SCEVCheckCond = nullptr;
1871
1872 /// Basic block which contains the generated memory runtime checks, if any.
1873 BasicBlock *MemCheckBlock = nullptr;
1874
1875 /// The value representing the result of the generated memory runtime checks.
1876 /// If it is nullptr, either no memory runtime checks have been generated or
1877 /// they have been used.
1878 Value *MemRuntimeCheckCond = nullptr;
1879
1880 DominatorTree *DT;
1881 LoopInfo *LI;
1883
1884 SCEVExpander SCEVExp;
1885 SCEVExpander MemCheckExp;
1886
1887 bool CostTooHigh = false;
1888 const bool AddBranchWeights;
1889
1890 Loop *OuterLoop = nullptr;
1891
1892public:
1893 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1895 bool AddBranchWeights)
1896 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1897 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1898
1899 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1900 /// accurately estimate the cost of the runtime checks. The blocks are
1901  /// un-linked from the IR and are added back during vector code generation. If
1902 /// there is no vector code generation, the check blocks are removed
1903 /// completely.
1904 void Create(Loop *L, const LoopAccessInfo &LAI,
1905 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1906
1907 // Hard cutoff to limit compile-time increase in case a very large number of
1908 // runtime checks needs to be generated.
1909 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1910 // profile info.
1911 CostTooHigh =
1913 if (CostTooHigh)
1914 return;
1915
1916 BasicBlock *LoopHeader = L->getHeader();
1917 BasicBlock *Preheader = L->getLoopPreheader();
1918
1919 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1920 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1921 // may be used by SCEVExpander. The blocks will be un-linked from their
1922 // predecessors and removed from LI & DT at the end of the function.
1923 if (!UnionPred.isAlwaysTrue()) {
1924 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1925 nullptr, "vector.scevcheck");
1926
1927 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1928 &UnionPred, SCEVCheckBlock->getTerminator());
1929 }
1930
1931 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1932 if (RtPtrChecking.Need) {
1933 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1934 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1935 "vector.memcheck");
1936
1937 auto DiffChecks = RtPtrChecking.getDiffChecks();
1938 if (DiffChecks) {
1939 Value *RuntimeVF = nullptr;
1940 MemRuntimeCheckCond = addDiffRuntimeChecks(
1941 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1942 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1943 if (!RuntimeVF)
1944 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1945 return RuntimeVF;
1946 },
1947 IC);
1948 } else {
1949 MemRuntimeCheckCond = addRuntimeChecks(
1950 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1952 }
1953 assert(MemRuntimeCheckCond &&
1954 "no RT checks generated although RtPtrChecking "
1955 "claimed checks are required");
1956 }
1957
1958 if (!MemCheckBlock && !SCEVCheckBlock)
1959 return;
1960
1961    // Unhook the temporary blocks containing the checks and update various
1962    // places accordingly.
1963 if (SCEVCheckBlock)
1964 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1965 if (MemCheckBlock)
1966 MemCheckBlock->replaceAllUsesWith(Preheader);
1967
1968 if (SCEVCheckBlock) {
1969 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1970 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1971 Preheader->getTerminator()->eraseFromParent();
1972 }
1973 if (MemCheckBlock) {
1974 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1975 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1976 Preheader->getTerminator()->eraseFromParent();
1977 }
1978
1979 DT->changeImmediateDominator(LoopHeader, Preheader);
1980 if (MemCheckBlock) {
1981 DT->eraseNode(MemCheckBlock);
1982 LI->removeBlock(MemCheckBlock);
1983 }
1984 if (SCEVCheckBlock) {
1985 DT->eraseNode(SCEVCheckBlock);
1986 LI->removeBlock(SCEVCheckBlock);
1987 }
1988
1989 // Outer loop is used as part of the later cost calculations.
1990 OuterLoop = L->getParentLoop();
1991 }
1992
1993 InstructionCost getCost() {
1994 if (SCEVCheckBlock || MemCheckBlock)
1995 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1996
1997 if (CostTooHigh) {
1999 Cost.setInvalid();
2000 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2001 return Cost;
2002 }
2003
2004 InstructionCost RTCheckCost = 0;
2005 if (SCEVCheckBlock)
2006 for (Instruction &I : *SCEVCheckBlock) {
2007 if (SCEVCheckBlock->getTerminator() == &I)
2008 continue;
2011 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2012 RTCheckCost += C;
2013 }
2014 if (MemCheckBlock) {
2015 InstructionCost MemCheckCost = 0;
2016 for (Instruction &I : *MemCheckBlock) {
2017 if (MemCheckBlock->getTerminator() == &I)
2018 continue;
2021 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2022 MemCheckCost += C;
2023 }
2024
2025      // If the runtime memory checks are being created inside an outer loop,
2026      // we should find out if these checks are outer-loop invariant. If so,
2027      // the checks will likely be hoisted out and so the effective cost will be
2028      // reduced according to the outer loop trip count.
2029 if (OuterLoop) {
2030 ScalarEvolution *SE = MemCheckExp.getSE();
2031 // TODO: If profitable, we could refine this further by analysing every
2032 // individual memory check, since there could be a mixture of loop
2033 // variant and invariant checks that mean the final condition is
2034 // variant.
2035 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2036 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2037 // It seems reasonable to assume that we can reduce the effective
2038 // cost of the checks even when we know nothing about the trip
2039 // count. Assume that the outer loop executes at least twice.
2040 unsigned BestTripCount = 2;
2041
2042        // If the exact trip count is known, use that.
2043 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2044 BestTripCount = SmallTC;
2046 // Else use profile data if available.
2047 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2048 BestTripCount = *EstimatedTC;
2049 }
2050
2051 BestTripCount = std::max(BestTripCount, 1U);
2052 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2053
2054 // Let's ensure the cost is always at least 1.
2055 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2057
2058 if (BestTripCount > 1)
2060 << "We expect runtime memory checks to be hoisted "
2061 << "out of the outer loop. Cost reduced from "
2062 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2063
2064 MemCheckCost = NewMemCheckCost;
2065 }
2066 }
2067
2068 RTCheckCost += MemCheckCost;
2069 }
2070
2071 if (SCEVCheckBlock || MemCheckBlock)
2072 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2073 << "\n");
2074
2075 return RTCheckCost;
2076 }
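  // E.g., assuming MemCheckCost = 20 and an outer loop with an estimated trip
  // count of 10, the loop-invariant memory checks are expected to be hoisted,
  // so the memory-check part of the cost returned above becomes
  // max(20 / 10, 1) = 2.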
2077
2078 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2079 /// unused.
2080 ~GeneratedRTChecks() {
2081 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2082 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2083 if (!SCEVCheckCond)
2084 SCEVCleaner.markResultUsed();
2085
2086 if (!MemRuntimeCheckCond)
2087 MemCheckCleaner.markResultUsed();
2088
2089 if (MemRuntimeCheckCond) {
2090 auto &SE = *MemCheckExp.getSE();
2091 // Memory runtime check generation creates compares that use expanded
2092 // values. Remove them before running the SCEVExpanderCleaners.
2093 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2094 if (MemCheckExp.isInsertedInstruction(&I))
2095 continue;
2096 SE.forgetValue(&I);
2097 I.eraseFromParent();
2098 }
2099 }
2100 MemCheckCleaner.cleanup();
2101 SCEVCleaner.cleanup();
2102
2103 if (SCEVCheckCond)
2104 SCEVCheckBlock->eraseFromParent();
2105 if (MemRuntimeCheckCond)
2106 MemCheckBlock->eraseFromParent();
2107 }
2108
2109 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2110 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2111 /// depending on the generated condition.
2112 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2113 BasicBlock *LoopVectorPreHeader,
2114 BasicBlock *LoopExitBlock) {
2115 if (!SCEVCheckCond)
2116 return nullptr;
2117
2118 Value *Cond = SCEVCheckCond;
2119 // Mark the check as used, to prevent it from being removed during cleanup.
2120 SCEVCheckCond = nullptr;
2121 if (auto *C = dyn_cast<ConstantInt>(Cond))
2122 if (C->isZero())
2123 return nullptr;
2124
2125 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2126
2127 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2128 // Create new preheader for vector loop.
2129 if (OuterLoop)
2130 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2131
2132 SCEVCheckBlock->getTerminator()->eraseFromParent();
2133 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2134 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2135 SCEVCheckBlock);
2136
2137 DT->addNewBlock(SCEVCheckBlock, Pred);
2138 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2139
2140 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2141 if (AddBranchWeights)
2142 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2143 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2144 return SCEVCheckBlock;
2145 }
2146
2147 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2148 /// the branches to branch to the vector preheader or \p Bypass, depending on
2149 /// the generated condition.
2150 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2151 BasicBlock *LoopVectorPreHeader) {
2152    // Check if we generated code that checks at runtime whether arrays overlap.
2153 if (!MemRuntimeCheckCond)
2154 return nullptr;
2155
2156 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2157 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2158 MemCheckBlock);
2159
2160 DT->addNewBlock(MemCheckBlock, Pred);
2161 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2162 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2163
2164 if (OuterLoop)
2165 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2166
2167 BranchInst &BI =
2168 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2169 if (AddBranchWeights) {
2170 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2171 }
2172 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2173 MemCheckBlock->getTerminator()->setDebugLoc(
2174 Pred->getTerminator()->getDebugLoc());
2175
2176 // Mark the check as used, to prevent it from being removed during cleanup.
2177 MemRuntimeCheckCond = nullptr;
2178 return MemCheckBlock;
2179 }
2180};
2181} // namespace
2182
2184 return Style == TailFoldingStyle::Data ||
2185 Style == TailFoldingStyle::DataAndControlFlow ||
2186 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2187}
2188
2190 return Style == TailFoldingStyle::DataAndControlFlow ||
2191 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2192}
2193
2194// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2195// vectorization. The loop needs to be annotated with #pragma omp simd
2196// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2197// vector length information is not provided, vectorization is not considered
2198// explicit. Interleave hints are not allowed either. These limitations will be
2199// relaxed in the future.
2200// Please note that we are currently forced to abuse the pragma 'clang
2201// vectorize' semantics. This pragma provides *auto-vectorization hints*
2202// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2203// provides *explicit vectorization hints* (LV can bypass legal checks and
2204// assume that vectorization is legal). However, both hints are implemented
2205// using the same metadata (llvm.loop.vectorize, processed by
2206// LoopVectorizeHints). This will be fixed in the future when the native IR
2207// representation for pragma 'omp simd' is introduced.
2208static bool isExplicitVecOuterLoop(Loop *OuterLp,
2210 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2211 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2212
2213 // Only outer loops with an explicit vectorization hint are supported.
2214 // Unannotated outer loops are ignored.
2216 return false;
2217
2218 Function *Fn = OuterLp->getHeader()->getParent();
2219 if (!Hints.allowVectorization(Fn, OuterLp,
2220 true /*VectorizeOnlyWhenForced*/)) {
2221 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2222 return false;
2223 }
2224
2225 if (Hints.getInterleave() > 1) {
2226 // TODO: Interleave support is future work.
2227 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2228 "outer loops.\n");
2229 Hints.emitRemarkWithHints();
2230 return false;
2231 }
2232
2233 return true;
2234}
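// E.g., a sketch of an outer loop that the checks above accept for the
// VPlan-native path:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
// An interleave hint greater than 1 on the same loop would make the function
// return false.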
2235
2239 // Collect inner loops and outer loops without irreducible control flow. For
2240 // now, only collect outer loops that have explicit vectorization hints. If we
2241 // are stress testing the VPlan H-CFG construction, we collect the outermost
2242 // loop of every loop nest.
2243 if (L.isInnermost() || VPlanBuildStressTest ||
2245 LoopBlocksRPO RPOT(&L);
2246 RPOT.perform(LI);
2247 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2248 V.push_back(&L);
2249 // TODO: Collect inner loops inside marked outer loops in case
2250 // vectorization fails for the outer loop. Do not invoke
2251 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2252 // already known to be reducible. We can use an inherited attribute for
2253 // that.
2254 return;
2255 }
2256 }
2257 for (Loop *InnerL : L)
2258 collectSupportedLoops(*InnerL, LI, ORE, V);
2259}
2260
2261//===----------------------------------------------------------------------===//
2262// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2263// LoopVectorizationCostModel and LoopVectorizationPlanner.
2264//===----------------------------------------------------------------------===//
2265
2266/// Compute the transformed value of Index at offset StartValue using step
2267/// StepValue.
2268/// For integer induction, returns StartValue + Index * StepValue.
2269/// For pointer induction, returns StartValue[Index * StepValue].
2270/// FIXME: The newly created binary instructions should contain nsw/nuw
2271/// flags, which can be found from the original scalar operations.
2272static Value *
2274 Value *Step,
2276 const BinaryOperator *InductionBinOp) {
2277 Type *StepTy = Step->getType();
2278 Value *CastedIndex = StepTy->isIntegerTy()
2279 ? B.CreateSExtOrTrunc(Index, StepTy)
2280 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2281 if (CastedIndex != Index) {
2282 CastedIndex->setName(CastedIndex->getName() + ".cast");
2283 Index = CastedIndex;
2284 }
2285
2286 // Note: the IR at this point is broken. We cannot use SE to create any new
2287 // SCEV and then expand it, hoping that SCEV's simplification will give us
2288  // more optimal code. Unfortunately, attempting to do so on invalid IR may
2289  // lead to various SCEV crashes. So all we can do is use the builder and rely
2290 // on InstCombine for future simplifications. Here we handle some trivial
2291 // cases only.
2292 auto CreateAdd = [&B](Value *X, Value *Y) {
2293 assert(X->getType() == Y->getType() && "Types don't match!");
2294 if (auto *CX = dyn_cast<ConstantInt>(X))
2295 if (CX->isZero())
2296 return Y;
2297 if (auto *CY = dyn_cast<ConstantInt>(Y))
2298 if (CY->isZero())
2299 return X;
2300 return B.CreateAdd(X, Y);
2301 };
2302
2303 // We allow X to be a vector type, in which case Y will potentially be
2304 // splatted into a vector with the same element count.
2305 auto CreateMul = [&B](Value *X, Value *Y) {
2306 assert(X->getType()->getScalarType() == Y->getType() &&
2307 "Types don't match!");
2308 if (auto *CX = dyn_cast<ConstantInt>(X))
2309 if (CX->isOne())
2310 return Y;
2311 if (auto *CY = dyn_cast<ConstantInt>(Y))
2312 if (CY->isOne())
2313 return X;
2314 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2315 if (XVTy && !isa<VectorType>(Y->getType()))
2316 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2317 return B.CreateMul(X, Y);
2318 };
2319
2320 switch (InductionKind) {
2322 assert(!isa<VectorType>(Index->getType()) &&
2323 "Vector indices not supported for integer inductions yet");
2324 assert(Index->getType() == StartValue->getType() &&
2325 "Index type does not match StartValue type");
2326 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2327 return B.CreateSub(StartValue, Index);
2328 auto *Offset = CreateMul(Index, Step);
2329 return CreateAdd(StartValue, Offset);
2330 }
2332 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2334 assert(!isa<VectorType>(Index->getType()) &&
2335 "Vector indices not supported for FP inductions yet");
2336 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2337 assert(InductionBinOp &&
2338 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2339 InductionBinOp->getOpcode() == Instruction::FSub) &&
2340 "Original bin op should be defined for FP induction");
2341
2342 Value *MulExp = B.CreateFMul(Step, Index);
2343 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2344 "induction");
2345 }
2347 return nullptr;
2348 }
2349 llvm_unreachable("invalid enum");
2350}
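// E.g., assuming an integer induction with StartValue = 8, Step = 2 and
// Index = %n.vec, the helper above produces 8 + %n.vec * 2; for a pointer
// induction with the same step it instead produces the address
// StartValue[%n.vec * 2] via a ptradd.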
2351
2352std::optional<unsigned> getMaxVScale(const Function &F,
2353 const TargetTransformInfo &TTI) {
2354 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2355 return MaxVScale;
2356
2357 if (F.hasFnAttribute(Attribute::VScaleRange))
2358 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2359
2360 return std::nullopt;
2361}
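// E.g., for a target that does not report TTI.getMaxVScale() but whose
// function carries the vscale_range(1,16) attribute, the helper above returns
// 16; with neither source of information it returns std::nullopt.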
2362
2363/// For the given VF and UF and maximum trip count computed for the loop, return
2364/// whether the induction variable might overflow in the vectorized loop. If not,
2365/// then we know a runtime overflow check always evaluates to false and can be
2366/// removed.
2369 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2370 // Always be conservative if we don't know the exact unroll factor.
2371 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2372
2373 Type *IdxTy = Cost->Legal->getWidestInductionType();
2374 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2375
2376  // The runtime overflow check is known to be false iff the (max) trip-count
2377 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2378 // the vector loop induction variable.
2379 if (unsigned TC =
2380 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2381 uint64_t MaxVF = VF.getKnownMinValue();
2382 if (VF.isScalable()) {
2383 std::optional<unsigned> MaxVScale =
2384 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2385 if (!MaxVScale)
2386 return false;
2387 MaxVF *= *MaxVScale;
2388 }
2389
2390 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2391 }
2392
2393 return false;
2394}
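// E.g., assuming an i8 widest induction type (MaxUIntTripCount = 255), a known
// max trip count of 200, fixed VF = 8 and UF = 2: 255 - 200 = 55 > 8 * 2 = 16,
// so the overflow check is known false and can be removed. With a max trip
// count of 250 the function returns false, since 250 + 16 would wrap an i8
// induction variable.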
2395
2396// Return whether we allow using masked interleave-groups (for dealing with
2397// strided loads/stores that reside in predicated blocks, or for dealing
2398// with gaps).
2400 // If an override option has been passed in for interleaved accesses, use it.
2401 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2403
2405}
2406
2407// Try to vectorize the interleave group that \p Instr belongs to.
2408//
2409// E.g. Translate following interleaved load group (factor = 3):
2410// for (i = 0; i < N; i+=3) {
2411// R = Pic[i]; // Member of index 0
2412// G = Pic[i+1]; // Member of index 1
2413// B = Pic[i+2]; // Member of index 2
2414// ... // do something to R, G, B
2415// }
2416// To:
2417// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2418// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2419// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2420// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2421//
2422// Or translate following interleaved store group (factor = 3):
2423// for (i = 0; i < N; i+=3) {
2424// ... do something to R, G, B
2425// Pic[i] = R; // Member of index 0
2426// Pic[i+1] = G; // Member of index 1
2427// Pic[i+2] = B; // Member of index 2
2428// }
2429// To:
2430// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2431// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2432// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2433// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2434// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2437 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2438 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2439 Instruction *Instr = Group->getInsertPos();
2440 const DataLayout &DL = Instr->getDataLayout();
2441
2442 // Prepare for the vector type of the interleaved load/store.
2443 Type *ScalarTy = getLoadStoreType(Instr);
2444 unsigned InterleaveFactor = Group->getFactor();
2445 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2446
2447 // Prepare for the new pointers.
2448 SmallVector<Value *, 2> AddrParts;
2449 unsigned Index = Group->getIndex(Instr);
2450
2451 // TODO: extend the masked interleaved-group support to reversed access.
2452 assert((!BlockInMask || !Group->isReverse()) &&
2453 "Reversed masked interleave-group not supported.");
2454
2455 Value *Idx;
2456 // If the group is reverse, adjust the index to refer to the last vector lane
2457 // instead of the first. We adjust the index from the first vector lane,
2458 // rather than directly getting the pointer for lane VF - 1, because the
2459 // pointer operand of the interleaved access is supposed to be uniform. For
2460 // uniform instructions, we're only required to generate a value for the
2461 // first vector lane in each unroll iteration.
2462 if (Group->isReverse()) {
2463 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2464 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2468 } else
2470
2471 for (unsigned Part = 0; Part < State.UF; Part++) {
2472 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2473 if (auto *I = dyn_cast<Instruction>(AddrPart))
2474 State.setDebugLocFrom(I->getDebugLoc());
2475
2476    // Note that the current instruction could be any member of the group. We
2477    // need to adjust the address to that of the member at index 0.
2478 //
2479 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2480 // b = A[i]; // Member of index 0
2481 // Current pointer is pointed to A[i+1], adjust it to A[i].
2482 //
2483 // E.g. A[i+1] = a; // Member of index 1
2484 // A[i] = b; // Member of index 0
2485 // A[i+2] = c; // Member of index 2 (Current instruction)
2486 // Current pointer is pointed to A[i+2], adjust it to A[i].
2487
2488 bool InBounds = false;
2489 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2490 InBounds = gep->isInBounds();
2491 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2492 AddrParts.push_back(AddrPart);
2493 }
2494
2495 State.setDebugLocFrom(Instr->getDebugLoc());
2496 Value *PoisonVec = PoisonValue::get(VecTy);
2497
2498 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2499 unsigned Part, Value *MaskForGaps) -> Value * {
2500 if (VF.isScalable()) {
2501 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2502 assert(InterleaveFactor == 2 &&
2503 "Unsupported deinterleave factor for scalable vectors");
2504 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2505 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2506 auto *MaskTy =
2508 return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
2509 /*FMFSource=*/nullptr, "interleaved.mask");
2510 }
2511
2512 if (!BlockInMask)
2513 return MaskForGaps;
2514
2515 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2516 Value *ShuffledMask = Builder.CreateShuffleVector(
2517 BlockInMaskPart,
2518 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2519 "interleaved.mask");
2520 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2521 MaskForGaps)
2522 : ShuffledMask;
2523 };
2524
2525 // Vectorize the interleaved load group.
2526 if (isa<LoadInst>(Instr)) {
2527 Value *MaskForGaps = nullptr;
2528 if (NeedsMaskForGaps) {
2529 MaskForGaps =
2531 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2532 }
2533
2534 // For each unroll part, create a wide load for the group.
2535 SmallVector<Value *, 2> NewLoads;
2536 for (unsigned Part = 0; Part < State.UF; Part++) {
2537 Instruction *NewLoad;
2538 if (BlockInMask || MaskForGaps) {
2540 "masked interleaved groups are not allowed.");
2541 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2542 NewLoad =
2543 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2544 GroupMask, PoisonVec, "wide.masked.vec");
2545 }
2546 else
2547 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2548 Group->getAlign(), "wide.vec");
2549 Group->addMetadata(NewLoad);
2550 NewLoads.push_back(NewLoad);
2551 }
2552
2553 if (VecTy->isScalableTy()) {
2554 assert(InterleaveFactor == 2 &&
2555 "Unsupported deinterleave factor for scalable vectors");
2556
2557 for (unsigned Part = 0; Part < State.UF; ++Part) {
2558 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2559 // so must use intrinsics to deinterleave.
2561 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2562 /*FMFSource=*/nullptr, "strided.vec");
2563 unsigned J = 0;
2564 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2565 Instruction *Member = Group->getMember(I);
2566
2567 if (!Member)
2568 continue;
2569
2570 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2571          // If this member has a different type, cast the result type.
2572 if (Member->getType() != ScalarTy) {
2573 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2574 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2575 }
2576
2577 if (Group->isReverse())
2578 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2579
2580 State.set(VPDefs[J], StridedVec, Part);
2581 ++J;
2582 }
2583 }
2584
2585 return;
2586 }
2587
2588 // For each member in the group, shuffle out the appropriate data from the
2589 // wide loads.
2590 unsigned J = 0;
2591 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2592 Instruction *Member = Group->getMember(I);
2593
2594 // Skip the gaps in the group.
2595 if (!Member)
2596 continue;
2597
2598 auto StrideMask =
2599 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2600 for (unsigned Part = 0; Part < State.UF; Part++) {
2601 Value *StridedVec = Builder.CreateShuffleVector(
2602 NewLoads[Part], StrideMask, "strided.vec");
2603
2604        // If this member has a different type, cast the result type.
2605 if (Member->getType() != ScalarTy) {
2606 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2607 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2608 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2609 }
2610
2611 if (Group->isReverse())
2612 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2613
2614 State.set(VPDefs[J], StridedVec, Part);
2615 }
2616 ++J;
2617 }
2618 return;
2619 }
2620
2621  // The sub-vector type for the current instruction.
2622 auto *SubVT = VectorType::get(ScalarTy, VF);
2623
2624 // Vectorize the interleaved store group.
2625 Value *MaskForGaps =
2627 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2628 "masked interleaved groups are not allowed.");
2629 assert((!MaskForGaps || !VF.isScalable()) &&
2630 "masking gaps for scalable vectors is not yet supported.");
2631 for (unsigned Part = 0; Part < State.UF; Part++) {
2632 // Collect the stored vector from each member.
2633 SmallVector<Value *, 4> StoredVecs;
2634 unsigned StoredIdx = 0;
2635 for (unsigned i = 0; i < InterleaveFactor; i++) {
2636 assert((Group->getMember(i) || MaskForGaps) &&
2637 "Fail to get a member from an interleaved store group");
2638 Instruction *Member = Group->getMember(i);
2639
2640 // Skip the gaps in the group.
2641 if (!Member) {
2642 Value *Undef = PoisonValue::get(SubVT);
2643 StoredVecs.push_back(Undef);
2644 continue;
2645 }
2646
2647 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2648 ++StoredIdx;
2649
2650 if (Group->isReverse())
2651 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2652
2653      // If this member has a different type, cast it to a unified type.
2654
2655 if (StoredVec->getType() != SubVT)
2656 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2657
2658 StoredVecs.push_back(StoredVec);
2659 }
2660
2661 // Interleave all the smaller vectors into one wider vector.
2662 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2663 Instruction *NewStoreInstr;
2664 if (BlockInMask || MaskForGaps) {
2665 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2666 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2667 Group->getAlign(), GroupMask);
2668 } else
2669 NewStoreInstr =
2670 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2671
2672 Group->addMetadata(NewStoreInstr);
2673 }
2674}
2675
2677 VPReplicateRecipe *RepRecipe,
2678 const VPIteration &Instance,
2679 VPTransformState &State) {
2680 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2681
2682 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2683 // the first lane and part.
2684 if (isa<NoAliasScopeDeclInst>(Instr))
2685 if (!Instance.isFirstIteration())
2686 return;
2687
2689  // Does this instruction return a value?
2689 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2690
2691 Instruction *Cloned = Instr->clone();
2692 if (!IsVoidRetTy) {
2693 Cloned->setName(Instr->getName() + ".cloned");
2694#if !defined(NDEBUG)
2695 // Verify that VPlan type inference results agree with the type of the
2696 // generated values.
2697 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2698 "inferred type and type from generated instructions do not match");
2699#endif
2700 }
2701
2702 RepRecipe->setFlags(Cloned);
2703
2704 if (auto DL = Instr->getDebugLoc())
2705 State.setDebugLocFrom(DL);
2706
2707 // Replace the operands of the cloned instructions with their scalar
2708 // equivalents in the new loop.
2709 for (const auto &I : enumerate(RepRecipe->operands())) {
2710 auto InputInstance = Instance;
2711 VPValue *Operand = I.value();
2713 InputInstance.Lane = VPLane::getFirstLane();
2714 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2715 }
2716 State.addNewMetadata(Cloned, Instr);
2717
2718 // Place the cloned scalar in the new loop.
2719 State.Builder.Insert(Cloned);
2720
2721 State.set(RepRecipe, Cloned, Instance);
2722
2723  // If we just cloned a new assumption, add it to the assumption cache.
2724 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2726
2727 // End if-block.
2728 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2729 if (IfPredicateInstr)
2730 PredicatedInstructions.push_back(Cloned);
2731}
2732
2733Value *
2735 if (VectorTripCount)
2736 return VectorTripCount;
2737
2738 Value *TC = getTripCount();
2739 IRBuilder<> Builder(InsertBlock->getTerminator());
2740
2741 Type *Ty = TC->getType();
2742 // This is where we can make the step a runtime constant.
2743 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2744
2745 // If the tail is to be folded by masking, round the number of iterations N
2746 // up to a multiple of Step instead of rounding down. This is done by first
2747 // adding Step-1 and then rounding down. Note that it's ok if this addition
2748 // overflows: the vector induction variable will eventually wrap to zero given
2749 // that it starts at zero and its Step is a power of two; the loop will then
2750 // exit, with the last early-exit vector comparison also producing all-true.
2751 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2752 // is accounted for in emitIterationCountCheck that adds an overflow check.
2753 if (Cost->foldTailByMasking()) {
2755 "VF*UF must be a power of 2 when folding tail by masking");
2756 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2757 "n.rnd.up");
2758 }
2759
2760 // Now we need to generate the expression for the part of the loop that the
2761 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2762 // iterations are not required for correctness, or N - Step, otherwise. Step
2763 // is equal to the vectorization factor (number of SIMD elements) times the
2764 // unroll factor (number of SIMD instructions).
2765 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2766
2767 // There are cases where we *must* run at least one iteration in the remainder
2768 // loop. See the cost model for when this can happen. If the step evenly
2769 // divides the trip count, we set the remainder to be equal to the step. If
2770 // the step does not evenly divide the trip count, no adjustment is necessary
2771 // since there will already be scalar iterations. Note that the minimum
2772 // iterations check ensures that N >= Step.
2773 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2774 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2775 R = Builder.CreateSelect(IsZero, Step, R);
2776 }
2777
2778 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2779
2780 return VectorTripCount;
2781}
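// E.g., assuming TC = 19 and Step = VF * UF = 8: without tail folding,
// n.mod.vf = 3 and n.vec = 16, leaving 3 scalar iterations. With tail folding,
// TC is first rounded up to 24, so n.vec = 24 and no scalar iterations remain.
// If a scalar epilogue is required and TC = 24, the zero remainder is bumped
// to Step, giving n.vec = 16 and 8 epilogue iterations.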
2782
2784 const DataLayout &DL) {
2785  // Verify that V is a vector type with the same number of elements as DstVTy.
2786 auto *DstFVTy = cast<VectorType>(DstVTy);
2787 auto VF = DstFVTy->getElementCount();
2788 auto *SrcVecTy = cast<VectorType>(V->getType());
2789 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2790 Type *SrcElemTy = SrcVecTy->getElementType();
2791 Type *DstElemTy = DstFVTy->getElementType();
2792 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2793 "Vector elements must have same size");
2794
2795 // Do a direct cast if element types are castable.
2796 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2797 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2798 }
2799  // V cannot be directly cast to the desired vector type.
2800  // This may happen when V is a floating-point vector but DstVTy is a vector of
2801  // pointers, or vice-versa. Handle this with a two-step bitcast through an
2802  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2803 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2804 "Only one type should be a pointer type");
2805 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2806 "Only one type should be a floating point type");
2807 Type *IntTy =
2808 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2809 auto *VecIntTy = VectorType::get(IntTy, VF);
2810 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2811 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2812}
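// E.g., on a target with 64-bit pointers, casting <4 x double> to <4 x ptr>
// takes the two-step path above:
//   %int = bitcast <4 x double> %v to <4 x i64>
//   %ptr = inttoptr <4 x i64> %int to <4 x ptr>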
2813
2815 Value *Count = getTripCount();
2816 // Reuse existing vector loop preheader for TC checks.
2817 // Note that new preheader block is generated for vector loop.
2818 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2819 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2820
2821 // Generate code to check if the loop's trip count is less than VF * UF, or
2822 // equal to it in case a scalar epilogue is required; this implies that the
2823 // vector trip count is zero. This check also covers the case where adding one
2824 // to the backedge-taken count overflowed leading to an incorrect trip count
2825 // of zero. In this case we will also jump to the scalar loop.
2826 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2828
2829 // If tail is to be folded, vector loop takes care of all iterations.
2830 Type *CountTy = Count->getType();
2831 Value *CheckMinIters = Builder.getFalse();
2832 auto CreateStep = [&]() -> Value * {
2833    // Create the step as the maximum of the minimum profitable trip count and UF * VF.
2835 return createStepForVF(Builder, CountTy, VF, UF);
2836
2837 Value *MinProfTC =
2839 if (!VF.isScalable())
2840 return MinProfTC;
2842 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2843 };
2844
2845 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2846 if (Style == TailFoldingStyle::None)
2847 CheckMinIters =
2848 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2849 else if (VF.isScalable() &&
2852 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2853 // an overflow to zero when updating induction variables and so an
2854 // additional overflow check is required before entering the vector loop.
2855
2856 // Get the maximum unsigned value for the type.
2857 Value *MaxUIntTripCount =
2858 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2859 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2860
2861 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2862 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2863 }
2864
2865 // Create new preheader for vector loop.
2867 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2868 "vector.ph");
2869
2870 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2871 DT->getNode(Bypass)->getIDom()) &&
2872 "TC check is expected to dominate Bypass");
2873
2874 // Update dominator for Bypass & LoopExit (if needed).
2875 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2876 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2877 // If there is an epilogue which must run, there's no edge from the
2878 // middle block to exit blocks and thus no need to update the immediate
2879 // dominator of the exit blocks.
2881
2882 BranchInst &BI =
2883 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2885 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2886 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2887 LoopBypassBlocks.push_back(TCCheckBlock);
2888}
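// E.g., assuming an i32 trip count %n, VF = vscale x 4 and UF = 2, the
// scalable path above guards against wrapping by checking
// (0xFFFFFFFF - %n) u< vscale * 8 (possibly raised to the minimum profitable
// trip count) before entering the vector loop.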
2889
2891 BasicBlock *const SCEVCheckBlock =
2892 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2893 if (!SCEVCheckBlock)
2894 return nullptr;
2895
2896 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2898 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2899 "Cannot SCEV check stride or overflow when optimizing for size");
2900
2901
2902 // Update dominator only if this is first RT check.
2903 if (LoopBypassBlocks.empty()) {
2904 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2905 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2906 // If there is an epilogue which must run, there's no edge from the
2907 // middle block to exit blocks and thus no need to update the immediate
2908 // dominator of the exit blocks.
2909 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2910 }
2911
2912 LoopBypassBlocks.push_back(SCEVCheckBlock);
2913 AddedSafetyChecks = true;
2914 return SCEVCheckBlock;
2915}
2916
2918 // VPlan-native path does not do any analysis for runtime checks currently.
2920 return nullptr;
2921
2922 BasicBlock *const MemCheckBlock =
2923 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2924
2925  // Check if we generated code that checks at runtime whether arrays overlap.
2926  // We put the checks into a separate block to make the more common case of
2927  // few elements faster.
2928 if (!MemCheckBlock)
2929 return nullptr;
2930
2931 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2932 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2933 "Cannot emit memory checks when optimizing for size, unless forced "
2934 "to vectorize.");
2935 ORE->emit([&]() {
2936 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2939 << "Code-size may be reduced by not forcing "
2940 "vectorization, or by source-code modifications "
2941 "eliminating the need for runtime checks "
2942 "(e.g., adding 'restrict').";
2943 });
2944 }
2945
2946 LoopBypassBlocks.push_back(MemCheckBlock);
2947
2948 AddedSafetyChecks = true;
2949
2950 return MemCheckBlock;
2951}
2952
2956 assert(LoopVectorPreHeader && "Invalid loop structure");
2957 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2958 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2959 "multiple exit loop without required epilogue?");
2960
2963 LI, nullptr, Twine(Prefix) + "middle.block");
2966 nullptr, Twine(Prefix) + "scalar.ph");
2967
2968 // Set up the middle block terminator. Two cases:
2969 // 1) If we know that we must execute the scalar epilogue, retain the existing
2970 // unconditional branch from the middle block to the scalar preheader. In that
2971 // case, there's no edge from the middle block to exit blocks and thus no
2972 // need to update the immediate dominator of the exit blocks.
2973 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2974 assert(
2976 " middle block should have the scalar preheader as single successor");
2977 return;
2978 }
2979
2980 // 2) Otherwise, we must have a single unique exit block (due to how we
2981 // implement the multiple exit case). In this case, set up a conditional
2982 // branch from the middle block to the loop scalar preheader, and the
2983 // exit block. completeLoopSkeleton will update the condition to use an
2984 // iteration check, if required to decide whether to execute the remainder.
2985 BranchInst *BrInst =
2987 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2988 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2990
2991 // Update dominator for loop exit. During skeleton creation, only the vector
2992 // pre-header and the middle block are created. The vector loop is entirely
2993  // created during VPlan execution.
2995}
2996
2998 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2999 ArrayRef<BasicBlock *> BypassBlocks,
3000 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3002 assert(VectorTripCount && "Expected valid arguments");
3003
3004 Instruction *OldInduction = Legal->getPrimaryInduction();
3005 Value *&EndValue = IVEndValues[OrigPhi];
3006 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3007 if (OrigPhi == OldInduction) {
3008 // We know what the end value is.
3009 EndValue = VectorTripCount;
3010 } else {
3012
3013 // Fast-math-flags propagate from the original induction instruction.
3014 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3015 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3016
3017 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3018 Step, II.getKind(), II.getInductionBinOp());
3019 EndValue->setName("ind.end");
3020
3021 // Compute the end value for the additional bypass (if applicable).
3022 if (AdditionalBypass.first) {
3023 B.SetInsertPoint(AdditionalBypass.first,
3024 AdditionalBypass.first->getFirstInsertionPt());
3025 EndValueFromAdditionalBypass =
3026 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3027 Step, II.getKind(), II.getInductionBinOp());
3028 EndValueFromAdditionalBypass->setName("ind.end");
3029 }
3030 }
3031
3032 // Create phi nodes to merge from the backedge-taken check block.
3033 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3035 // Copy original phi DL over to the new one.
3036 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3037
3038 // The new PHI merges the original incoming value, in case of a bypass,
3039 // or the value at the end of the vectorized loop.
3040 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3041
3042 // Fix the scalar body counter (PHI node).
3043 // The old induction's phi node in the scalar body needs the truncated
3044 // value.
3045 for (BasicBlock *BB : BypassBlocks)
3046 BCResumeVal->addIncoming(II.getStartValue(), BB);
3047
3048 if (AdditionalBypass.first)
3049 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3050 EndValueFromAdditionalBypass);
3051 return BCResumeVal;
3052}
3053
3054/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3055/// expansion results.
3057 const SCEV2ValueTy &ExpandedSCEVs) {
3058 const SCEV *Step = ID.getStep();
3059 if (auto *C = dyn_cast<SCEVConstant>(Step))
3060 return C->getValue();
3061 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3062 return U->getValue();
3063 auto I = ExpandedSCEVs.find(Step);
3064 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3065 return I->second;
3066}
3067
3069 const SCEV2ValueTy &ExpandedSCEVs,
3070 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3071 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3072 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3073 "Inconsistent information about additional bypass.");
3074 // We are going to resume the execution of the scalar loop.
3075 // Go over all of the induction variables that we found and fix the
3076 // PHIs that are left in the scalar version of the loop.
3077 // The starting values of PHI nodes depend on the counter of the last
3078 // iteration in the vectorized loop.
3079 // If we come from a bypass edge then we need to start from the original
3080 // start value.
3081 for (const auto &InductionEntry : Legal->getInductionVars()) {
3082 PHINode *OrigPhi = InductionEntry.first;
3083 const InductionDescriptor &II = InductionEntry.second;
3084 PHINode *BCResumeVal = createInductionResumeValue(
3085 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3086 AdditionalBypass);
3087 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3088 }
3089}
3090
3092 // The trip counts should be cached by now.
3093 Value *Count = getTripCount();
3095
3096 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3097
3098 // Add a check in the middle block to see if we have completed
3099 // all of the iterations in the first vector loop. Three cases:
3100 // 1) If we require a scalar epilogue, there is no conditional branch as
3101 // we unconditionally branch to the scalar preheader. Do nothing.
3102 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3103 // Thus if tail is to be folded, we know we don't need to run the
3104 // remainder and we can use the previous value for the condition (true).
3105 // 3) Otherwise, construct a runtime check.
3106 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3107 !Cost->foldTailByMasking()) {
3108 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3109 // of the corresponding compare because they may have ended up with
3110 // different line numbers and we want to avoid awkward line stepping while
3111 // debugging. E.g., if the compare has a line number inside the loop.
3112 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3113 // operands. Perform simplification directly on VPlan once the branch is
3114 // modeled there.
3116 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3117 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3118 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3119 BI.setCondition(CmpN);
3120 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3121 // Assume that `Count % VectorTripCount` is equally distributed.
3122 unsigned TripCount = UF * VF.getKnownMinValue();
3123 assert(TripCount > 0 && "trip count should not be zero");
3124 const uint32_t Weights[] = {1, TripCount - 1};
3125 setBranchWeights(BI, Weights, /*IsExpected=*/false);
3126 }
3127 }
3128
3129#ifdef EXPENSIVE_CHECKS
3130 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3131#endif
3132
3133 return LoopVectorPreHeader;
3134}
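// Editorial worked example for the branch weights above (hypothetical
// factors): with UF = 2 and VF = 4, TripCount = UF * VF.getKnownMinValue() = 8,
// so the code sets
//
//   const uint32_t Weights[] = {1, 8 - 1};   // i.e. {1, 7}
//
// Under the "Count % VectorTripCount is equally distributed" assumption, on
// average only 1 in 8 trip counts needs no scalar remainder at all.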
3135
3136std::pair<BasicBlock *, Value *>
3138 const SCEV2ValueTy &ExpandedSCEVs) {
3139 /*
3140 In this function we generate a new loop. The new loop will contain
3141 the vectorized instructions while the old loop will continue to run the
3142 scalar remainder.
3143
3144 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3145 / | preheader are expanded here. Eventually all required SCEV
3146 / | expansion should happen here.
3147 / v
3148 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3149 | / |
3150 | / v
3151 || [ ] <-- vector pre header.
3152 |/ |
3153 | v
3154 | [ ] \
3155 | [ ]_| <-- vector loop (created during VPlan execution).
3156 | |
3157 | v
3158 \ -[ ] <--- middle-block.
3159 \/ |
3160 /\ v
3161 | ->[ ] <--- new preheader.
3162 | |
3163 (opt) v <-- edge from middle to exit iff epilogue is not required.
3164 | [ ] \
3165 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3166 \ |
3167 \ v
3168 >[ ] <-- exit block(s).
3169 ...
3170 */
3171
3172 // Create an empty vector loop, and prepare basic blocks for the runtime
3173 // checks.
3175
3176 // Now, compare the new count to zero. If it is zero skip the vector loop and
3177 // jump to the scalar loop. This check also covers the case where the
3178 // backedge-taken count is uint##_max: adding one to it will overflow leading
3179 // to an incorrect trip count of zero. In this (rare) case we will also jump
3180 // to the scalar loop.
3182
3183 // Generate the code to check any assumptions that we've made for SCEV
3184 // expressions.
3186
3187 // Generate the code that checks in runtime if arrays overlap. We put the
3188 // checks into a separate block to make the more common case of few elements
3189 // faster.
3191
3192 // Emit phis for the new starting index of the scalar loop.
3193 createInductionResumeValues(ExpandedSCEVs);
3194
3195 return {completeLoopSkeleton(), nullptr};
3196}
3197
3198// Fix up external users of the induction variable. At this point, we are
3199// in LCSSA form, with all external PHIs that use the IV having one input value,
3200// coming from the remainder loop. We need those PHIs to also have a correct
3201// value for the IV when arriving directly from the middle block.
3203 const InductionDescriptor &II,
3204 Value *VectorTripCount, Value *EndValue,
3205 BasicBlock *MiddleBlock,
3206 BasicBlock *VectorHeader, VPlan &Plan,
3207 VPTransformState &State) {
3208 // There are two kinds of external IV usages - those that use the value
3209 // computed in the last iteration (the PHI) and those that use the penultimate
3210 // value (the value that feeds into the phi from the loop latch).
3211 // We allow both, but they obviously have different values.
3212
3213 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3214
3215 DenseMap<Value *, Value *> MissingVals;
3216
3217 // An external user of the last iteration's value should see the value that
3218 // the remainder loop uses to initialize its own IV.
3220 for (User *U : PostInc->users()) {
3221 Instruction *UI = cast<Instruction>(U);
3222 if (!OrigLoop->contains(UI)) {
3223 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3224 MissingVals[UI] = EndValue;
3225 }
3226 }
3227
3228 // An external user of the penultimate value needs to see EndValue - Step.
3229 // The simplest way to get this is to recompute it from the constituent SCEVs,
3230 // that is Start + (Step * (CRD - 1)).
3231 for (User *U : OrigPhi->users()) {
3232 auto *UI = cast<Instruction>(U);
3233 if (!OrigLoop->contains(UI)) {
3234 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3235 IRBuilder<> B(MiddleBlock->getTerminator());
3236
3237 // Fast-math-flags propagate from the original induction instruction.
3238 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3239 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3240
3241 Value *CountMinusOne = B.CreateSub(
3242 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3243 CountMinusOne->setName("cmo");
3244
3245 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3246 assert(StepVPV && "step must have been expanded during VPlan execution");
3247 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3248 : State.get(StepVPV, {0, 0});
3249 Value *Escape =
3250 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3251 II.getKind(), II.getInductionBinOp());
3252 Escape->setName("ind.escape");
3253 MissingVals[UI] = Escape;
3254 }
3255 }
3256
3257 for (auto &I : MissingVals) {
3258 PHINode *PHI = cast<PHINode>(I.first);
3259 // One corner case we have to handle is two IVs "chasing" each other,
3260 // that is %IV2 = phi [...], [ %IV1, %latch ]
3261 // In this case, if IV1 has an external use, we need to avoid adding both
3262 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3263 // don't already have an incoming value for the middle block.
3264 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3265 PHI->addIncoming(I.second, MiddleBlock);
3266 Plan.removeLiveOut(PHI);
3267 }
3268 }
3269}
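// Editorial worked example (hypothetical induction): for an integer IV with
// StartValue = 0 and Step = 2, and a VectorTripCount of 8, an external user of
// the last value sees EndValue = 0 + 2 * 8 = 16, while an external user of the
// penultimate value sees
//
//   CountMinusOne = VectorTripCount - 1 = 7
//   ind.escape    = Start + Step * CountMinusOne = 0 + 2 * 7 = 14
//
// which is exactly the Start + (Step * (CRD - 1)) recomputation described
// above.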
3270
3271namespace {
3272
3273struct CSEDenseMapInfo {
3274 static bool canHandle(const Instruction *I) {
3275 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3276 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3277 }
3278
3279 static inline Instruction *getEmptyKey() {
3281 }
3282
3283 static inline Instruction *getTombstoneKey() {
3285 }
3286
3287 static unsigned getHashValue(const Instruction *I) {
3288 assert(canHandle(I) && "Unknown instruction!");
3289 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3290 I->value_op_end()));
3291 }
3292
3293 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3294 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3295 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3296 return LHS == RHS;
3297 return LHS->isIdenticalTo(RHS);
3298 }
3299};
3300
3301} // end anonymous namespace
3302
3303 /// Perform CSE of induction variable instructions.
3304static void cse(BasicBlock *BB) {
3305 // Perform simple cse.
3307 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3308 if (!CSEDenseMapInfo::canHandle(&In))
3309 continue;
3310
3311 // Check if we can replace this instruction with any of the
3312 // visited instructions.
3313 if (Instruction *V = CSEMap.lookup(&In)) {
3314 In.replaceAllUsesWith(V);
3315 In.eraseFromParent();
3316 continue;
3317 }
3318
3319 CSEMap[&In] = &In;
3320 }
3321}
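// Editorial sketch (hypothetical values): widening often emits structurally
// identical address computations in the vector header, e.g. two GEPs built as
//
//   Value *GepA = B.CreateGEP(Int32Ty, Base, Index);
//   Value *GepB = B.CreateGEP(Int32Ty, Base, Index); // identical to GepA
//
// cse() rewrites all uses of the second instruction to the first and erases
// it; only insert/extract-element, shufflevector and getelementptr
// instructions are considered, per CSEDenseMapInfo::canHandle().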
3322
3325 ElementCount VF) const {
3326 // We only need to calculate a cost if the VF is scalar; for actual vectors
3327 // we should already have a pre-calculated cost at each VF.
3328 if (!VF.isScalar())
3329 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3330
3332 Type *RetTy = CI->getType();
3334 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3335 return *RedCost;
3336
3338 for (auto &ArgOp : CI->args())
3339 Tys.push_back(ArgOp->getType());
3340
3341 InstructionCost ScalarCallCost =
3343
3344 // If this is an intrinsic we may have a lower cost for it.
3346 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3347 return std::min(ScalarCallCost, IntrinsicCost);
3348 }
3349 return ScalarCallCost;
3350}
3351
3353 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3354 return Elt;
3355 return VectorType::get(Elt, VF);
3356}
3357
3360 ElementCount VF) const {
3362 assert(ID && "Expected intrinsic call!");
3363 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3364 FastMathFlags FMF;
3365 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3366 FMF = FPMO->getFastMathFlags();
3367
3370 SmallVector<Type *> ParamTys;
3371 std::transform(FTy->param_begin(), FTy->param_end(),
3372 std::back_inserter(ParamTys),
3373 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3374
3375 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3376 dyn_cast<IntrinsicInst>(CI));
3377 return TTI.getIntrinsicInstrCost(CostAttrs,
3379}
3380
3382 VPlan &Plan) {
3383 // Fix widened non-induction PHIs by setting up the PHI operands.
3385 fixNonInductionPHIs(Plan, State);
3386
3387 // At this point every instruction in the original loop is widened to a
3388 // vector form. Note that fixing reduction phis, as well as extracting the
3389 // exit and resume values for fixed-order recurrences are already modeled in
3390 // VPlan. All that remains to do here is to create a phi in the scalar
3391 // pre-header for each fixed-order recurrence resume value.
3392 // TODO: Also model creating phis in the scalar pre-header in VPlan.
3393 for (const auto &[_, LO] : to_vector(Plan.getLiveOuts())) {
3394 if (!Legal->isFixedOrderRecurrence(LO->getPhi()))
3395 continue;
3396 fixFixedOrderRecurrence(LO, State);
3397 Plan.removeLiveOut(LO->getPhi());
3398 }
3399
3400 // Forget the original basic block.
3403
3404 // After vectorization, the exit blocks of the original loop will have
3405 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3406 // looked through single-entry phis.
3407 SmallVector<BasicBlock *> ExitBlocks;
3408 OrigLoop->getExitBlocks(ExitBlocks);
3409 for (BasicBlock *Exit : ExitBlocks)
3410 for (PHINode &PN : Exit->phis())
3412
3413 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3414 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3415 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3416 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3417 // No edge from the middle block to the unique exit block has been inserted
3418 // and there is nothing to fix from vector loop; phis should have incoming
3419 // from scalar loop only.
3420 } else {
3421 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3422 // the cost model.
3423
3424 // If we inserted an edge from the middle block to the unique exit block,
3425 // update uses outside the loop (phis) to account for the newly inserted
3426 // edge.
3427
3428 // Fix-up external users of the induction variables.
3429 for (const auto &Entry : Legal->getInductionVars())
3430 fixupIVUsers(Entry.first, Entry.second,
3432 IVEndValues[Entry.first], LoopMiddleBlock,
3433 VectorLoop->getHeader(), Plan, State);
3434 }
3435
3436 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3437 // in the exit block, so update the builder.
3438 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3439 State.CFG.ExitBB->getFirstNonPHIIt());
3440 for (const auto &KV : Plan.getLiveOuts())
3441 KV.second->fixPhi(Plan, State);
3442
3444 sinkScalarOperands(&*PI);
3445
3446 // Remove redundant induction instructions.
3447 cse(VectorLoop->getHeader());
3448
3449 // Set/update profile weights for the vector and remainder loops, as the original
3450 // loop iterations are now distributed among them. Note that the original loop,
3451 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3452 //
3453 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3454 // end up with a slightly roughened result, but that should be OK since the
3455 // profile is not inherently precise anyway. Note also that a possible bypass
3456 // of the vector code caused by legality checks is ignored, optimistically
3457 // assigning all the weight to the vector loop.
3458 //
3459 // For scalable vectorization we can't know at compile time how many iterations
3460 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3461 // vscale of '1'.
3464 VF.getKnownMinValue() * UF);
3465}
3466
3468 VPTransformState &State) {
3469 // Extract the last vector element in the middle block. This will be the
3470 // initial value for the recurrence when jumping to the scalar loop.
3471 VPValue *VPExtract = LO->getOperand(0);
3472 using namespace llvm::VPlanPatternMatch;
3473 assert(match(VPExtract, m_VPInstruction<VPInstruction::ExtractFromEnd>(
3474 m_VPValue(), m_VPValue())) &&
3475 "FOR LiveOut expects to use an extract from end.");
3476 Value *ResumeScalarFOR = State.get(VPExtract, UF - 1, true);
3477
3478 // Fix the initial value of the original recurrence in the scalar loop.
3479 PHINode *ScalarHeaderPhi = LO->getPhi();
3480 auto *InitScalarFOR =
3483 auto *ScalarPreheaderPhi =
3484 Builder.CreatePHI(ScalarHeaderPhi->getType(), 2, "scalar.recur.init");
3485 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3486 auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
3487 ScalarPreheaderPhi->addIncoming(Incoming, BB);
3488 }
3490 ScalarPreheaderPhi);
3491 ScalarHeaderPhi->setName("scalar.recur");
3492}
3493
3495 // The basic block and loop containing the predicated instruction.
3496 auto *PredBB = PredInst->getParent();
3497 auto *VectorLoop = LI->getLoopFor(PredBB);
3498
3499 // Initialize a worklist with the operands of the predicated instruction.
3500 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3501
3502 // Holds instructions that we need to analyze again. An instruction may be
3503 // reanalyzed if we don't yet know if we can sink it or not.
3504 SmallVector<Instruction *, 8> InstsToReanalyze;
3505
3506 // Returns true if a given use occurs in the predicated block. Phi nodes use
3507 // their operands in their corresponding predecessor blocks.
3508 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3509 auto *I = cast<Instruction>(U.getUser());
3510 BasicBlock *BB = I->getParent();
3511 if (auto *Phi = dyn_cast<PHINode>(I))
3512 BB = Phi->getIncomingBlock(
3513 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3514 return BB == PredBB;
3515 };
3516
3517 // Iteratively sink the scalarized operands of the predicated instruction
3518 // into the block we created for it. When an instruction is sunk, its
3519 // operands are then added to the worklist. The algorithm ends when one pass
3520 // through the worklist fails to sink a single instruction.
3521 bool Changed;
3522 do {
3523 // Add the instructions that need to be reanalyzed to the worklist, and
3524 // reset the changed indicator.
3525 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3526 InstsToReanalyze.clear();
3527 Changed = false;
3528
3529 while (!Worklist.empty()) {
3530 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3531
3532 // We can't sink an instruction if it is a phi node, is not in the loop,
3533 // may have side effects or may read from memory.
3534 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3535 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3536 I->mayHaveSideEffects() || I->mayReadFromMemory())
3537 continue;
3538
3539 // If the instruction is already in PredBB, check if we can sink its
3540 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3541 // sinking the scalar instruction I, hence it appears in PredBB; but it
3542 // may have failed to sink I's operands (recursively), which we try
3543 // (again) here.
3544 if (I->getParent() == PredBB) {
3545 Worklist.insert(I->op_begin(), I->op_end());
3546 continue;
3547 }
3548
3549 // It's legal to sink the instruction if all its uses occur in the
3550 // predicated block. Otherwise, there's nothing to do yet, and we may
3551 // need to reanalyze the instruction.
3552 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3553 InstsToReanalyze.push_back(I);
3554 continue;
3555 }
3556
3557 // Move the instruction to the beginning of the predicated block, and add
3558 // its operands to the worklist.
3559 I->moveBefore(&*PredBB->getFirstInsertionPt());
3560 Worklist.insert(I->op_begin(), I->op_end());
3561
3562 // The sinking may have enabled other instructions to be sunk, so we will
3563 // need to iterate.
3564 Changed = true;
3565 }
3566 } while (Changed);
3567}
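// Editorial sketch (hypothetical source loop): when the guarded store below is
// scalarized and predicated, the address computation feeding only that store
// can be sunk into the predicated block so it is not executed for masked-off
// lanes:
//
//   for (int i = 0; i < n; ++i)
//     if (cond[i])
//       out[idx[i]] = v[i];  // &out[idx[i]] is used only by the guarded store
//
// Operands whose uses are not yet all inside the predicated block are kept in
// InstsToReanalyze and retried until a full pass sinks nothing.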
3568
3570 VPTransformState &State) {
3571 auto Iter = vp_depth_first_deep(Plan.getEntry());
3572 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3573 for (VPRecipeBase &P : VPBB->phis()) {
3574 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3575 if (!VPPhi)
3576 continue;
3577 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3578 // Make sure the builder has a valid insert point.
3579 Builder.SetInsertPoint(NewPhi);
3580 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3581 VPValue *Inc = VPPhi->getIncomingValue(i);
3582 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3583 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3584 }
3585 }
3586 }
3587}
3588
3589void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3590 // We should not collect Scalars more than once per VF. Right now, this
3591 // function is called from collectUniformsAndScalars(), which already does
3592 // this check. Collecting Scalars for VF=1 does not make any sense.
3593 assert(VF.isVector() && !Scalars.contains(VF) &&
3594 "This function should not be visited twice for the same VF");
3595
3596 // This avoids any chances of creating a REPLICATE recipe during planning
3597 // since that would result in generation of scalarized code during execution,
3598 // which is not supported for scalable vectors.
3599 if (VF.isScalable()) {
3600 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3601 return;
3602 }
3603
3605
3606 // These sets are used to seed the analysis with pointers used by memory
3607 // accesses that will remain scalar.
3609 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3610 auto *Latch = TheLoop->getLoopLatch();
3611
3612 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3613 // The pointer operands of loads and stores will be scalar as long as the
3614 // memory access is not a gather or scatter operation. The value operand of a
3615 // store will remain scalar if the store is scalarized.
3616 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3617 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3618 assert(WideningDecision != CM_Unknown &&
3619 "Widening decision should be ready at this moment");
3620 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3621 if (Ptr == Store->getValueOperand())
3622 return WideningDecision == CM_Scalarize;
3623 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3624 "Ptr is neither a value nor a pointer operand");
3625 return WideningDecision != CM_GatherScatter;
3626 };
3627
3628 // A helper that returns true if the given value is a bitcast or
3629 // getelementptr instruction contained in the loop.
3630 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3631 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3632 isa<GetElementPtrInst>(V)) &&
3634 };
3635
3636 // A helper that evaluates a memory access's use of a pointer. If the use will
3637 // be a scalar use and the pointer is only used by memory accesses, we place
3638 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3639 // PossibleNonScalarPtrs.
3640 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3641 // We only care about bitcast and getelementptr instructions contained in
3642 // the loop.
3643 if (!isLoopVaryingBitCastOrGEP(Ptr))
3644 return;
3645
3646 // If the pointer has already been identified as scalar (e.g., if it was
3647 // also identified as uniform), there's nothing to do.
3648 auto *I = cast<Instruction>(Ptr);
3649 if (Worklist.count(I))
3650 return;
3651
3652 // If the use of the pointer will be a scalar use, and all users of the
3653 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3654 // place the pointer in PossibleNonScalarPtrs.
3655 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3656 return isa<LoadInst>(U) || isa<StoreInst>(U);
3657 }))
3658 ScalarPtrs.insert(I);
3659 else
3660 PossibleNonScalarPtrs.insert(I);
3661 };
3662
3663 // We seed the scalars analysis with two classes of instructions: (1)
3664 // instructions marked uniform-after-vectorization and (2) bitcast,
3665 // getelementptr and (pointer) phi instructions used by memory accesses
3666 // requiring a scalar use.
3667 //
3668 // (1) Add to the worklist all instructions that have been identified as
3669 // uniform-after-vectorization.
3670 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3671
3672 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3673 // memory accesses requiring a scalar use. The pointer operands of loads and
3674 // stores will be scalar as long as the memory access is not a gather or
3675 // scatter operation. The value operand of a store will remain scalar if the
3676 // store is scalarized.
3677 for (auto *BB : TheLoop->blocks())
3678 for (auto &I : *BB) {
3679 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3680 evaluatePtrUse(Load, Load->getPointerOperand());
3681 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3682 evaluatePtrUse(Store, Store->getPointerOperand());
3683 evaluatePtrUse(Store, Store->getValueOperand());
3684 }
3685 }
3686 for (auto *I : ScalarPtrs)
3687 if (!PossibleNonScalarPtrs.count(I)) {
3688 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3689 Worklist.insert(I);
3690 }
3691
3692 // Insert the forced scalars.
3693 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3694 // induction variable when the PHI user is scalarized.
3695 auto ForcedScalar = ForcedScalars.find(VF);
3696 if (ForcedScalar != ForcedScalars.end())
3697 for (auto *I : ForcedScalar->second) {
3698 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3699 Worklist.insert(I);
3700 }
3701
3702 // Expand the worklist by looking through any bitcasts and getelementptr
3703 // instructions we've already identified as scalar. This is similar to the
3704 // expansion step in collectLoopUniforms(); however, here we're only
3705 // expanding to include additional bitcasts and getelementptr instructions.
3706 unsigned Idx = 0;
3707 while (Idx != Worklist.size()) {
3708 Instruction *Dst = Worklist[Idx++];
3709 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3710 continue;
3711 auto *Src = cast<Instruction>(Dst->getOperand(0));
3712 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3713 auto *J = cast<Instruction>(U);
3714 return !TheLoop->contains(J) || Worklist.count(J) ||
3715 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3716 isScalarUse(J, Src));
3717 })) {
3718 Worklist.insert(Src);
3719 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3720 }
3721 }
3722
3723 // An induction variable will remain scalar if all users of the induction
3724 // variable and induction variable update remain scalar.
3725 for (const auto &Induction : Legal->getInductionVars()) {
3726 auto *Ind = Induction.first;
3727 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3728
3729 // If tail-folding is applied, the primary induction variable will be used
3730 // to feed a vector compare.
3731 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3732 continue;
3733
3734 // Returns true if \p Indvar is a pointer induction that is used directly by
3735 // load/store instruction \p I.
3736 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3737 Instruction *I) {
3738 return Induction.second.getKind() ==
3740 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3741 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3742 };
3743
3744 // Determine if all users of the induction variable are scalar after
3745 // vectorization.
3746 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3747 auto *I = cast<Instruction>(U);
3748 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3749 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3750 });
3751 if (!ScalarInd)
3752 continue;
3753
3754 // If the induction variable update is a fixed-order recurrence, neither the
3755 // induction variable nor its update should be marked scalar after
3756 // vectorization.
3757 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3758 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3759 continue;
3760
3761 // Determine if all users of the induction variable update instruction are
3762 // scalar after vectorization.
3763 auto ScalarIndUpdate =
3764 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3765 auto *I = cast<Instruction>(U);
3766 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3767 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3768 });
3769 if (!ScalarIndUpdate)
3770 continue;
3771
3772 // The induction variable and its update instruction will remain scalar.
3773 Worklist.insert(Ind);
3774 Worklist.insert(IndUpdate);
3775 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3776 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3777 << "\n");
3778 }
3779
3780 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3781}
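// Editorial example (hypothetical source loop) of a pointer that stays scalar:
//
//   for (int i = 0; i < n; ++i)
//     sum += a[i];   // consecutive access: the widened load only needs the
//                    // scalar address &a[i], so the GEP lands in Scalars[VF]
//
// By contrast, an access like a[idx[i]] that becomes a gather keeps a vector
// of pointers, so its address computation is not added to Scalars[VF].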
3782
3784 Instruction *I, ElementCount VF) const {
3785 if (!isPredicatedInst(I))
3786 return false;
3787
3788 // Do we have a non-scalar lowering for this predicated
3789 // instruction? No - it is scalar with predication.
3790 switch(I->getOpcode()) {
3791 default:
3792 return true;
3793 case Instruction::Call:
3794 if (VF.isScalar())
3795 return true;
3796 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3797 .Kind == CM_Scalarize;
3798 case Instruction::Load:
3799 case Instruction::Store: {
3801 auto *Ty = getLoadStoreType(I);
3802 Type *VTy = Ty;
3803 if (VF.isVector())
3804 VTy = VectorType::get(Ty, VF);
3805 const Align Alignment = getLoadStoreAlignment(I);
3806 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3807 TTI.isLegalMaskedGather(VTy, Alignment))
3808 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3809 TTI.isLegalMaskedScatter(VTy, Alignment));
3810 }
3811 case Instruction::UDiv:
3812 case Instruction::SDiv:
3813 case Instruction::SRem:
3814 case Instruction::URem: {
3815 // We have the option to use the safe-divisor idiom to avoid predication.
3816 // The cost based decision here will always select safe-divisor for
3817 // scalable vectors as scalarization isn't legal.
3818 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3819 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3820 }
3821 }
3822}
3823
3825 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3826 return false;
3827
3828 // Can we prove this instruction is safe to unconditionally execute?
3829 // If not, we must use some form of predication.
3830 switch(I->getOpcode()) {
3831 default:
3832 return false;
3833 case Instruction::Load:
3834 case Instruction::Store: {
3835 if (!Legal->isMaskRequired(I))
3836 return false;
3837 // When we know the load's address is loop invariant and the instruction
3838 // in the original scalar loop was unconditionally executed then we
3839 // don't need to mark it as a predicated instruction. Tail folding may
3840 // introduce additional predication, but we're guaranteed to always have
3841 // at least one active lane. We call Legal->blockNeedsPredication here
3842 // because it doesn't query tail-folding. For stores, we must prove both
3843 // speculation safety (which follows from the same argument as loads) and
3844 // that the value being stored is correct. The easiest form of the latter
3845 // is to require that all values stored are the same.
3847 (isa<LoadInst>(I) ||
3848 (isa<StoreInst>(I) &&
3849 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3850 !Legal->blockNeedsPredication(I->getParent()))
3851 return false;
3852 return true;
3853 }
3854 case Instruction::UDiv:
3855 case Instruction::SDiv:
3856 case Instruction::SRem:
3857 case Instruction::URem:
3858 // TODO: We can use the loop preheader as a context point here and get
3859 // context-sensitive reasoning.
3861 case Instruction::Call:
3862 return Legal->isMaskRequired(I);
3863 }
3864}
3865
3866std::pair<InstructionCost, InstructionCost>
3868 ElementCount VF) const {
3869 assert(I->getOpcode() == Instruction::UDiv ||
3870 I->getOpcode() == Instruction::SDiv ||
3871 I->getOpcode() == Instruction::SRem ||
3872 I->getOpcode() == Instruction::URem);
3874
3876
3877 // Scalarization isn't legal for scalable vector types
3878 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3879 if (!VF.isScalable()) {
3880 // Get the scalarization cost and scale this amount by the probability of
3881 // executing the predicated block. If the instruction is not predicated,
3882 // we fall through to the next case.
3883 ScalarizationCost = 0;
3884
3885 // These instructions have a non-void type, so account for the phi nodes
3886 // that we will create. This cost is likely to be zero. The phi node
3887 // cost, if any, should be scaled by the block probability because it
3888 // models a copy at the end of each predicated block.
3889 ScalarizationCost += VF.getKnownMinValue() *
3890 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3891
3892 // The cost of the non-predicated instruction.
3893 ScalarizationCost += VF.getKnownMinValue() *
3894 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3895
3896 // The cost of insertelement and extractelement instructions needed for
3897 // scalarization.
3898 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3899
3900 // Scale the cost by the probability of executing the predicated blocks.
3901 // This assumes the predicated block for each vector lane is equally
3902 // likely.
3903 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3904 }
3905 InstructionCost SafeDivisorCost = 0;
3906
3907 auto *VecTy = ToVectorTy(I->getType(), VF);
3908
3909 // The cost of the select guard to ensure all lanes are well defined
3910 // after we speculate above any internal control flow.
3911 SafeDivisorCost += TTI.getCmpSelInstrCost(
3912 Instruction::Select, VecTy,
3913 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3915
3916 // Certain instructions can be cheaper to vectorize if they have a constant
3917 // second vector operand. One example of this are shifts on x86.
3918 Value *Op2 = I->getOperand(1);
3919 auto Op2Info = TTI.getOperandInfo(Op2);
3920 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3921 Legal->isInvariant(Op2))
3923
3924 SmallVector<const Value *, 4> Operands(I->operand_values());
3925 SafeDivisorCost += TTI.getArithmeticInstrCost(
3926 I->getOpcode(), VecTy, CostKind,
3927 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3928 Op2Info, Operands, I);
3929 return {ScalarizationCost, SafeDivisorCost};
3930}
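// Editorial worked example (hypothetical costs, and assuming
// getReciprocalPredBlockProb() returns 2, i.e. the predicated block runs for
// about half the lanes): for VF = 4, a udiv with scalar cost 20, a PHI cost of
// 0 and a scalarization overhead of 8:
//
//   ScalarizationCost = (4 * 0 + 4 * 20 + 8) / 2 = 44
//
// This is then weighed against SafeDivisorCost (the select guard plus the
// vector divide) to choose between scalarization and the safe-divisor idiom.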
3931
3933 Instruction *I, ElementCount VF) const {
3934 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3936 "Decision should not be set yet.");
3937 auto *Group = getInterleavedAccessGroup(I);
3938 assert(Group && "Must have a group.");
3939
3940 // If the instruction's allocated size doesn't equal its type size, it
3941 // requires padding and will be scalarized.
3942 auto &DL = I->getDataLayout();
3943 auto *ScalarTy = getLoadStoreType(I);
3944 if (hasIrregularType(ScalarTy, DL))
3945 return false;
3946
3947 // If the group involves a non-integral pointer, we may not be able to
3948 // losslessly cast all values to a common type.
3949 unsigned InterleaveFactor = Group->getFactor();
3950 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3951 for (unsigned i = 0; i < InterleaveFactor; i++) {
3952 Instruction *Member = Group->getMember(i);
3953 if (!Member)
3954 continue;
3955 auto *MemberTy = getLoadStoreType(Member);
3956 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3957 // Don't coerce non-integral pointers to integers or vice versa.
3958 if (MemberNI != ScalarNI) {
3959 // TODO: Consider adding special nullptr value case here
3960 return false;
3961 } else if (MemberNI && ScalarNI &&
3962 ScalarTy->getPointerAddressSpace() !=
3963 MemberTy->getPointerAddressSpace()) {
3964 return false;
3965 }
3966 }
3967
3968 // Check if masking is required.
3969 // A Group may need masking for one of two reasons: it resides in a block that
3970 // needs predication, or it was decided to use masking to deal with gaps
3971 // (either a gap at the end of a load-access that may result in a speculative
3972 // load, or any gaps in a store-access).
3973 bool PredicatedAccessRequiresMasking =
3974 blockNeedsPredicationForAnyReason(I->getParent()) &&
3976 bool LoadAccessWithGapsRequiresEpilogMasking =
3977 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3979 bool StoreAccessWithGapsRequiresMasking =
3980 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3981 if (!PredicatedAccessRequiresMasking &&
3982 !LoadAccessWithGapsRequiresEpilogMasking &&
3983 !StoreAccessWithGapsRequiresMasking)
3984 return true;
3985
3986 // If masked interleaving is required, we expect that the user/target had
3987 // enabled it, because otherwise it either wouldn't have been created or
3988 // it should have been invalidated by the CostModel.
3990 "Masked interleave-groups for predicated accesses are not enabled.");
3991
3992 if (Group->isReverse())
3993 return false;
3994
3995 auto *Ty = getLoadStoreType(I);
3996 const Align Alignment = getLoadStoreAlignment(I);
3997 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3998 : TTI.isLegalMaskedStore(Ty, Alignment);
3999}
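// Editorial example (hypothetical source loop) of a store group with gaps for
// which StoreAccessWithGapsRequiresMasking is true above (factor 3, only 2
// members):
//
//   struct S { int x, y, z; };
//   for (int i = 0; i < n; ++i) {
//     s[i].x = a[i];   // member 0
//     s[i].z = b[i];   // member 2; member 1 (y) is a gap
//   }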
4000
4002 Instruction *I, ElementCount VF) {
4003 // Get and ensure we have a valid memory instruction.
4004 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4005
4007 auto *ScalarTy = getLoadStoreType(I);
4008
4009 // In order to be widened, the pointer should be consecutive, first of all.
4010 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4011 return false;
4012
4013 // If the instruction is a store located in a predicated block, it will be
4014 // scalarized.
4015 if (isScalarWithPredication(I, VF))
4016 return false;
4017
4018 // If the instruction's allocated size doesn't equal its type size, it
4019 // requires padding and will be scalarized.
4020 auto &DL = I->getDataLayout();
4021 if (hasIrregularType(ScalarTy, DL))
4022 return false;
4023
4024 return true;
4025}
4026
4027void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4028 // We should not collect Uniforms more than once per VF. Right now,
4029 // this function is called from collectUniformsAndScalars(), which
4030 // already does this check. Collecting Uniforms for VF=1 does not make any
4031 // sense.
4032
4033 assert(VF.isVector() && !Uniforms.contains(VF) &&
4034 "This function should not be visited twice for the same VF");
4035
4036 // Visit the list of Uniforms. If we do not find any uniform value, we will
4037 // not analyze it again; Uniforms.count(VF) will still return 1.
4038 Uniforms[VF].clear();
4039
4040 // We now know that the loop is vectorizable!
4041 // Collect instructions inside the loop that will remain uniform after
4042 // vectorization.
4043
4044 // Global values, params and instructions outside of current loop are out of
4045 // scope.
4046 auto isOutOfScope = [&](Value *V) -> bool {
4047 Instruction *I = dyn_cast<Instruction>(V);
4048 return (!I || !TheLoop->contains(I));
4049 };
4050
4051 // Worklist containing uniform instructions demanding lane 0.
4052 SetVector<Instruction *> Worklist;
4053
4054 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4055 // that are scalar with predication must not be considered uniform after
4056 // vectorization, because that would create an erroneous replicating region
4057 // where only a single instance out of VF should be formed.
4058 // TODO: optimize such seldom cases if found important, see PR40816.
4059 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4060 if (isOutOfScope(I)) {
4061 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4062 << *I << "\n");
4063 return;
4064 }
4065 if (isScalarWithPredication(I, VF)) {
4066 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4067 << *I << "\n");
4068 return;
4069 }
4070 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4071 Worklist.insert(I);
4072 };
4073
4074 // Start with the conditional branches exiting the loop. If the branch
4075 // condition is an instruction contained in the loop that is only used by the
4076 // branch, it is uniform.
4078 TheLoop->getExitingBlocks(Exiting);
4079 for (BasicBlock *E : Exiting) {
4080 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
4081 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4082 addToWorklistIfAllowed(Cmp);
4083 }
4084
4085 auto PrevVF = VF.divideCoefficientBy(2);
4086 // Return true if all lanes perform the same memory operation, and we can
4087 // thus choose to execute only one.
4088 auto isUniformMemOpUse = [&](Instruction *I) {
4089 // If the value was already known to not be uniform for the previous
4090 // (smaller VF), it cannot be uniform for the larger VF.
4091 if (PrevVF.isVector()) {
4092 auto Iter = Uniforms.find(PrevVF);
4093 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4094 return false;
4095 }
4096 if (!Legal->isUniformMemOp(*I, VF))
4097 return false;
4098 if (isa<LoadInst>(I))
4099 // Loading the same address always produces the same result - at least
4100 // assuming aliasing and ordering which have already been checked.
4101 return true;
4102 // Storing the same value on every iteration.
4103 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4104 };
4105
4106 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4107 InstWidening WideningDecision = getWideningDecision(I, VF);
4108 assert(WideningDecision != CM_Unknown &&
4109 "Widening decision should be ready at this moment");
4110
4111 if (isUniformMemOpUse(I))
4112 return true;
4113
4114 return (WideningDecision == CM_Widen ||
4115 WideningDecision == CM_Widen_Reverse ||
4116 WideningDecision == CM_Interleave);
4117 };
4118
4119 // Returns true if Ptr is the pointer operand of a memory access instruction
4120 // I, I is known to not require scalarization, and the pointer is not also
4121 // stored.
4122 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4123 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4124 return false;
4125 return getLoadStorePointerOperand(I) == Ptr &&
4126 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4127 };
4128
4129 // Holds a list of values which are known to have at least one uniform use.
4130 // Note that there may be other uses which aren't uniform. A "uniform use"
4131 // here is something which only demands lane 0 of the unrolled iterations;
4132 // it does not imply that all lanes produce the same value (e.g. this is not
4133 // the usual meaning of uniform).
4134 SetVector<Value *> HasUniformUse;
4135
4136 // Scan the loop for instructions which are either a) known to have only
4137 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4138 for (auto *BB : TheLoop->blocks())
4139 for (auto &I : *BB) {
4140 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4141 switch (II->getIntrinsicID()) {
4142 case Intrinsic::sideeffect:
4143 case Intrinsic::experimental_noalias_scope_decl:
4144 case Intrinsic::assume:
4145 case Intrinsic::lifetime_start:
4146 case Intrinsic::lifetime_end:
4148 addToWorklistIfAllowed(&I);
4149 break;
4150 default:
4151 break;
4152 }
4153 }
4154
4155 // ExtractValue instructions must be uniform, because the operands are
4156 // known to be loop-invariant.
4157 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4158 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4159 "Expected aggregate value to be loop invariant");
4160 addToWorklistIfAllowed(EVI);
4161 continue;
4162 }
4163
4164 // If there's no pointer operand, there's nothing to do.
4166 if (!Ptr)
4167 continue;
4168
4169 if (isUniformMemOpUse(&I))
4170 addToWorklistIfAllowed(&I);
4171
4172 if (isVectorizedMemAccessUse(&I, Ptr))
4173 HasUniformUse.insert(Ptr);
4174 }
4175
4176 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4177 // demanding) users. Since loops are assumed to be in LCSSA form, this
4178 // disallows uses outside the loop as well.
4179 for (auto *V : HasUniformUse) {
4180 if (isOutOfScope(V))
4181 continue;
4182 auto *I = cast<Instruction>(V);
4183 auto UsersAreMemAccesses =
4184 llvm::all_of(I->users(), [&](User *U) -> bool {
4185 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4186 });
4187 if (UsersAreMemAccesses)
4188 addToWorklistIfAllowed(I);
4189 }
4190
4191 // Expand Worklist in topological order: whenever a new instruction
4192 // is added, its users should already be inside Worklist. This ensures
4193 // a uniform instruction will only be used by uniform instructions.
4194 unsigned idx = 0;
4195 while (idx != Worklist.size()) {
4196 Instruction *I = Worklist[idx++];
4197
4198 for (auto *OV : I->operand_values()) {
4199 // isOutOfScope operands cannot be uniform instructions.
4200 if (isOutOfScope(OV))
4201 continue;
4202 // First-order recurrence phis should typically be considered
4203 // non-uniform.
4204 auto *OP = dyn_cast<PHINode>(OV);
4206 continue;
4207 // If all the users of the operand are uniform, then add the
4208 // operand into the uniform worklist.
4209 auto *OI = cast<Instruction>(OV);
4210 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4211 auto *J = cast<Instruction>(U);
4212 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4213 }))
4214 addToWorklistIfAllowed(OI);
4215 }
4216 }
4217
4218 // For an instruction to be added into Worklist above, all its users inside
4219 // the loop should also be in Worklist. However, this condition cannot be
4220 // true for phi nodes that form a cyclic dependence. We must process phi
4221 // nodes separately. An induction variable will remain uniform if all users
4222 // of the induction variable and induction variable update remain uniform.
4223 // The code below handles both pointer and non-pointer induction variables.
4224 BasicBlock *Latch = TheLoop->getLoopLatch();
4225 for (const auto &Induction : Legal->getInductionVars()) {
4226 auto *Ind = Induction.first;
4227 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4228
4229 // Determine if all users of the induction variable are uniform after
4230 // vectorization.
4231 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4232 auto *I = cast<Instruction>(U);
4233 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4234 isVectorizedMemAccessUse(I, Ind);
4235 });
4236 if (!UniformInd)
4237 continue;
4238
4239 // Determine if all users of the induction variable update instruction are
4240 // uniform after vectorization.
4241 auto UniformIndUpdate =
4242 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4243 auto *I = cast<Instruction>(U);
4244 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4245 isVectorizedMemAccessUse(I, IndUpdate);
4246 });
4247 if (!UniformIndUpdate)
4248 continue;
4249
4250 // The induction variable and its update instruction will remain uniform.
4251 addToWorklistIfAllowed(Ind);
4252 addToWorklistIfAllowed(IndUpdate);
4253 }
4254
4255 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4256}
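// Editorial example (hypothetical source loop) of a "uniform use": a load from
// a loop-invariant address only demands lane 0, even though the loaded value
// then feeds widened, non-uniform users:
//
//   for (int i = 0; i < n; ++i)
//     out[i] = a[i] * (*scale);  // *scale is a uniform mem op: one scalar
//                                // load per vector iteration suffices
//
// As noted above, "uniform" here means only lane 0 is demanded, not that all
// lanes are guaranteed to produce the same value.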
4257
4259 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4260
4262 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4263 "runtime pointer checks needed. Enable vectorization of this "
4264 "loop with '#pragma clang loop vectorize(enable)' when "
4265 "compiling with -Os/-Oz",
4266 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4267 return true;
4268 }
4269
4270 if (!PSE.getPredicate().isAlwaysTrue()) {
4271 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4272 "runtime SCEV checks needed. Enable vectorization of this "
4273 "loop with '#pragma clang loop vectorize(enable)' when "
4274 "compiling with -Os/-Oz",
4275 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4276 return true;
4277 }
4278
4279 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4280 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4281 reportVectorizationFailure("Runtime stride check for small trip count",
4282 "runtime stride == 1 checks needed. Enable vectorization of "
4283 "this loop without such check by compiling with -Os/-Oz",
4284 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4285 return true;
4286 }
4287
4288 return false;
4289}
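// Editorial usage example for the remarks above (hypothetical user code): the
// suggested pragma re-enables vectorization despite -Os/-Oz, accepting the
// required runtime checks:
//
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];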
4290
4292LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4294 return ElementCount::getScalable(0);
4295
4297 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4298 "ScalableVectorizationDisabled", ORE, TheLoop);
4299 return ElementCount::getScalable(0);
4300 }
4301
4302 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4303
4304 auto MaxScalableVF = ElementCount::getScalable(
4305 std::numeric_limits<ElementCount::ScalarTy>::max());
4306
4307 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4308 // FIXME: While for scalable vectors this is currently sufficient, this should
4309 // be replaced by a more detailed mechanism that filters out specific VFs,
4310 // instead of invalidating vectorization for a whole set of VFs based on the
4311 // MaxVF.
4312
4313 // Disable scalable vectorization if the loop contains unsupported reductions.
4314 if (!canVectorizeReductions(MaxScalableVF)) {
4316 "Scalable vectorization not supported for the reduction "
4317 "operations found in this loop.",
4318 "ScalableVFUnfeasible", ORE, TheLoop);
4319 return ElementCount::getScalable(0);
4320 }
4321
4322 // Disable scalable vectorization if the loop contains any instructions
4323 // with element types not supported for scalable vectors.
4324 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4325 return !Ty->isVoidTy() &&
4327 })) {
4328 reportVectorizationInfo("Scalable vectorization is not supported "
4329 "for all element types found in this loop.",
4330 "ScalableVFUnfeasible", ORE, TheLoop);
4331 return ElementCount::getScalable(0);
4332 }
4333
4335 return MaxScalableVF;
4336
4337 // Limit MaxScalableVF by the maximum safe dependence distance.
4338 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4339 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4340 else
4341 MaxScalableVF = ElementCount::getScalable(0);
4342
4343 if (!MaxScalableVF)
4345 "Max legal vector width too small, scalable vectorization "
4346 "unfeasible.",
4347 "ScalableVFUnfeasible", ORE, TheLoop);
4348
4349 return MaxScalableVF;
4350}
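// Editorial worked example (hypothetical target): with MaxSafeElements = 32
// and getMaxVScale() returning 16, the clamp above yields
//
//   MaxScalableVF = ElementCount::getScalable(32 / 16)   // i.e. vscale x 2
//
// so at most two elements per vscale granule can be processed without
// exceeding the maximum safe dependence distance.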
4351
4352FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4353 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4355 unsigned SmallestType, WidestType;
4356 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4357
4358 // Get the maximum safe dependence distance in bits computed by LAA.
4359 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4360 // the memory access that is most restrictive (involved in the smallest
4361 // dependence distance).
4362 unsigned MaxSafeElements =
4364
4365 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4366 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4367
4368 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4369 << ".\n");
4370 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4371 << ".\n");
4372
4373 // First analyze the UserVF, fall back if the UserVF should be ignored.
4374 if (UserVF) {
4375 auto MaxSafeUserVF =
4376 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4377
4378 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4379 // If `VF=vscale x N` is safe, then so is `VF=N`
4380 if (UserVF.isScalable())
4381 return FixedScalableVFPair(
4382 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4383 else
4384 return UserVF;
4385 }
4386
4387 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4388
4389 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4390 // is better to ignore the hint and let the compiler choose a suitable VF.
4391 if (!UserVF.isScalable()) {
4392 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4393 << " is unsafe, clamping to max safe VF="
4394 << MaxSafeFixedVF << ".\n");
4395 ORE->emit([&]() {
4396 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4398 TheLoop->getHeader())
4399 << "User-specified vectorization factor "
4400 << ore::NV("UserVectorizationFactor", UserVF)
4401 << " is unsafe, clamping to maximum safe vectorization factor "
4402 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4403 });
4404 return MaxSafeFixedVF;
4405 }
4406
4408 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4409 << " is ignored because scalable vectors are not "
4410 "available.\n");
4411 ORE->emit([&]() {
4412 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4414 TheLoop->getHeader())
4415 << "User-specified vectorization factor "
4416 << ore::NV("UserVectorizationFactor", UserVF)
4417 << " is ignored because the target does not support scalable "
4418 "vectors. The compiler will pick a more suitable value.";
4419 });
4420 } else {
4421 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4422 << " is unsafe. Ignoring scalable UserVF.\n");
4423 ORE->emit([&]() {
4424 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4426 TheLoop->getHeader())
4427 << "User-specified vectorization factor "
4428 << ore::NV("UserVectorizationFactor", UserVF)
4429 << " is unsafe. Ignoring the hint to let the compiler pick a "
4430 "more suitable value.";
4431 });
4432 }
4433 }
4434
4435 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4436 << " / " << WidestType << " bits.\n");
4437
4440 if (auto MaxVF =
4441 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4442 MaxSafeFixedVF, FoldTailByMasking))
4443 Result.FixedVF = MaxVF;
4444
4445 if (auto MaxVF =
4446 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4447 MaxSafeScalableVF, FoldTailByMasking))
4448 if (MaxVF.isScalable()) {
4449 Result.ScalableVF = MaxVF;
4450 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4451 << "\n");
4452 }
4453
4454 return Result;
4455}
4456
4460 // TODO: It may be useful to do this since it's still likely to be dynamically
4461 // uniform if the target can skip.
4463 "Not inserting runtime ptr check for divergent target",
4464 "runtime pointer checks needed. Not enabled for divergent target",
4465 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4467 }
4468
4469 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4470 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4471 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4472 if (TC == 1) {
4473 reportVectorizationFailure("Single iteration (non) loop",
4474 "loop trip count is one, irrelevant for vectorization",
4475 "SingleIterationLoop", ORE, TheLoop);
4477 }
4478
4479 switch (ScalarEpilogueStatus) {
4481 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4483 [[fallthrough]];
4485 LLVM_DEBUG(
4486 dbgs() << "LV: vector predicate hint/switch found.\n"
4487 << "LV: Not allowing scalar epilogue, creating predicated "
4488 << "vector loop.\n");
4489 break;
4491 // fallthrough as a special case of OptForSize
4493 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4494 LLVM_DEBUG(
4495 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4496 else
4497 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4498 << "count.\n");
4499
4500 // Bail if runtime checks are required, which are not good when optimising
4501 // for size.
4504
4505 break;
4506 }
4507
4508 // The only loops we can vectorize without a scalar epilogue are loops with
4509 // a bottom-test and a single exiting block. We'd have to handle the fact
4510 // that not every instruction executes on the last iteration. This will
4511 // require a lane mask which varies through the vector loop body. (TODO)
4513 // If there was a tail-folding hint/switch, but we can't fold the tail by
4514 // masking, fallback to a vectorization with a scalar epilogue.
4515 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4516 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4517 "scalar epilogue instead.\n");
4518 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4519 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4520 }
4522 }
4523
4524 // Now try tail folding.
4525
4526 // Invalidate interleave groups that require an epilogue if we can't mask
4527 // the interleave-group.
4529 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4530 "No decisions should have been taken at this point");
4531 // Note: There is no need to invalidate any cost modeling decisions here, as
4532 // none were taken so far.
4534 }
4535
4536 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4537
4538 // Avoid tail folding if the trip count is known to be a multiple of any VF
4539 // we choose.
4540 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4541 MaxFactors.FixedVF.getFixedValue();
4542 if (MaxFactors.ScalableVF) {
4543 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4544 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4545 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4546 *MaxPowerOf2RuntimeVF,
4547 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4548 } else
4549 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4550 }
4551
4552 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4553 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4554 "MaxFixedVF must be a power of 2");
4555 unsigned MaxVFtimesIC =
4556 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4557 ScalarEvolution *SE = PSE.getSE();
4558 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4559 const SCEV *ExitCount = SE->getAddExpr(
4560 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4561 const SCEV *Rem = SE->getURemExpr(
4562 SE->applyLoopGuards(ExitCount, TheLoop),
4563 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4564 if (Rem->isZero()) {
4565 // Accept MaxFixedVF if we do not have a tail.
4566 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4567 return MaxFactors;
4568 }
4569 }
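// Editorial worked example (hypothetical trip count): with TC = 1024,
// MaxPowerOf2RuntimeVF = 8 and UserIC = 2, MaxVFtimesIC = 16 and
//
//   Rem = (BackedgeTakenCount + 1) urem 16 = 1024 % 16 = 0
//
// so no tail remains for any chosen VF and tail folding is unnecessary.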
4570
4571 // If we don't know the precise trip count, or if the trip count that we
4572 // found modulo the vectorization factor is not zero, try to fold the tail
4573 // by masking.
4574 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4575 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4576 if (foldTailByMasking()) {
4578 LLVM_DEBUG(
4579 dbgs()
4580 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4581 "try to generate VP Intrinsics with scalable vector "
4582 "factors only.\n");
4583 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4584 // for now.
4585 // TODO: extend it for fixed vectors, if required.
4586 assert(MaxFactors.ScalableVF.isScalable() &&
4587 "Expected scalable vector factor.");
4588
4589 MaxFactors.FixedVF = ElementCount::getFixed(1);
4590 }
4591 return MaxFactors;
4592 }
4593
4594 // If there was a tail-folding hint/switch, but we can't fold the tail by
4595 // masking, fall back to a vectorization with a scalar epilogue.
4596 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4597 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4598 "scalar epilogue instead.\n");
4599 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4600 return MaxFactors;
4601 }
4602
4603 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4604 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4605 return FixedScalableVFPair::getNone();
4606 }
4607
4608 if (TC == 0) {
4609 reportVectorizationFailure(
4610 "Unable to calculate the loop count due to complex control flow",
4611 "unable to calculate the loop count due to complex control flow",
4612 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4613 return FixedScalableVFPair::getNone();
4614 }
4615
4616 reportVectorizationFailure(
4617 "Cannot optimize for size and vectorize at the same time.",
4618 "cannot optimize for size and vectorize at the same time. "
4619 "Enable vectorization of this loop with '#pragma clang loop "
4620 "vectorize(enable)' when compiling with -Os/-Oz",
4621 "NoTailLoopWithOptForSize", ORE, TheLoop);
4622 return FixedScalableVFPair::getNone();
4623}
4624
4625ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4626 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4627 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4628 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4629 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4630 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4632
4633 // Convenience function to return the minimum of two ElementCounts.
4634 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4635 assert((LHS.isScalable() == RHS.isScalable()) &&
4636 "Scalable flags must match");
4637 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4638 };
4639
4640 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4641 // Note that both WidestRegister and WidestType may not be powers of 2.
4642 auto MaxVectorElementCount = ElementCount::get(
4643 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4644 ComputeScalableMaxVF);
4645 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4646 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4647 << (MaxVectorElementCount * WidestType) << " bits.\n");
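// A minimal standalone sketch of the element-count derivation above: the
// starting VF is bit_floor(widest-register-bits / widest-element-bits),
// clamped to the dependence-safe maximum. C++20, fixed-width vectors only;
// the example numbers are assumptions.
#include <algorithm>
#include <bit>

static unsigned maxVectorElementCount(unsigned WidestRegisterBits,
                                      unsigned WidestTypeBits,
                                      unsigned MaxSafeElements) {
  // 256-bit registers with 64-bit elements give bit_floor(4) == 4 lanes; a
  // non-power-of-two quotient such as 10 is rounded down to 8.
  unsigned EC = std::bit_floor(WidestRegisterBits / WidestTypeBits);
  return std::min(EC, MaxSafeElements);
}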
4648
4649 if (!MaxVectorElementCount) {
4650 LLVM_DEBUG(dbgs() << "LV: The target has no "
4651 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4652 << " vector registers.\n");
4653 return ElementCount::getFixed(1);
4654 }
4655
4656 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4657 if (MaxVectorElementCount.isScalable() &&
4658 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4659 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4660 auto Min = Attr.getVScaleRangeMin();
4661 WidestRegisterMinEC *= Min;
4662 }
4663
4664 // When a scalar epilogue is required, at least one iteration of the scalar
4665 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4666 // max VF that results in a dead vector loop.
4667 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4668 MaxTripCount -= 1;
4669
4670 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4671 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4672 // If upper bound loop trip count (TC) is known at compile time there is no
4673 // point in choosing VF greater than TC (as done in the loop below). Select
4674 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4675 // scalable, we only fall back on a fixed VF when the TC is less than or
4676 // equal to the known number of lanes.
4677 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4678 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4679 "exceeding the constant trip count: "
4680 << ClampedUpperTripCount << "\n");
4681 return ElementCount::get(
4682 ClampedUpperTripCount,
4683 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4684 }
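// A simplified standalone sketch of the clamp above (ignoring the
// tail-folding condition and scalable VFs): with a small compile-time trip
// count, the VF is capped at the largest power of two not exceeding it. C++20.
#include <bit>

static unsigned clampVFToTripCount(unsigned MaxElementCount,
                                   unsigned MaxTripCount) {
  if (MaxTripCount == 0 || MaxTripCount > MaxElementCount)
    return MaxElementCount;              // trip count unknown or large enough
  return std::bit_floor(MaxTripCount);   // e.g. a trip count of 6 clamps to 4
}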
4685
4687 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4689 ElementCount MaxVF = MaxVectorElementCount;
4690 if (MaximizeBandwidth ||
4691 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4694 auto MaxVectorElementCountMaxBW = ElementCount::get(
4695 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4696 ComputeScalableMaxVF);
4697 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4698
4699 // Collect all viable vectorization factors larger than the default MaxVF
4700 // (i.e. MaxVectorElementCount).
4702 for (ElementCount VS = MaxVectorElementCount * 2;
4703 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4704 VFs.push_back(VS);
4705
4706 // For each VF calculate its register usage.
4707 auto RUs = calculateRegisterUsage(VFs);
4708
4709 // Select the largest VF which doesn't require more registers than existing
4710 // ones.
4711 for (int i = RUs.size() - 1; i >= 0; --i) {
4712 bool Selected = true;
4713 for (auto &pair : RUs[i].MaxLocalUsers) {
4714 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4715 if (pair.second > TargetNumRegisters)
4716 Selected = false;
4717 }
4718 if (Selected) {
4719 MaxVF = VFs[i];
4720 break;
4721 }
4722 }
4723 if (ElementCount MinVF =
4724 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4725 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4726 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4727 << ") with target's minimum: " << MinVF << '\n');
4728 MaxVF = MinVF;
4729 }
4730 }
4731
4732 // Invalidate any widening decisions we might have made, in case the loop
4733 // requires predication (decided later), but we have already made some
4734 // load/store widening decisions.
4736 }
4737 return MaxVF;
4738}
4739
4740/// Convenience function that returns the value of vscale_range iff
4741/// vscale_range.min == vscale_range.max or otherwise returns the value
4742/// returned by the corresponding TTI method.
4743static std::optional<unsigned>
4745 const Function *Fn = L->getHeader()->getParent();
4746 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4747 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4748 auto Min = Attr.getVScaleRangeMin();
4749 auto Max = Attr.getVScaleRangeMax();
4750 if (Max && Min == Max)
4751 return Max;
4752 }
4753
4754 return TTI.getVScaleForTuning();
4755}
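// A minimal standalone sketch of the selection above, with plain values
// standing in for the vscale_range attribute accessors and the TTI hook:
#include <optional>

static std::optional<unsigned>
vscaleForTuning(unsigned AttrMin, std::optional<unsigned> AttrMax,
                std::optional<unsigned> TTITuningValue) {
  if (AttrMax && AttrMin == *AttrMax)
    return AttrMax;        // vscale is pinned by vscale_range(min, max)
  return TTITuningValue;   // otherwise defer to the target's tuning value
}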
4756
4757bool LoopVectorizationPlanner::isMoreProfitable(
4758 const VectorizationFactor &A, const VectorizationFactor &B) const {
4759 InstructionCost CostA = A.Cost;
4760 InstructionCost CostB = B.Cost;
4761
4762 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4763
4764 // Improve estimate for the vector width if it is scalable.
4765 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4766 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4767 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4768 if (A.Width.isScalable())
4769 EstimatedWidthA *= *VScale;
4770 if (B.Width.isScalable())
4771 EstimatedWidthB *= *VScale;
4772 }
4773
4774 // Assume vscale may be larger than 1 (or the value being tuned for),
4775 // so that scalable vectorization is slightly favorable over fixed-width
4776 // vectorization.
4777 bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
4778 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4779 const InstructionCost &RHS) {
4780 return PreferScalable ? LHS <= RHS : LHS < RHS;
4781 };
4782
4783 // To avoid the need for FP division:
4784 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4785 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4786 if (!MaxTripCount)
4787 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4788
4789 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4790 InstructionCost VectorCost,
4791 InstructionCost ScalarCost) {
4792 // If the trip count is a known (possibly small) constant, the trip count
4793 // will be rounded up to an integer number of iterations under
4794 // FoldTailByMasking. The total cost in that case will be
4795 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4796 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4797 // some extra overheads, but for the purpose of comparing the costs of
4798 // different VFs we can use this to compare the total loop-body cost
4799 // expected after vectorization.
4800 if (CM.foldTailByMasking())
4801 return VectorCost * divideCeil(MaxTripCount, VF);
4802 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4803 };
4804
4805 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4806 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4807 return CmpFn(RTCostA, RTCostB);
4808}
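// A minimal standalone sketch of the division-free comparison above:
// CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA.
// The "prefer scalable" tie-break is modelled by comparing with <=.
#include <cstdint>

static bool isCheaperPerLane(uint64_t CostA, uint64_t WidthA,
                             uint64_t CostB, uint64_t WidthB,
                             bool PreferAOnTie) {
  uint64_t LHS = CostA * WidthB;
  uint64_t RHS = CostB * WidthA;
  return PreferAOnTie ? LHS <= RHS : LHS < RHS;
}
// e.g. cost 20 at width 4 (5 per lane) beats cost 12 at width 2 (6 per lane):
// 20 * 2 = 40 < 12 * 4 = 48.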
4809
4812 Loop *TheLoop) {
4813 if (InvalidCosts.empty())
4814 return;
4815
4816 // Emit a report of VFs with invalid costs in the loop.
4817
4818 // Group the remarks per instruction, keeping the instruction order from
4819 // InvalidCosts.
4820 std::map<Instruction *, unsigned> Numbering;
4821 unsigned I = 0;
4822 for (auto &Pair : InvalidCosts)
4823 if (!Numbering.count(Pair.first))
4824 Numbering[Pair.first] = I++;
4825
4826 // Sort the list, first on instruction(number) then on VF.
4827 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4828 if (Numbering[A.first] != Numbering[B.first])
4829 return Numbering[A.first] < Numbering[B.first];
4830 const auto &LHS = A.second;
4831 const auto &RHS = B.second;
4832 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4833 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4834 });
4835
4836 // For a list of ordered instruction-vf pairs:
4837 // [(load, vf1), (load, vf2), (store, vf1)]
4838 // Group the instructions together to emit separate remarks for:
4839 // load (vf1, vf2)
4840 // store (vf1)
4841 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4842 auto Subset = ArrayRef<InstructionVFPair>();
4843 do {
4844 if (Subset.empty())
4845 Subset = Tail.take_front(1);
4846
4847 Instruction *I = Subset.front().first;
4848
4849 // If the next instruction is different, or if there are no other pairs,
4850 // emit a remark for the collated subset. e.g.
4851 // [(load, vf1), (load, vf2)]
4852 // to emit:
4853 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4854 if (Subset == Tail || Tail[Subset.size()].first != I) {
4855 std::string OutString;
4856 raw_string_ostream OS(OutString);
4857 assert(!Subset.empty() && "Unexpected empty range");
4858 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4859 for (const auto &Pair : Subset)
4860 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4861 OS << "):";
4862 if (auto *CI = dyn_cast<CallInst>(I))
4863 OS << " call to " << CI->getCalledFunction()->getName();
4864 else
4865 OS << " " << I->getOpcodeName();
4866 OS.flush();
4867 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4868 Tail = Tail.drop_front(Subset.size());
4869 Subset = {};
4870 } else
4871 // Grow the subset by one element
4872 Subset = Tail.take_front(Subset.size() + 1);
4873 } while (!Tail.empty());
4874}
4875
4876VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4877 InstructionCost ExpectedCost =
4878 CM.expectedCost(ElementCount::getFixed(1)).first;
4879 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4880 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4881 assert(any_of(VPlans,
4882 [](std::unique_ptr<VPlan> &P) {
4883 return P->hasVF(ElementCount::getFixed(1));
4884 }) &&
4885 "Expected Scalar VF to be a candidate");
4886
4887 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4888 ExpectedCost);
4889 VectorizationFactor ChosenFactor = ScalarCost;
4890
4891 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4892 if (ForceVectorization &&
4893 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4894 // Ignore scalar width, because the user explicitly wants vectorization.
4895 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4896 // evaluation.
4897 ChosenFactor.Cost = InstructionCost::getMax();
4898 }
4899
4900 SmallVector<InstructionVFPair> InvalidCosts;
4901 for (auto &P : VPlans) {
4902 for (ElementCount VF : P->vectorFactors()) {
4903 // The cost for scalar VF=1 is already calculated, so ignore it.
4904 if (VF.isScalar())
4905 continue;
4906
4908 CM.expectedCost(VF, &InvalidCosts);
4909 VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost);
4910
4911#ifndef NDEBUG
4912 unsigned AssumedMinimumVscale =
4913 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4914 unsigned Width =
4915 Candidate.Width.isScalable()
4916 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4917 : Candidate.Width.getFixedValue();
4918 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4919 << " costs: " << (Candidate.Cost / Width));
4920 if (VF.isScalable())
4921 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4922 << AssumedMinimumVscale << ")");
4923 LLVM_DEBUG(dbgs() << ".\n");
4924#endif
4925
4926 if (!C.second && !ForceVectorization) {
4927 LLVM_DEBUG(
4928 dbgs()
4929 << "LV: Not considering vector loop of width " << VF
4930 << " because it will not generate any vector instructions.\n");
4931 continue;
4932 }
4933
4934 // If profitable, add it to the ProfitableVFs list.
4935 if (isMoreProfitable(Candidate, ScalarCost))
4936 ProfitableVFs.push_back(Candidate);
4937
4938 if (isMoreProfitable(Candidate, ChosenFactor))
4939 ChosenFactor = Candidate;
4940 }
4941 }
4942
4943 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4944
4947 "There are conditional stores.",
4948 "store that is conditionally executed prevents vectorization",
4949 "ConditionalStore", ORE, OrigLoop);
4950 ChosenFactor = ScalarCost;
4951 }
4952
4953 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4954 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4955 << "LV: Vectorization seems to be not beneficial, "
4956 << "but was forced by a user.\n");
4957 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4958 return ChosenFactor;
4959}
4960
4961bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4962 ElementCount VF) const {
4963 // Cross iteration phis such as reductions need special handling and are
4964 // currently unsupported.
4965 if (any_of(OrigLoop->getHeader()->phis(),
4966 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4967 return false;
4968
4969 // Phis with uses outside of the loop require special handling and are
4970 // currently unsupported.
4971 for (const auto &Entry : Legal->getInductionVars()) {
4972 // Look for uses of the value of the induction at the last iteration.
4973 Value *PostInc =
4974 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4975 for (User *U : PostInc->users())
4976 if (!OrigLoop->contains(cast<Instruction>(U)))
4977 return false;
4978 // Look for uses of penultimate value of the induction.
4979 for (User *U : Entry.first->users())
4980 if (!OrigLoop->contains(cast<Instruction>(U)))
4981 return false;
4982 }
4983
4984 // Epilogue vectorization code has not been audited to ensure it handles
4985 // non-latch exits properly. It may be fine, but it needs to be audited and
4986 // tested.
4987 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4988 return false;
4989
4990 return true;
4991}
4992
4994 const ElementCount VF) const {
4995 // FIXME: We need a much better cost-model to take different parameters such
4996 // as register pressure, code size increase and cost of extra branches into
4997 // account. For now we apply a very crude heuristic and only consider loops
4998 // with vectorization factors larger than a certain value.
4999
5000 // Allow the target to opt out entirely.
5002 return false;
5003
5004 // We also consider epilogue vectorization unprofitable for targets that don't
5005 // consider interleaving beneficial (e.g. MVE).
5006 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5007 return false;
5008
5009 unsigned Multiplier = 1;
5010 if (VF.isScalable())
5011 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5012 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5013 return true;
5014 return false;
5015}
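// A minimal standalone sketch of the final check above: the main-loop VF,
// scaled by the tuning vscale when scalable, must reach a minimum width for
// an epilogue to be considered. The threshold value is an assumption (it is
// a command-line option of the pass).
static bool epilogueProfitable(unsigned KnownMinVF, bool Scalable,
                               unsigned VScaleForTuning,
                               unsigned MinProfitableVF /* e.g. 16 */) {
  unsigned Estimated = Scalable ? KnownMinVF * VScaleForTuning : KnownMinVF;
  return Estimated >= MinProfitableVF;
}
// e.g. a main-loop VF of vscale x 4 with an expected vscale of 2 behaves like
// 8 lanes and would fail a threshold of 16.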
5016
5018 const ElementCount MainLoopVF, unsigned IC) {
5021 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5022 return Result;
5023 }
5024
5025 if (!CM.isScalarEpilogueAllowed()) {
5026 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5027 "epilogue is allowed.\n");
5028 return Result;
5029 }
5030
5031 // Not really a cost consideration, but check for unsupported cases here to
5032 // simplify the logic.
5033 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5034 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5035 "is not a supported candidate.\n");
5036 return Result;
5037 }
5038
5040 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5042 if (hasPlanWithVF(ForcedEC))
5043 return {ForcedEC, 0, 0};
5044 else {
5045 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5046 "viable.\n");
5047 return Result;
5048 }
5049 }
5050
5051 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5052 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5053 LLVM_DEBUG(
5054 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5055 return Result;
5056 }
5057
5058 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5059 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5060 "this loop\n");
5061 return Result;
5062 }
5063
5064 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5065 // the main loop handles 8 lanes per iteration. We could still benefit from
5066 // vectorizing the epilogue loop with VF=4.
5067 ElementCount EstimatedRuntimeVF = MainLoopVF;
5068 if (MainLoopVF.isScalable()) {
5069 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5070 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5071 EstimatedRuntimeVF *= *VScale;
5072 }
5073
5074 ScalarEvolution &SE = *PSE.getSE();
5075 Type *TCType = Legal->getWidestInductionType();
5076 const SCEV *RemainingIterations = nullptr;
5077 for (auto &NextVF : ProfitableVFs) {
5078 // Skip candidate VFs without a corresponding VPlan.
5079 if (!hasPlanWithVF(NextVF.Width))
5080 continue;
5081
5082 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5083 // vectors) or the VF of the main loop (fixed vectors).
5084 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5085 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5086 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5087 continue;
5088
5089 // If NextVF is greater than the number of remaining iterations, the
5090 // epilogue loop would be dead. Skip such factors.
5091 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5092 // TODO: extend to support scalable VFs.
5093 if (!RemainingIterations) {
5094 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5095 RemainingIterations = SE.getURemExpr(
5096 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5097 }
5098 if (SE.isKnownPredicate(
5100 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5101 RemainingIterations))
5102 continue;
5103 }
5104
5105 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5106 Result = NextVF;
5107 }
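// A minimal standalone sketch of the dead-epilogue filter above, assuming
// fixed-width VFs and a compile-time trip count:
#include <cstdint>

static bool epilogueVFIsViable(uint64_t TripCount, uint64_t MainVF,
                               uint64_t IC, uint64_t EpilogueVF) {
  // Iterations left over after the main vector loop.
  uint64_t Remaining = TripCount % (MainVF * IC);
  return EpilogueVF <= Remaining;
}
// e.g. TripCount = 100, MainVF = 16, IC = 2 leaves 100 % 32 = 4 iterations,
// so an epilogue VF of 8 would never execute and is skipped.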
5108
5109 if (Result != VectorizationFactor::Disabled())
5110 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5111 << Result.Width << "\n");
5112 return Result;
5113}
5114
5115std::pair<unsigned, unsigned>
5117 unsigned MinWidth = -1U;
5118 unsigned MaxWidth = 8;
5120 // For in-loop reductions, no element types are added to ElementTypesInLoop
5121 // if there are no loads/stores in the loop. In this case, check through the
5122 // reduction variables to determine the maximum width.
5123 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5124 // Reset MaxWidth so that we can find the smallest type used by recurrences
5125 // in the loop.
5126 MaxWidth = -1U;
5127 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5128 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5129 // When finding the min width used by the recurrence we need to account
5130 // for casts on the input operands of the recurrence.
5131 MaxWidth = std::min<unsigned>(
5132 MaxWidth, std::min<unsigned>(
5135 }
5136 } else {
5137 for (Type *T : ElementTypesInLoop) {
5138 MinWidth = std::min<unsigned>(
5139 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5140 MaxWidth = std::max<unsigned>(
5141 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5142 }
5143 }
5144 return {MinWidth, MaxWidth};
5145}
5146
5148 ElementTypesInLoop.clear();
5149 // For each block.
5150 for (BasicBlock *BB : TheLoop->blocks()) {
5151 // For each instruction in the loop.
5152 for (Instruction &I : BB->instructionsWithoutDebug()) {
5153 Type *T = I.getType();
5154
5155 // Skip ignored values.
5156 if (ValuesToIgnore.count(&I))
5157 continue;
5158
5159 // Only examine Loads, Stores and PHINodes.
5160 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5161 continue;
5162
5163 // Examine PHI nodes that are reduction variables. Update the type to
5164 // account for the recurrence type.
5165 if (auto *PN = dyn_cast<PHINode>(&I)) {
5166 if (!Legal->isReductionVariable(PN))
5167 continue;
5168 const RecurrenceDescriptor &RdxDesc =
5169 Legal->getReductionVars().find(PN)->second;
5172 RdxDesc.getRecurrenceType(),
5174 continue;
5175 T = RdxDesc.getRecurrenceType();
5176 }
5177
5178 // Examine the stored values.
5179 if (auto *ST = dyn_cast<StoreInst>(&I))
5180 T = ST->getValueOperand()->getType();
5181
5182 assert(T->isSized() &&
5183 "Expected the load/store/recurrence type to be sized");
5184
5185 ElementTypesInLoop.insert(T);
5186 }
5187 }
5188}
5189
5190unsigned
5192 InstructionCost LoopCost) {
5193 // -- The interleave heuristics --
5194 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5195 // There are many micro-architectural considerations that we can't predict
5196 // at this level. For example, frontend pressure (on decode or fetch) due to
5197 // code size, or the number and capabilities of the execution ports.
5198 //
5199 // We use the following heuristics to select the interleave count:
5200 // 1. If the code has reductions, then we interleave to break the cross
5201 // iteration dependency.
5202 // 2. If the loop is really small, then we interleave to reduce the loop
5203 // overhead.
5204 // 3. We don't interleave if we think that we will spill registers to memory
5205 // due to the increased register pressure.
5206
5208 return 1;
5209
5210 // Do not interleave if EVL is preferred and no User IC is specified.
5211 if (foldTailWithEVL()) {
5212 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5213 "Unroll factor forced to be 1.\n");
5214 return 1;
5215 }
5216
5217 // We used the distance for the interleave count.
5219 return 1;
5220
5221 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5222 const bool HasReductions = !Legal->getReductionVars().empty();
5223
5224 // If we did not calculate the cost for VF (because the user selected the VF)
5225 // then we calculate the cost of VF here.
5226 if (LoopCost == 0) {
5227 LoopCost = expectedCost(VF).first;
5228 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5229
5230 // Loop body is free and there is no need for interleaving.
5231 if (LoopCost == 0)
5232 return 1;
5233 }
5234
5236 // We divide by these constants so assume that we have at least one
5237 // instruction that uses at least one register.
5238 for (auto& pair : R.MaxLocalUsers) {
5239 pair.second = std::max(pair.second, 1U);
5240 }
5241
5242 // We calculate the interleave count using the following formula.
5243 // Subtract the number of loop invariants from the number of available
5244 // registers. These registers are used by all of the interleaved instances.
5245 // Next, divide the remaining registers by the number of registers that is
5246 // required by the loop, in order to estimate how many parallel instances
5247 // fit without causing spills. All of this is rounded down if necessary to be
5248 // a power of two. We want power of two interleave count to simplify any
5249 // addressing operations or alignment considerations.
5250 // We also want power of two interleave counts to ensure that the induction
5251 // variable of the vector loop wraps to zero, when tail is folded by masking;
5252 // this currently happens when OptForSize, in which case IC is set to 1 above.
5253 unsigned IC = UINT_MAX;
5254
5255 for (auto& pair : R.MaxLocalUsers) {
5256 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5257 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5258 << " registers of "
5259 << TTI.getRegisterClassName(pair.first) << " register class\n");
5260 if (VF.isScalar()) {
5261 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5262 TargetNumRegisters = ForceTargetNumScalarRegs;
5263 } else {
5264 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5265 TargetNumRegisters = ForceTargetNumVectorRegs;
5266 }
5267 unsigned MaxLocalUsers = pair.second;
5268 unsigned LoopInvariantRegs = 0;
5269 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5270 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5271
5272 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5273 MaxLocalUsers);
5274 // Don't count the induction variable as interleaved.
5276 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5277 std::max(1U, (MaxLocalUsers - 1)));
5278 }
5279
5280 IC = std::min(IC, TmpIC);
5281 }
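// A minimal standalone sketch of the per-register-class formula above,
// assuming the loop invariants do not exhaust the register file; the flag
// overrides and the induction-variable refinement are omitted. C++20.
#include <algorithm>
#include <bit>

static unsigned interleaveCountForClass(unsigned TargetNumRegisters,
                                        unsigned LoopInvariantRegs,
                                        unsigned MaxLocalUsers) {
  // Registers left after the invariants are shared by all interleaved copies;
  // divide them among the per-iteration users and round down to a power of two.
  return std::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                        std::max(1u, MaxLocalUsers));
}
// e.g. 32 registers, 4 held by invariants, 6 live values per iteration:
// bit_floor(28 / 6) = bit_floor(4) = 4 interleaved copies.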
5282
5283 // Clamp the interleave ranges to reasonable counts.
5284 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5285
5286 // Check if the user has overridden the max.
5287 if (VF.isScalar()) {
5288 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5289 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5290 } else {
5291 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5292 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5293 }
5294
5295 unsigned EstimatedVF = VF.getKnownMinValue();
5296 if (VF.isScalable()) {
5297 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5298 EstimatedVF *= *VScale;
5299 }
5300 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5301
5302 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5303 if (KnownTC > 0) {
5304 // At least one iteration must be scalar when this constraint holds. So the
5305 // maximum available iterations for interleaving is one less.
5306 unsigned AvailableTC =
5307 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5308
5309 // If trip count is known we select between two prospective ICs, where
5310 // 1) the aggressive IC is capped by the trip count divided by VF
5311 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5312 // The final IC is selected in a way that the epilogue loop trip count is
5313 // minimized while maximizing the IC itself, so that we either run the
5314 // vector loop at least once if it generates a small epilogue loop, or else
5315 // we run the vector loop at least twice.
5316
5317 unsigned InterleaveCountUB = bit_floor(
5318 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5319 unsigned InterleaveCountLB = bit_floor(std::max(
5320 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5321 MaxInterleaveCount = InterleaveCountLB;
5322
5323 if (InterleaveCountUB != InterleaveCountLB) {
5324 unsigned TailTripCountUB =
5325 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5326 unsigned TailTripCountLB =
5327 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5328 // If both produce the same scalar tail, maximize the IC to do the same work
5329 // in fewer vector loop iterations.
5330 if (TailTripCountUB == TailTripCountLB)
5331 MaxInterleaveCount = InterleaveCountUB;
5332 }
5333 } else if (BestKnownTC && *BestKnownTC > 0) {
5334 // At least one iteration must be scalar when this constraint holds. So the
5335 // maximum available iterations for interleaving is one less.
5336 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5337 ? (*BestKnownTC) - 1
5338 : *BestKnownTC;
5339
5340 // If trip count is an estimated compile time constant, limit the
5341 // IC to be capped by the trip count divided by VF * 2, such that the vector
5342 // loop runs at least twice to make interleaving seem profitable when there
5343 // is an epilogue loop present. Since exact Trip count is not known we
5344 // choose to be conservative in our IC estimate.
5345 MaxInterleaveCount = bit_floor(std::max(
5346 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5347 }
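// A minimal standalone sketch of the trip-count clamp above for the
// exact-trip-count case (the estimated case keeps only the conservative
// lower bound). C++20; the example values are assumptions.
#include <algorithm>
#include <bit>

static unsigned clampICToTripCount(unsigned AvailableTC, unsigned EstimatedVF,
                                   unsigned MaxInterleaveCount) {
  unsigned UB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
  unsigned LB = std::bit_floor(
      std::max(1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
  // Prefer the aggressive bound only when it leaves the same scalar tail.
  if (UB != LB &&
      AvailableTC % (EstimatedVF * UB) == AvailableTC % (EstimatedVF * LB))
    return UB;
  return LB;
}
// e.g. AvailableTC = 64, EstimatedVF = 8, target max 8: UB = 8, LB = 4, both
// leave no tail, so the more aggressive count of 8 is kept.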
5348
5349 assert(MaxInterleaveCount > 0 &&
5350 "Maximum interleave count must be greater than 0");
5351
5352 // Clamp the calculated IC to be between 1 and the max interleave count
5353 // that the target and trip count allows.
5354 if (IC > MaxInterleaveCount)
5355 IC = MaxInterleaveCount;
5356 else
5357 // Make sure IC is greater than 0.
5358 IC = std::max(1u, IC);
5359
5360 assert(IC > 0 && "Interleave count must be greater than 0.");
5361
5362 // Interleave if we vectorized this loop and there is a reduction that could
5363 // benefit from interleaving.
5364 if (VF.isVector() && HasReductions) {
5365 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5366 return IC;
5367 }
5368
5369 // For any scalar loop that either requires runtime checks or predication we
5370 // are better off leaving this to the unroller. Note that if we've already
5371 // vectorized the loop we will have done the runtime check and so interleaving
5372 // won't require further checks.
5373 bool ScalarInterleavingRequiresPredication =
5374 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5375 return Legal->blockNeedsPredication(BB);
5376 }));
5377 bool ScalarInterleavingRequiresRuntimePointerCheck =
5379
5380 // We want to interleave small loops in order to reduce the loop overhead and
5381 // potentially expose ILP opportunities.
5382 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5383 << "LV: IC is " << IC << '\n'
5384 << "LV: VF is " << VF << '\n');
5385 const bool AggressivelyInterleaveReductions =
5386 TTI.enableAggressiveInterleaving(HasReductions);
5387 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5388 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5389 // We assume that the cost overhead is 1 and we use the cost model
5390 // to estimate the cost of the loop and interleave until the cost of the
5391 // loop overhead is about 5% of the cost of the loop.
5392 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5393 SmallLoopCost / *LoopCost.getValue()));
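// A minimal standalone sketch of the small-loop refinement above: interleave
// until the (assumed unit) loop overhead is a small fraction of the body
// cost. SmallLoopCost stands for the pass's small-loop threshold; the value
// used in the example is an assumption. C++20.
#include <algorithm>
#include <bit>
#include <cassert>
#include <cstdint>

static unsigned smallLoopIC(unsigned IC, uint64_t LoopCost,
                            uint64_t SmallLoopCost /* e.g. 20 */) {
  assert(LoopCost > 0 && "loop body cost must be positive");
  // Callers only reach this when LoopCost < SmallLoopCost, so Limit >= 1.
  uint64_t Limit = std::bit_floor(SmallLoopCost / LoopCost);
  return static_cast<unsigned>(std::min<uint64_t>(IC, Limit));
}
// e.g. a body costing 5 with a threshold of 20 allows bit_floor(4) = 4
// copies, provided the register-pressure IC above permits it.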
5394
5395 // Interleave until store/load ports (estimated by max interleave count) are
5396 // saturated.
5397 unsigned NumStores = Legal->getNumStores();
5398 unsigned NumLoads = Legal->getNumLoads();
5399 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5400 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5401
5402 // There is little point in interleaving for reductions containing selects
5403 // and compares when VF=1 since it may just create more overhead than it's
5404 // worth for loops with small trip counts. This is because we still have to
5405 // do the final reduction after the loop.
5406 bool HasSelectCmpReductions =
5407 HasReductions &&
5408 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5409 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5410 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5411 RdxDesc.getRecurrenceKind());
5412 });
5413 if (HasSelectCmpReductions) {
5414 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5415 return 1;
5416 }
5417
5418 // If we have a scalar reduction (vector reductions are already dealt with
5419 // by this point), we can increase the critical path length if the loop
5420 // we're interleaving is inside another loop. For tree-wise reductions
5421 // set the limit to 2, and for ordered reductions it's best to disable
5422 // interleaving entirely.
5423 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5424 bool HasOrderedReductions =
5425 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5426 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5427 return RdxDesc.isOrdered();
5428 });
5429 if (HasOrderedReductions) {
5430 LLVM_DEBUG(
5431 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5432 return 1;
5433 }
5434
5435 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5436 SmallIC = std::min(SmallIC, F);
5437 StoresIC = std::min(StoresIC, F);
5438 LoadsIC = std::min(LoadsIC, F);
5439 }
5440
5442 std::max(StoresIC, LoadsIC) > SmallIC) {
5443 LLVM_DEBUG(
5444 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5445 return std::max(StoresIC, LoadsIC);
5446 }
5447
5448 // If there are scalar reductions and TTI has enabled aggressive
5449 // interleaving for reductions, we will interleave to expose ILP.
5450 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5451 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5452 // Interleave no less than SmallIC but not as aggressive as the normal IC
5453 // to satisfy the rare situation when resources are too limited.
5454 return std::max(IC / 2, SmallIC);
5455 } else {
5456 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5457 return SmallIC;
5458 }
5459 }
5460
5461 // Interleave if this is a large loop (small loops are already dealt with by
5462 // this point) that could benefit from interleaving.
5463 if (AggressivelyInterleaveReductions) {
5464 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5465 return IC;
5466 }
5467
5468 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5469 return 1;
5470}
5471
5474 // This function calculates the register usage by measuring the highest number
5475 // of values that are alive at a single location. Obviously, this is a very
5476 // rough estimation. We scan the loop in topological order and
5477 // assign a number to each instruction. We use RPO to ensure that defs are
5478 // met before their users. We assume that each instruction that has in-loop
5479 // users starts an interval. We record every time that an in-loop value is
5480 // used, so we have a list of the first and last occurrences of each
5481 // instruction. Next, we transpose this data structure into a multi map that
5482 // holds the list of intervals that *end* at a specific location. This multi
5483 // map allows us to perform a linear search. We scan the instructions linearly
5484 // and record each time that a new interval starts, by placing it in a set.
5485 // If we find this value in the multi-map then we remove it from the set.
5486 // The max register usage is the maximum size of the set.
5487 // We also search for instructions that are defined outside the loop, but are
5488 // used inside the loop. We need this number separately from the max-interval
5489 // usage number because when we unroll, loop-invariant values do not take
5490 // more registers.
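// A minimal standalone sketch of the interval bookkeeping described above:
// given per-value (first use, last use) indices, the register pressure for a
// class is the peak number of simultaneously live values. Plain indices stand
// in for instructions.
#include <algorithm>
#include <utility>
#include <vector>

static unsigned maxLiveValues(
    const std::vector<std::pair<unsigned, unsigned>> &Intervals,
    unsigned NumPoints) {
  unsigned Max = 0;
  for (unsigned Idx = 0; Idx < NumPoints; ++Idx) {
    unsigned Open = 0;
    for (const auto &[Start, End] : Intervals)
      if (Start <= Idx && Idx <= End)
        ++Open;                       // interval is live at this position
    Max = std::max(Max, Open);
  }
  return Max;
}
// e.g. intervals {0,3}, {1,2}, {2,5} all overlap at index 2, so the estimate
// is 3 registers for that class (loop invariants are counted separately).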
5492 DFS.perform(LI);
5493
5494 RegisterUsage RU;
5495
5496 // Each 'key' in the map opens a new interval. The values
5497 // of the map are the index of the 'last seen' usage of the
5498 // instruction that is the key.
5500
5501 // Maps instruction to its index.
5503 // Marks the end of each interval.
5504 IntervalMap EndPoint;
5505 // Saves the list of instruction indices that are used in the loop.
5507 // Saves the list of values that are used in the loop but are defined outside
5508 // the loop (not including non-instruction values such as arguments and
5509 // constants).
5510 SmallSetVector<Instruction *, 8> LoopInvariants;
5511
5512 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5513 for (Instruction &I : BB->instructionsWithoutDebug()) {
5514 IdxToInstr.push_back(&I);
5515
5516 // Save the end location of each USE.
5517 for (Value *U : I.operands()) {
5518 auto *Instr = dyn_cast<Instruction>(U);
5519
5520 // Ignore non-instruction values such as arguments, constants, etc.
5521 // FIXME: Might need some motivation why these values are ignored. If
5522 // for example an argument is used inside the loop it will increase the
5523 // register pressure (so shouldn't we add it to LoopInvariants).
5524 if (!Instr)
5525 continue;
5526
5527 // If this instruction is outside the loop then record it and continue.
5528 if (!TheLoop->contains(Instr)) {
5529 LoopInvariants.insert(Instr);
5530 continue;
5531 }
5532
5533 // Overwrite previous end points.
5534 EndPoint[Instr] = IdxToInstr.size();
5535 Ends.insert(Instr);
5536 }
5537 }
5538 }
5539
5540 // Saves the list of intervals that end with the index in 'key'.
5541 using InstrList = SmallVector<Instruction *, 2>;
5542 DenseMap<unsigned, InstrList> TransposeEnds;
5543
5544 // Transpose the EndPoints to a list of values that end at each index.
5545 for (auto &Interval : EndPoint)
5546 TransposeEnds[Interval.second].push_back(Interval.first);
5547
5548 SmallPtrSet<Instruction *, 8> OpenIntervals;
5551
5552 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5553
5554 const auto &TTICapture = TTI;
5555 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5556 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5557 return 0;
5558 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5559 };
5560
5561 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5562 Instruction *I = IdxToInstr[i];
5563
5564 // Remove all of the instructions that end at this location.
5565 InstrList &List = TransposeEnds[i];
5566 for (Instruction *ToRemove : List)
5567 OpenIntervals.erase(ToRemove);
5568
5569 // Ignore instructions that are never used within the loop.
5570 if (!Ends.count(I))
5571 continue;
5572
5573 // Skip ignored values.
5574 if (ValuesToIgnore.count(I))
5575 continue;
5576
5578
5579 // For each VF find the maximum usage of registers.
5580 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5581 // Count the number of registers used, per register class, given all open
5582 // intervals.
5583 // Note that elements in this SmallMapVector will be default constructed
5584 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5585 // there is no previous entry for ClassID.
5587
5588 if (VFs[j].isScalar()) {
5589 for (auto *Inst : OpenIntervals) {
5590 unsigned ClassID =
5591 TTI.getRegisterClassForType(false, Inst->getType());
5592 // FIXME: The target might use more than one register for the type
5593 // even in the scalar case.
5594 RegUsage[ClassID] += 1;
5595 }
5596 } else {
5598 for (auto *Inst : OpenIntervals) {
5599 // Skip ignored values for VF > 1.
5600 if (VecValuesToIgnore.count(Inst))
5601 continue;
5602 if (isScalarAfterVectorization(Inst, VFs[j])) {
5603 unsigned ClassID =
5604 TTI.getRegisterClassForType(false, Inst->getType());
5605 // FIXME: The target might use more than one register for the type
5606 // even in the scalar case.
5607 RegUsage[ClassID] += 1;
5608 } else {
5609 unsigned ClassID =
5610 TTI.getRegisterClassForType(true, Inst->getType());
5611 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5612 }
5613 }
5614 }
5615
5616 for (auto& pair : RegUsage) {
5617 auto &Entry = MaxUsages[j][pair.first];
5618 Entry = std::max(Entry, pair.second);
5619 }
5620 }
5621
5622 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5623 << OpenIntervals.size() << '\n');
5624
5625 // Add the current instruction to the list of open intervals.
5626 OpenIntervals.insert(I);
5627 }
5628
5629 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5630 // Note that elements in this SmallMapVector will be default constructed
5631 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5632 // there is no previous entry for ClassID.
5634
5635 for (auto *Inst : LoopInvariants) {
5636 // FIXME: The target might use more than one register for the type
5637 // even in the scalar case.
5638 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5639 auto *I = cast<Instruction>(U);
5640 return TheLoop != LI->getLoopFor(I->getParent()) ||
5641 isScalarAfterVectorization(I, VFs[i]);
5642 });
5643
5644 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5645 unsigned ClassID =
5646 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5647 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5648 }
5649
5650 LLVM_DEBUG({
5651 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5652 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5653 << " item\n";
5654 for (const auto &pair : MaxUsages[i]) {
5655 dbgs() << "LV(REG): RegisterClass: "
5656 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5657 << " registers\n";
5658 }
5659 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5660 << " item\n";
5661 for (const auto &pair : Invariant) {
5662 dbgs() << "LV(REG): RegisterClass: "
5663 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5664 << " registers\n";
5665 }
5666 });
5667
5668 RU.LoopInvariantRegs = Invariant;
5669 RU.MaxLocalUsers = MaxUsages[i];
5670 RUs[i] = RU;
5671 }
5672
5673 return RUs;
5674}
5675
5676bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5677 ElementCount VF) {
5678 // TODO: Cost model for emulated masked load/store is completely
5679 // broken. This hack guides the cost model to use an artificially
5680 // high enough value to practically disable vectorization with such
5681 // operations, except where previously deployed legality hack allowed
5682 // using very low cost values. This is to avoid regressions coming simply
5683 // from moving "masked load/store" check from legality to cost model.
5684 // Masked Load/Gather emulation was previously never allowed.
5685 // Only a limited amount of Masked Store/Scatter emulation was allowed.
5687 "Expecting a scalar emulated instruction");
5688 return isa<LoadInst>(I) ||
5689 (isa<StoreInst>(I) &&
5690 NumPredStores > NumberOfStoresToPredicate);
5691}
5692
5694 // If we aren't vectorizing the loop, or if we've already collected the
5695 // instructions to scalarize, there's nothing to do. Collection may already
5696 // have occurred if we have a user-selected VF and are now computing the
5697 // expected cost for interleaving.
5698 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5699 return;
5700
5701 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5702 // not profitable to scalarize any instructions, the presence of VF in the
5703 // map will indicate that we've analyzed it already.
5704 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5705
5706 PredicatedBBsAfterVectorization[VF].clear();
5707
5708 // Find all the instructions that are scalar with predication in the loop and
5709 // determine if it would be better to not if-convert the blocks they are in.
5710 // If so, we also record the instructions to scalarize.
5711 for (BasicBlock *BB : TheLoop->blocks()) {
5713 continue;
5714 for (Instruction &I : *BB)
5715 if (isScalarWithPredication(&I, VF)) {
5716 ScalarCostsTy ScalarCosts;
5717 // Do not apply discount logic for:
5718 // 1. Scalars after vectorization, as there will only be a single copy
5719 // of the instruction.
5720 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5721 // 3. Emulated masked memrefs, if a hacked cost is needed.
5722 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5723 !useEmulatedMaskMemRefHack(&I, VF) &&
5724 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5725 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5726 // Remember that BB will remain after vectorization.
5727 PredicatedBBsAfterVectorization[VF].insert(BB);
5728 for (auto *Pred : predecessors(BB)) {
5729 if (Pred->getSingleSuccessor() == BB)
5730 PredicatedBBsAfterVectorization[VF].insert(Pred);
5731 }
5732 }
5733 }
5734}
5735
5736InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5737 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5738 assert(!isUniformAfterVectorization(PredInst, VF) &&
5739 "Instruction marked uniform-after-vectorization will be predicated");
5740
5741 // Initialize the discount to zero, meaning that the scalar version and the
5742 // vector version cost the same.
5743 InstructionCost Discount = 0;
5744
5745 // Holds instructions to analyze. The instructions we visit are mapped in
5746 // ScalarCosts. Those instructions are the ones that would be scalarized if
5747 // we find that the scalar version costs less.
5749
5750 // Returns true if the given instruction can be scalarized.
5751 auto canBeScalarized = [&](Instruction *I) -> bool {
5752 // We only attempt to scalarize instructions forming a single-use chain
5753 // from the original predicated block that would otherwise be vectorized.
5754 // Although not strictly necessary, we give up on instructions we know will
5755 // already be scalar to avoid traversing chains that are unlikely to be
5756 // beneficial.
5757 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5759 return false;
5760
5761 // If the instruction is scalar with predication, it will be analyzed
5762 // separately. We ignore it within the context of PredInst.
5763 if (isScalarWithPredication(I, VF))
5764 return false;
5765
5766 // If any of the instruction's operands are uniform after vectorization,
5767 // the instruction cannot be scalarized. This prevents, for example, a
5768 // masked load from being scalarized.
5769 //
5770 // We assume we will only emit a value for lane zero of an instruction
5771 // marked uniform after vectorization, rather than VF identical values.
5772 // Thus, if we scalarize an instruction that uses a uniform, we would
5773 // create uses of values corresponding to the lanes we aren't emitting code
5774 // for. This behavior can be changed by allowing getScalarValue to clone
5775 // the lane zero values for uniforms rather than asserting.
5776 for (Use &U : I->operands())
5777 if (auto *J = dyn_cast<Instruction>(U.get()))
5778 if (isUniformAfterVectorization(J, VF))
5779 return false;
5780
5781 // Otherwise, we can scalarize the instruction.
5782 return true;
5783 };
5784
5785 // Compute the expected cost discount from scalarizing the entire expression
5786 // feeding the predicated instruction. We currently only consider expressions
5787 // that are single-use instruction chains.
5788 Worklist.push_back(PredInst);
5789 while (!Worklist.empty()) {
5790 Instruction *I = Worklist.pop_back_val();
5791
5792 // If we've already analyzed the instruction, there's nothing to do.
5793 if (ScalarCosts.contains(I))
5794 continue;
5795
5796 // Compute the cost of the vector instruction. Note that this cost already
5797 // includes the scalarization overhead of the predicated instruction.
5798 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5799
5800 // Compute the cost of the scalarized instruction. This cost is the cost of
5801 // the instruction as if it wasn't if-converted and instead remained in the
5802 // predicated block. We will scale this cost by block probability after
5803 // computing the scalarization overhead.
5804 InstructionCost ScalarCost =
5805 VF.getFixedValue() *
5806 getInstructionCost(I, ElementCount::getFixed(1)).first;
5807
5808 // Compute the scalarization overhead of needed insertelement instructions
5809 // and phi nodes.
5811 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5812 ScalarCost += TTI.getScalarizationOverhead(
5813 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5814 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5815 /*Extract*/ false, CostKind);
5816 ScalarCost +=
5817 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5818 }
5819
5820 // Compute the scalarization overhead of needed extractelement
5821 // instructions. For each of the instruction's operands, if the operand can
5822 // be scalarized, add it to the worklist; otherwise, account for the
5823 // overhead.
5824 for (Use &U : I->operands())
5825 if (auto *J = dyn_cast<Instruction>(U.get())) {
5826 assert(VectorType::isValidElementType(J->getType()) &&
5827 "Instruction has non-scalar type");
5828 if (canBeScalarized(J))
5829 Worklist.push_back(J);
5830 else if (needsExtract(J, VF)) {
5831 ScalarCost += TTI.getScalarizationOverhead(
5832 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5833 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5834 /*Extract*/ true, CostKind);
5835 }
5836 }
5837
5838 // Scale the total scalar cost by block probability.
5839 ScalarCost /= getReciprocalPredBlockProb();
5840
5841 // Compute the discount. A non-negative discount means the vector version
5842 // of the instruction costs more, and scalarizing would be beneficial.
5843 Discount += VectorCost - ScalarCost;
5844 ScalarCosts[I] = ScalarCost;
5845 }
5846
5847 return Discount;
5848}
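// A minimal standalone sketch of the discount bookkeeping above: the scalar
// cost of the predicated chain is divided by the reciprocal block probability
// (the block is assumed to run roughly every other iteration), and the
// discount is the difference against the vectorized cost. The probability
// value is an assumption.
#include <cstdint>

static int64_t predInstDiscount(int64_t VectorCost, int64_t ScalarCost,
                                int64_t ReciprocalPredBlockProb /* e.g. 2 */) {
  int64_t ScaledScalarCost = ScalarCost / ReciprocalPredBlockProb;
  // Non-negative: scalarizing the chain is at least as cheap as keeping it
  // vectorized and predicated.
  return VectorCost - ScaledScalarCost;
}
// e.g. VectorCost = 12, ScalarCost = 16 with probability 1/2 gives a discount
// of 12 - 8 = 4, so the chain is scalarized.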
5849
5854
5855 // For each block.
5856 for (BasicBlock *BB : TheLoop->blocks()) {
5857 VectorizationCostTy BlockCost;
5858
5859 // For each instruction in the old loop.
5860 for (Instruction &I : BB->instructionsWithoutDebug()) {
5861 // Skip ignored values.
5862 if (ValuesToIgnore.count(&I) ||
5863 (VF.isVector() && VecValuesToIgnore.count(&I)))
5864 continue;
5865
5866 VectorizationCostTy C = getInstructionCost(&I, VF);
5867
5868 // Check if we should override the cost.
5869 if (C.first.isValid() &&
5870 ForceTargetInstructionCost.getNumOccurrences() > 0)
5872
5873 // Keep a list of instructions with invalid costs.
5874 if (Invalid && !C.first.isValid())
5875 Invalid->emplace_back(&I, VF);
5876
5877 BlockCost.first += C.first;
5878 BlockCost.second |= C.second;
5879 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5880 << " for VF " << VF << " For instruction: " << I
5881 << '\n');
5882 }
5883
5884 // If we are vectorizing a predicated block, it will have been
5885 // if-converted. This means that the block's instructions (aside from
5886 // stores and instructions that may divide by zero) will now be
5887 // unconditionally executed. For the scalar case, we may not always execute
5888 // the predicated block, if it is an if-else block. Thus, scale the block's
5889 // cost by the probability of executing it. blockNeedsPredication from
5890 // Legal is used so as to not include all blocks in tail folded loops.
5891 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5892 BlockCost.first /= getReciprocalPredBlockProb();
5893
5894 Cost.first += BlockCost.first;
5895 Cost.second |= BlockCost.second;
5896 }
5897
5898 return Cost;
5899}
5900
5901/// Gets Address Access SCEV after verifying that the access pattern
5902 /// is loop invariant except for the induction variable dependence.
5903///
5904/// This SCEV can be sent to the Target in order to estimate the address
5905/// calculation cost.
5907 Value *Ptr,
5910 const Loop *TheLoop) {
5911
5912 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5913 if (!Gep)
5914 return nullptr;
5915
5916 // We are looking for a gep with all loop invariant indices except for one
5917 // which should be an induction variable.
5918 auto SE = PSE.getSE();
5919 unsigned NumOperands = Gep->getNumOperands();
5920 for (unsigned i = 1; i < NumOperands; ++i) {
5921 Value *Opd = Gep->getOperand(i);
5922 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5923 !Legal->isInductionVariable(Opd))
5924 return nullptr;
5925 }
5926
5927 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5928 return PSE.getSCEV(Ptr);
5929}
5930
5932LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5933 ElementCount VF) {
5934 assert(VF.isVector() &&
5935 "Scalarization cost of instruction implies vectorization.");
5936 if (VF.isScalable())
5938
5939 Type *ValTy = getLoadStoreType(I);
5940 auto SE = PSE.getSE();
5941
5942 unsigned AS = getLoadStoreAddressSpace(I);
5944 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5945 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5946 // that it is being called from this specific place.
5947
5948 // Figure out whether the access is strided and get the stride value
5949 // if it's known at compile time.
5950 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5951
5952 // Get the cost of the scalar memory instruction and address computation.
5954 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5955
5956 // Don't pass *I here, since it is scalar but will actually be part of a
5957 // vectorized loop where the user of it is a vectorized instruction.
5959 const Align Alignment = getLoadStoreAlignment(I);
5960 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5961 ValTy->getScalarType(),
5962 Alignment, AS, CostKind);
5963
5964 // Get the overhead of the extractelement and insertelement instructions
5965 // we might create due to scalarization.
5966 Cost += getScalarizationOverhead(I, VF, CostKind);
5967
5968 // If we have a predicated load/store, it will need extra i1 extracts and
5969 // conditional branches, but may not be executed for each vector lane. Scale
5970 // the cost by the probability of executing the predicated block.
5971 if (isPredicatedInst(I)) {
5973
5974 // Add the cost of an i1 extract and a branch
5975 auto *Vec_i1Ty =
5978 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5979 /*Insert=*/false, /*Extract=*/true, CostKind);
5980 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5981
5982 if (useEmulatedMaskMemRefHack(I, VF))
5983 // Artificially setting to a high enough value to practically disable
5984 // vectorization with such operations.
5985 Cost = 3000000;
5986 }
5987
5988 return Cost;
5989}
5990
5992LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5993 ElementCount VF) {
5994 Type *ValTy = getLoadStoreType(I);
5995 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5997 unsigned AS = getLoadStoreAddressSpace(I);
5998 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6000
6001 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6002 "Stride should be 1 or -1 for consecutive memory access");
6003 const Align Alignment = getLoadStoreAlignment(I);
6005 if (Legal->isMaskRequired(I)) {
6006 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6007 CostKind);
6008 } else {
6009 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6010 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6011 CostKind, OpInfo, I);
6012 }
6013
6014 bool Reverse = ConsecutiveStride < 0;
6015 if (Reverse)
6017 std::nullopt, CostKind, 0);
6018 return Cost;
6019}
6020
6022LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6023 ElementCount VF) {
6024 assert(Legal->isUniformMemOp(*I, VF));
6025
6026 Type *ValTy = getLoadStoreType(I);
6027 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6028 const Align Alignment = getLoadStoreAlignment(I);
6029 unsigned AS = getLoadStoreAddressSpace(I);
6031 if (isa<LoadInst>(I)) {
6032 return TTI.getAddressComputationCost(ValTy) +
6033 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6034 CostKind) +
6036 }
6037 StoreInst *SI = cast<StoreInst>(I);
6038
6039 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6040 return TTI.getAddressComputationCost(ValTy) +
6041 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6042 CostKind) +
6043 (isLoopInvariantStoreValue
6044 ? 0
6045 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6046 CostKind, VF.getKnownMinValue() - 1));
6047}
6048
6050LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6051 ElementCount VF) {
6052 Type *ValTy = getLoadStoreType(I);
6053 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6054 const Align Alignment = getLoadStoreAlignment(I);
6056
6057 return TTI.getAddressComputationCost(VectorTy) +
6059 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6061}
6062
6064LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6065 ElementCount VF) {
6066 Type *ValTy = getLoadStoreType(I);
6067 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6068 unsigned AS = getLoadStoreAddressSpace(I);
6070
6071 auto Group = getInterleavedAccessGroup(I);
6072 assert(Group && "Fail to get an interleaved access group.");
6073
6074 unsigned InterleaveFactor = Group->getFactor();
6075 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6076
6077 // Holds the indices of existing members in the interleaved group.
6079 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6080 if (Group->getMember(IF))
6081 Indices.push_back(IF);
6082
6083 // Calculate the cost of the whole interleaved group.
6084 bool UseMaskForGaps =
6085 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6086 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6088 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6089 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6090
6091 if (Group->isReverse()) {
6092 // TODO: Add support for reversed masked interleaved access.
6094 "Reverse masked interleaved access not supported.");
6095 Cost += Group->getNumMembers() *
6097 std::nullopt, CostKind, 0);
6098 }
6099 return Cost;
6100}
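// [Editorial sketch, not part of LoopVectorize.cpp] Shape of the interleave
// group cost above: the whole group is priced as a single wide interleaved
// memory operation (possibly masked for gaps), and reversed groups
// additionally pay one vector-reverse shuffle per present member. The names
// below are simplified stand-ins for the real TTI hooks.
static unsigned interleaveGroupCostSketch(unsigned WideInterleavedOpCost,
                                          unsigned NumMembers,
                                          unsigned ReverseShuffleCost,
                                          bool IsReverse) {
  unsigned Cost = WideInterleavedOpCost;
  if (IsReverse)
    Cost += NumMembers * ReverseShuffleCost;
  return Cost;
}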
6101
6102std::optional<InstructionCost>
6103LoopVectorizationCostModel::getReductionPatternCost(
6104 Instruction *I, ElementCount VF, Type *Ty,
6106 using namespace llvm::PatternMatch;
6107 // Early exit for no inloop reductions
6108 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6109 return std::nullopt;
6110 auto *VectorTy = cast<VectorType>(Ty);
6111
6112 // We are looking for one of the following patterns, and finding the minimal acceptable cost:
6113 // reduce(mul(ext(A), ext(B))) or
6114 // reduce(mul(A, B)) or
6115 // reduce(ext(A)) or
6116 // reduce(A).
6117 // The basic idea is that we walk down the tree to do that, finding the root
6118 // reduction instruction in InLoopReductionImmediateChains. From there we find
6119 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6120 // of the components. If the reduction cost is lower, then we return it for the
6121 // reduction instruction and 0 for the other instructions in the pattern. If
6122 // it is not, we return an invalid cost specifying that the original cost method
6123 // should be used.
6124 Instruction *RetI = I;
6125 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6126 if (!RetI->hasOneUser())
6127 return std::nullopt;
6128 RetI = RetI->user_back();
6129 }
6130
6131 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6132 RetI->user_back()->getOpcode() == Instruction::Add) {
6133 RetI = RetI->user_back();
6134 }
6135
6136 // Test if the found instruction is a reduction, and if not, return an invalid
6137 // cost indicating that the parent should use the original cost modelling.
6138 if (!InLoopReductionImmediateChains.count(RetI))
6139 return std::nullopt;
6140
6141 // Find the reduction this chain is a part of and calculate the basic cost of
6142 // the reduction on its own.
6143 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6144 Instruction *ReductionPhi = LastChain;
6145 while (!isa<PHINode>(ReductionPhi))
6146 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6147
6148 const RecurrenceDescriptor &RdxDesc =
6149 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6150
6152 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6153
6154 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6155 // normal fmul instruction to the cost of the fadd reduction.
6156 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6157 BaseCost +=
6158 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6159
6160 // If we're using ordered reductions then we can just return the base cost
6161 // here, since getArithmeticReductionCost calculates the full ordered
6162 // reduction cost when FP reassociation is not allowed.
6163 if (useOrderedReductions(RdxDesc))
6164 return BaseCost;
6165
6166 // Get the operand that was not the reduction chain and match it to one of the
6167 // patterns, returning the better cost if it is found.
6168 Instruction *RedOp = RetI->getOperand(1) == LastChain
6169 ? dyn_cast<Instruction>(RetI->getOperand(0))
6170 : dyn_cast<Instruction>(RetI->getOperand(1));
6171
6172 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6173
6174 Instruction *Op0, *Op1;
6175 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6176 match(RedOp,
6178 match(Op0, m_ZExtOrSExt(m_Value())) &&
6179 Op0->getOpcode() == Op1->getOpcode() &&
6180 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6182 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6183
6184 // Matched reduce.add(ext(mul(ext(A), ext(B))))
6185 // Note that the extend opcodes need to all match, or if A==B they will have
6186 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6187 // which is equally fine.
6188 bool IsUnsigned = isa<ZExtInst>(Op0);
6189 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6190 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6191
6192 InstructionCost ExtCost =
6193 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6195 InstructionCost MulCost =
6196 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6197 InstructionCost Ext2Cost =
6198 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6200
6202 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6203
6204 if (RedCost.isValid() &&
6205 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6206 return I == RetI ? RedCost : 0;
6207 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6208 !TheLoop->isLoopInvariant(RedOp)) {
6209 // Matched reduce(ext(A))
6210 bool IsUnsigned = isa<ZExtInst>(RedOp);
6211 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6213 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6214 RdxDesc.getFastMathFlags(), CostKind);
6215
6216 InstructionCost ExtCost =
6217 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6219 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6220 return I == RetI ? RedCost : 0;
6221 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6222 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6223 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6224 Op0->getOpcode() == Op1->getOpcode() &&
6226 bool IsUnsigned = isa<ZExtInst>(Op0);
6227 Type *Op0Ty = Op0->getOperand(0)->getType();
6228 Type *Op1Ty = Op1->getOperand(0)->getType();
6229 Type *LargestOpTy =
6230 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6231 : Op0Ty;
6232 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6233
6234 // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
6235 // different sizes. We take the largest type as the ext to reduce, and add
6236 // the remaining ext cost as in, for example, reduce(mul(ext(ext(A)), ext(B))).
6238 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6241 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6243 InstructionCost MulCost =
6244 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6245
6247 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6248 InstructionCost ExtraExtCost = 0;
6249 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6250 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6251 ExtraExtCost = TTI.getCastInstrCost(
6252 ExtraExtOp->getOpcode(), ExtType,
6253 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6255 }
6256
6257 if (RedCost.isValid() &&
6258 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6259 return I == RetI ? RedCost : 0;
6260 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6261 // Matched reduce.add(mul())
6262 InstructionCost MulCost =
6263 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6264
6266 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6267
6268 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6269 return I == RetI ? RedCost : 0;
6270 }
6271 }
6272
6273 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6274}
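// [Editorial sketch, not part of LoopVectorize.cpp] The decision rule behind
// the pattern matching above, shown for reduce.add(mul(ext(A), ext(B))): if
// the target's fused extended-reduction cost beats the summed costs of the
// individual exts, the mul and the plain reduction, the fused cost is charged
// to the reduction instruction and zero to the matched feeding instructions;
// otherwise the caller falls back to per-instruction costing. Names are
// illustrative, not the real TTI hooks.
#include <optional>

static std::optional<unsigned>
reductionPatternCostSketch(bool QueryIsReductionInstr, unsigned FusedRedCost,
                           unsigned ExtCostA, unsigned ExtCostB,
                           unsigned MulCost, unsigned BaseRedCost) {
  if (FusedRedCost < ExtCostA + ExtCostB + MulCost + BaseRedCost)
    return QueryIsReductionInstr ? FusedRedCost : 0u;
  return std::nullopt; // use the original per-instruction cost model
}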
6275
6277LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6278 ElementCount VF) {
6279 // Calculate scalar cost only. Vectorization cost should be ready at this
6280 // moment.
6281 if (VF.isScalar()) {
6282 Type *ValTy = getLoadStoreType(I);
6283 const Align Alignment = getLoadStoreAlignment(I);
6284 unsigned AS = getLoadStoreAddressSpace(I);
6285
6286 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6287 return TTI.getAddressComputationCost(ValTy) +
6288 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6289 TTI::TCK_RecipThroughput, OpInfo, I);
6290 }
6291 return getWideningCost(I, VF);
6292}
6293
6295LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6296 ElementCount VF) {
6297 // If we know that this instruction will remain uniform, check the cost of
6298 // the scalar version.
6300 VF = ElementCount::getFixed(1);
6301
6302 if (VF.isVector() && isProfitableToScalarize(I, VF))
6303 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6304
6305 // Forced scalars do not have any scalarization overhead.
6306 auto ForcedScalar = ForcedScalars.find(VF);
6307 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6308 auto InstSet = ForcedScalar->second;
6309 if (InstSet.count(I))
6310 return VectorizationCostTy(
6311 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6312 VF.getKnownMinValue()),
6313 false);
6314 }
6315
6316 Type *VectorTy;
6317 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6318
6319 bool TypeNotScalarized = false;
6320 if (VF.isVector() && VectorTy->isVectorTy()) {
6321 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6322 if (VF.isScalable())
6323 // <vscale x 1 x iN> is assumed to be profitable over iN because
6324 // scalable registers are a distinct register class from scalar ones.
6325 // If we ever find a target which wants to lower scalable vectors
6326 // back to scalars, we'll need to update this code to explicitly
6327 // ask TTI about the register class uses for each part.
6328 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6329 else
6330 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6331 } else
6333 }
6334 return VectorizationCostTy(C, TypeNotScalarized);
6335}
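// [Editorial sketch, not part of LoopVectorize.cpp] The "type not scalarized"
// test above, in isolation: a vector type counts as genuinely vectorized when
// the number of register parts it splits into does not exceed (scalable VF)
// or is strictly below (fixed VF) the known minimum VF; zero parts means the
// type is not legal and the caller treats the cost as invalid.
static bool typeNotScalarizedSketch(unsigned NumParts, unsigned KnownMinVF,
                                    bool IsScalable) {
  if (NumParts == 0)
    return false; // invalid type
  return IsScalable ? NumParts <= KnownMinVF : NumParts < KnownMinVF;
}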
6336
6337InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6339
6340 // There is no mechanism yet to create a scalable scalarization loop,
6341 // so this is currently Invalid.
6342 if (VF.isScalable())
6344
6345 if (VF.isScalar())
6346 return 0;
6347
6349 Type *RetTy = ToVectorTy(I->getType(), VF);
6350 if (!RetTy->isVoidTy() &&
6351 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6353 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6354 /*Insert*/ true,
6355 /*Extract*/ false, CostKind);
6356
6357 // Some targets keep addresses scalar.
6358 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6359 return Cost;
6360
6361 // Some targets support efficient element stores.
6362 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6363 return Cost;
6364
6365 // Collect operands to consider.
6366 CallInst *CI = dyn_cast<CallInst>(I);
6367 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6368
6369 // Skip operands that do not require extraction/scalarization and do not incur
6370 // any overhead.
6372 for (auto *V : filterExtractingOperands(Ops, VF))
6373 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6375 filterExtractingOperands(Ops, VF), Tys, CostKind);
6376}
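// [Editorial sketch, not part of LoopVectorize.cpp] The scalarization
// overhead above is, roughly, the cost of inserting each produced scalar
// result back into a vector plus the cost of extracting every needed operand
// lane; targets that keep addresses scalar or support efficient element
// load/store skip parts of this, which the sketch ignores. Per-lane costs are
// assumed constants here.
static unsigned scalarizationOverheadSketch(unsigned VF, bool ProducesValue,
                                            unsigned NumExtractedOperands,
                                            unsigned InsertCostPerLane,
                                            unsigned ExtractCostPerLane) {
  unsigned Cost = 0;
  if (ProducesValue)
    Cost += VF * InsertCostPerLane; // pack VF scalar results into a vector
  Cost += NumExtractedOperands * VF * ExtractCostPerLane; // unpack operands
  return Cost;
}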
6377
6379 if (VF.isScalar())
6380 return;
6381 NumPredStores = 0;
6382 for (BasicBlock *BB : TheLoop->blocks()) {
6383 // For each instruction in the old loop.
6384 for (Instruction &I : *BB) {
6386 if (!Ptr)
6387 continue;
6388
6389 // TODO: We should generate better code and update the cost model for
6390 // predicated uniform stores. Today they are treated as any other
6391 // predicated store (see added test cases in
6392 // invariant-store-vectorization.ll).
6393 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6394 NumPredStores++;
6395
6396 if (Legal->isUniformMemOp(I, VF)) {
6397 auto isLegalToScalarize = [&]() {
6398 if (!VF.isScalable())
6399 // Scalarization of fixed length vectors "just works".
6400 return true;
6401
6402 // We have dedicated lowering for unpredicated uniform loads and
6403 // stores. Note that even with tail folding we know that at least
6404 // one lane is active (i.e. generalized predication is not possible
6405 // here), and the logic below depends on this fact.
6406 if (!foldTailByMasking())
6407 return true;
6408
6409 // For scalable vectors, a uniform memop load is always
6410 // uniform-by-parts and we know how to scalarize that.
6411 if (isa<LoadInst>(I))
6412 return true;
6413
6414 // A uniform store isn't necessarily uniform-by-parts
6415 // and we can't assume scalarization.
6416 auto &SI = cast<StoreInst>(I);
6417 return TheLoop->isLoopInvariant(SI.getValueOperand());
6418 };
6419
6420 const InstructionCost GatherScatterCost =
6422 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6423
6424 // Load: Scalar load + broadcast
6425 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6426 // FIXME: This cost is a significant under-estimate for tail folded
6427 // memory ops.
6428 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6429 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6430
6431 // Choose the better solution for the current VF. Note that Invalid
6432 // costs compare as maximally large. If both are invalid, we get
6433 // an invalid cost, which signals a failure and a vectorization abort.
6434 if (GatherScatterCost < ScalarizationCost)
6435 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6436 else
6437 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6438 continue;
6439 }
6440
6441 // We assume that widening is the best solution when possible.
6442 if (memoryInstructionCanBeWidened(&I, VF)) {
6443 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6444 int ConsecutiveStride = Legal->isConsecutivePtr(
6446 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6447 "Expected consecutive stride.");
6448 InstWidening Decision =
6449 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6450 setWideningDecision(&I, VF, Decision, Cost);
6451 continue;
6452 }
6453
6454 // Choose between Interleaving, Gather/Scatter or Scalarization.
6456 unsigned NumAccesses = 1;
6457 if (isAccessInterleaved(&I)) {
6458 auto Group = getInterleavedAccessGroup(&I);
6459 assert(Group && "Fail to get an interleaved access group.");
6460
6461 // Make one decision for the whole group.
6462 if (getWideningDecision(&I, VF) != CM_Unknown)
6463 continue;
6464
6465 NumAccesses = Group->getNumMembers();
6467 InterleaveCost = getInterleaveGroupCost(&I, VF);
6468 }
6469
6470 InstructionCost GatherScatterCost =
6472 ? getGatherScatterCost(&I, VF) * NumAccesses
6474
6475 InstructionCost ScalarizationCost =
6476 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6477
6478 // Choose the better solution for the current VF,
6479 // write down this decision and use it during vectorization.
6481 InstWidening Decision;
6482 if (InterleaveCost <= GatherScatterCost &&
6483 InterleaveCost < ScalarizationCost) {
6484 Decision = CM_Interleave;
6485 Cost = InterleaveCost;
6486 } else if (GatherScatterCost < ScalarizationCost) {
6487 Decision = CM_GatherScatter;
6488 Cost = GatherScatterCost;
6489 } else {
6490 Decision = CM_Scalarize;
6491 Cost = ScalarizationCost;
6492 }
6493 // If the instruction belongs to an interleave group, the whole group
6494 // receives the same decision. The whole group receives the cost, but
6495 // the cost will actually be assigned to one instruction.
6496 if (auto Group = getInterleavedAccessGroup(&I))
6497 setWideningDecision(Group, VF, Decision, Cost);
6498 else
6499 setWideningDecision(&I, VF, Decision, Cost);
6500 }
6501 }
6502
6503 // Make sure that any load of an address and any other address computation
6504 // remains scalar unless there is gather/scatter support. This avoids
6505 // inevitable extracts into address registers, and also has the benefit of
6506 // activating LSR more, since that pass can't optimize vectorized
6507 // addresses.
6509 return;
6510
6511 // Start with all scalar pointer uses.
6513 for (BasicBlock *BB : TheLoop->blocks())
6514 for (Instruction &I : *BB) {
6515 Instruction *PtrDef =
6516 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6517 if (PtrDef && TheLoop->contains(PtrDef) &&
6519 AddrDefs.insert(PtrDef);
6520 }
6521
6522 // Add all instructions used to generate the addresses.
6524 append_range(Worklist, AddrDefs);
6525 while (!Worklist.empty()) {
6526 Instruction *I = Worklist.pop_back_val();
6527 for (auto &Op : I->operands())
6528 if (auto *InstOp = dyn_cast<Instruction>(Op))
6529 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6530 AddrDefs.insert(InstOp).second)
6531 Worklist.push_back(InstOp);
6532 }
6533
6534 for (auto *I : AddrDefs) {
6535 if (isa<LoadInst>(I)) {
6536 // Setting the desired widening decision should ideally be handled by
6537 // the cost functions, but since this involves the task of finding out
6538 // whether the loaded register is involved in an address computation, it is
6539 // instead changed here when we know this is the case.
6540 InstWidening Decision = getWideningDecision(I, VF);
6541 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6542 // Scalarize a widened load of address.
6544 I, VF, CM_Scalarize,
6545 (VF.getKnownMinValue() *
6546 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6547 else if (auto Group = getInterleavedAccessGroup(I)) {
6548 // Scalarize an interleave group of address loads.
6549 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6550 if (Instruction *Member = Group->getMember(I))
6552 Member, VF, CM_Scalarize,
6553 (VF.getKnownMinValue() *
6554 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6555 }
6556 }
6557 } else
6558 // Make sure I gets scalarized and receives a cost estimate without
6559 // scalarization overhead.
6560 ForcedScalars[VF].insert(I);
6561 }
6562}
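// [Editorial sketch, not part of LoopVectorize.cpp] The per-access choice
// made above, with the same tie-breaking: prefer interleaving when it is no
// worse than gather/scatter and strictly better than scalarization, otherwise
// prefer gather/scatter when it beats scalarization, otherwise scalarize.
// The enum is a simplified stand-in for the real InstWidening values.
enum class WideningSketch { Interleave, GatherScatter, Scalarize };

static WideningSketch chooseWideningSketch(unsigned InterleaveCost,
                                           unsigned GatherScatterCost,
                                           unsigned ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost &&
      InterleaveCost < ScalarizationCost)
    return WideningSketch::Interleave;
  if (GatherScatterCost < ScalarizationCost)
    return WideningSketch::GatherScatter;
  return WideningSketch::Scalarize;
}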
6563
6565 assert(!VF.isScalar() &&
6566 "Trying to set a vectorization decision for a scalar VF");
6567
6568 for (BasicBlock *BB : TheLoop->blocks()) {
6569 // For each instruction in the old loop.
6570 for (Instruction &I : *BB) {
6571 CallInst *CI = dyn_cast<CallInst>(&I);
6572
6573 if (!CI)
6574 continue;
6575
6580
6581 Function *ScalarFunc = CI->getCalledFunction();
6582 Type *ScalarRetTy = CI->getType();
6583 SmallVector<Type *, 4> Tys, ScalarTys;
6584 bool MaskRequired = Legal->isMaskRequired(CI);
6585 for (auto &ArgOp : CI->args())
6586 ScalarTys.push_back(ArgOp->getType());
6587
6588 // Compute corresponding vector type for return value and arguments.
6589 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6590 for (Type *ScalarTy : ScalarTys)
6591 Tys.push_back(ToVectorTy(ScalarTy, VF));
6592
6593 // An in-loop reduction using an fmuladd intrinsic is a special case;
6594 // we don't want the normal cost for that intrinsic.
6596 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6599 std::nullopt, *RedCost);
6600 continue;
6601 }
6602
6603 // Estimate cost of scalarized vector call. The source operands are
6604 // assumed to be vectors, so we need to extract individual elements from
6605 // there, execute VF scalar calls, and then gather the result into the
6606 // vector return value.
6607 InstructionCost ScalarCallCost =
6608 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6609
6610 // Compute costs of unpacking argument values for the scalar calls and
6611 // packing the return values to a vector.
6612 InstructionCost ScalarizationCost =
6613 getScalarizationOverhead(CI, VF, CostKind);
6614
6615 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6616
6617 // Find the cost of vectorizing the call, if we can find a suitable
6618 // vector variant of the function.
6619 bool UsesMask = false;
6620 VFInfo FuncInfo;
6621 Function *VecFunc = nullptr;
6622 // Search through any available variants for one we can use at this VF.
6623 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6624 // Must match requested VF.
6625 if (Info.Shape.VF != VF)
6626 continue;
6627
6628 // Must take a mask argument if one is required
6629 if (MaskRequired && !Info.isMasked())
6630 continue;
6631
6632 // Check that all parameter kinds are supported
6633 bool ParamsOk = true;
6634 for (VFParameter Param : Info.Shape.Parameters) {
6635 switch (Param.ParamKind) {
6637 break;
6639 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6640 // Make sure the scalar parameter in the loop is invariant.
6641 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6642 TheLoop))
6643 ParamsOk = false;
6644 break;
6645 }
6647 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6648 // Find the stride for the scalar parameter in this loop and see if
6649 // it matches the stride for the variant.
6650 // TODO: do we need to figure out the cost of an extract to get the
6651 // first lane? Or do we hope that it will be folded away?
6652 ScalarEvolution *SE = PSE.getSE();
6653 const auto *SAR =
6654 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6655
6656 if (!SAR || SAR->getLoop() != TheLoop) {
6657 ParamsOk = false;
6658 break;
6659 }
6660
6661 const SCEVConstant *Step =
6662 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6663
6664 if (!Step ||
6665 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6666 ParamsOk = false;
6667
6668 break;
6669 }
6671 UsesMask = true;
6672 break;
6673 default:
6674 ParamsOk = false;
6675 break;
6676 }
6677 }
6678
6679 if (!ParamsOk)
6680 continue;
6681
6682 // Found a suitable candidate, stop here.
6683 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6684 FuncInfo = Info;
6685 break;
6686 }
6687
6688 // Add in the cost of synthesizing a mask if one wasn't required.
6689 InstructionCost MaskCost = 0;
6690 if (VecFunc && UsesMask && !MaskRequired)
6691 MaskCost = TTI.getShuffleCost(
6694 VecFunc->getFunctionType()->getContext()),
6695 VF));
6696
6697 if (TLI && VecFunc && !CI->isNoBuiltin())
6698 VectorCost =
6699 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6700
6701 // Find the cost of an intrinsic; some targets may have instructions that
6702 // perform the operation without needing an actual call.
6704 if (IID != Intrinsic::not_intrinsic)
6705 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6706
6707 InstructionCost Cost = ScalarCost;
6708 InstWidening Decision = CM_Scalarize;
6709
6710 if (VectorCost <= Cost) {
6711 Cost = VectorCost;
6712 Decision = CM_VectorCall;
6713 }
6714
6715 if (IntrinsicCost <= Cost) {
6716 Cost = IntrinsicCost;
6717 Decision = CM_IntrinsicCall;
6718 }
6719
6720 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6722 }
6723 }
6724}
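// [Editorial sketch, not part of LoopVectorize.cpp] The call-widening choice
// above: start from the scalarized cost (VF scalar calls plus argument
// unpacking and result packing), then switch to a vector library call or a
// vector intrinsic whenever its cost is no worse, checking the intrinsic last
// so it wins ties. The enum is a simplified stand-in for the real decision.
enum class CallWideningSketch { Scalarize, VectorCall, IntrinsicCall };

static CallWideningSketch chooseCallWideningSketch(unsigned ScalarizedCost,
                                                   unsigned VectorCallCost,
                                                   unsigned IntrinsicCost) {
  unsigned Best = ScalarizedCost;
  CallWideningSketch Decision = CallWideningSketch::Scalarize;
  if (VectorCallCost <= Best) {
    Best = VectorCallCost;
    Decision = CallWideningSketch::VectorCall;
  }
  if (IntrinsicCost <= Best)
    Decision = CallWideningSketch::IntrinsicCall;
  return Decision;
}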
6725
6727LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6728 Type *&VectorTy) {
6729 Type *RetTy = I->getType();
6731 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6732 auto SE = PSE.getSE();
6734
6735 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6736 ElementCount VF) -> bool {
6737 if (VF.isScalar())
6738 return true;
6739
6740 auto Scalarized = InstsToScalarize.find(VF);
6741 assert(Scalarized != InstsToScalarize.end() &&
6742 "VF not yet analyzed for scalarization profitability");
6743 return !Scalarized->second.count(I) &&
6744 llvm::all_of(I->users(), [&](User *U) {
6745 auto *UI = cast<Instruction>(U);
6746 return !Scalarized->second.count(UI);
6747 });
6748 };
6749 (void) hasSingleCopyAfterVectorization;
6750
6751 if (isScalarAfterVectorization(I, VF)) {
6752 // With the exception of GEPs and PHIs, after scalarization there should
6753 // only be one copy of the instruction generated in the loop. This is
6754 // because the VF is either 1, or any instructions that need scalarizing
6755 // have already been dealt with by the time we get here. As a result,
6756 // we don't have to multiply the instruction cost by VF.
6757 assert(I->getOpcode() == Instruction::GetElementPtr ||
6758 I->getOpcode() == Instruction::PHI ||
6759 (I->getOpcode() == Instruction::BitCast &&
6760 I->getType()->isPointerTy()) ||
6761 hasSingleCopyAfterVectorization(I, VF));
6762 VectorTy = RetTy;
6763 } else
6764 VectorTy = ToVectorTy(RetTy, VF);
6765
6766 // TODO: We need to estimate the cost of intrinsic calls.
6767 switch (I->getOpcode()) {
6768 case Instruction::GetElementPtr:
6769 // We mark this instruction as zero-cost because the cost of GEPs in
6770 // vectorized code depends on whether the corresponding memory instruction
6771 // is scalarized or not. Therefore, we handle GEPs with the memory
6772 // instruction cost.
6773 return 0;
6774 case Instruction::Br: {
6775 // In cases of scalarized and predicated instructions, there will be VF
6776 // predicated blocks in the vectorized loop. Each branch around these
6777 // blocks also requires an extract of its vector compare i1 element.
6778 // Note that the conditional branch from the loop latch will be replaced by
6779 // a single branch controlling the loop, so there is no extra overhead from
6780 // scalarization.
6781 bool ScalarPredicatedBB = false;
6782 BranchInst *BI = cast<BranchInst>(I);
6783 if (VF.isVector() && BI->isConditional() &&
6784 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6785 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6786 BI->getParent() != TheLoop->getLoopLatch())
6787 ScalarPredicatedBB = true;
6788
6789 if (ScalarPredicatedBB) {
6790 // It is not possible to scalarize a scalable vector with predicated instructions.
6791 if (VF.isScalable())
6793 // Return cost for branches around scalarized and predicated blocks.
6794 auto *Vec_i1Ty =
6795 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6796 return (
6798 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6799 /*Insert*/ false, /*Extract*/ true, CostKind) +
6800 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6801 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6802 // The back-edge branch will remain, as will all scalar branches.
6803 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6804 else
6805 // This branch will be eliminated by if-conversion.
6806 return 0;
6807 // Note: We currently assume zero cost for an unconditional branch inside
6808 // a predicated block since it will become a fall-through, although we
6809 // may decide in the future to call TTI for all branches.
6810 }
6811 case Instruction::PHI: {
6812 auto *Phi = cast<PHINode>(I);
6813
6814 // First-order recurrences are replaced by vector shuffles inside the loop.
6815 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6817 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6819 cast<VectorType>(VectorTy), Mask, CostKind,
6820 VF.getKnownMinValue() - 1);
6821 }
6822
6823 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6824 // converted into select instructions. We require N - 1 selects per phi
6825 // node, where N is the number of incoming values.
6826 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6827 return (Phi->getNumIncomingValues() - 1) *
6829 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6830 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6832
6833 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6834 }
6835 case Instruction::UDiv:
6836 case Instruction::SDiv:
6837 case Instruction::URem:
6838 case Instruction::SRem:
6839 if (VF.isVector() && isPredicatedInst(I)) {
6840 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6841 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6842 ScalarCost : SafeDivisorCost;
6843 }
6844 // We've proven all lanes safe to speculate, fall through.
6845 [[fallthrough]];
6846 case Instruction::Add:
6847 case Instruction::FAdd:
6848 case Instruction::Sub:
6849 case Instruction::FSub:
6850 case Instruction::Mul:
6851 case Instruction::FMul:
6852 case Instruction::FDiv:
6853 case Instruction::FRem:
6854 case Instruction::Shl:
6855 case Instruction::LShr:
6856 case Instruction::AShr:
6857 case Instruction::And:
6858 case Instruction::Or:
6859 case Instruction::Xor: {
6860 // If we're speculating on the stride being 1, the multiplication may
6861 // fold away. We can generalize this for all operations using the notion
6862 // of neutral elements. (TODO)
6863 if (I->getOpcode() == Instruction::Mul &&
6864 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6865 PSE.getSCEV(I->getOperand(1))->isOne()))
6866 return 0;
6867
6868 // Detect reduction patterns
6869 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6870 return *RedCost;
6871
6872 // Certain instructions can be cheaper to vectorize if they have a constant
6873 // second vector operand. One example of this is shifts on x86.
6874 Value *Op2 = I->getOperand(1);
6875 auto Op2Info = TTI.getOperandInfo(Op2);
6876 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6877 Legal->isInvariant(Op2))
6879
6880 SmallVector<const Value *, 4> Operands(I->operand_values());
6882 I->getOpcode(), VectorTy, CostKind,
6883 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6884 Op2Info, Operands, I, TLI);
6885 }
6886 case Instruction::FNeg: {
6888 I->getOpcode(), VectorTy, CostKind,
6889 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6890 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6891 I->getOperand(0), I);
6892 }
6893 case Instruction::Select: {
6894 SelectInst *SI = cast<SelectInst>(I);
6895 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6896 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6897
6898 const Value *Op0, *Op1;
6899 using namespace llvm::PatternMatch;
6900 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6901 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6902 // select x, y, false --> x & y
6903 // select x, true, y --> x | y
6904 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6905 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6906 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6907 Op1->getType()->getScalarSizeInBits() == 1);
6908
6911 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6912 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6913 }
6914
6915 Type *CondTy = SI->getCondition()->getType();
6916 if (!ScalarCond)
6917 CondTy = VectorType::get(CondTy, VF);
6918
6920 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6921 Pred = Cmp->getPredicate();
6922 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6923 CostKind, I);
6924 }
6925 case Instruction::ICmp:
6926 case Instruction::FCmp: {
6927 Type *ValTy = I->getOperand(0)->getType();
6928 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6929 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6930 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6931 VectorTy = ToVectorTy(ValTy, VF);
6932 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6933 cast<CmpInst>(I)->getPredicate(), CostKind,
6934 I);
6935 }
6936 case Instruction::Store:
6937 case Instruction::Load: {
6938 ElementCount Width = VF;
6939 if (Width.isVector()) {
6940 InstWidening Decision = getWideningDecision(I, Width);
6941 assert(Decision != CM_Unknown &&
6942 "CM decision should be taken at this point");
6945 if (Decision == CM_Scalarize)
6946 Width = ElementCount::getFixed(1);
6947 }
6948 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6949 return getMemoryInstructionCost(I, VF);
6950 }
6951 case Instruction::BitCast:
6952 if (I->getType()->isPointerTy())
6953 return 0;
6954 [[fallthrough]];
6955 case Instruction::ZExt:
6956 case Instruction::SExt:
6957 case Instruction::FPToUI:
6958 case Instruction::FPToSI:
6959 case Instruction::FPExt:
6960 case Instruction::PtrToInt:
6961 case Instruction::IntToPtr:
6962 case Instruction::SIToFP:
6963 case Instruction::UIToFP:
6964 case Instruction::Trunc:
6965 case Instruction::FPTrunc: {
6966 // Computes the CastContextHint from a Load/Store instruction.
6967 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6968 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6969 "Expected a load or a store!");
6970
6971 if (VF.isScalar() || !TheLoop->contains(I))
6973
6974 switch (getWideningDecision(I, VF)) {
6986 llvm_unreachable("Instr did not go through cost modelling?");
6989 llvm_unreachable_internal("Instr has invalid widening decision");
6990 }
6991
6992 llvm_unreachable("Unhandled case!");
6993 };
6994
6995 unsigned Opcode = I->getOpcode();
6997 // For Trunc, the context is the only user, which must be a StoreInst.
6998 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6999 if (I->hasOneUse())
7000 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7001 CCH = ComputeCCH(Store);
7002 }
7003 // For Z/Sext, the context is the operand, which must be a LoadInst.
7004 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7005 Opcode == Instruction::FPExt) {
7006 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7007 CCH = ComputeCCH(Load);
7008 }
7009
7010 // We optimize the truncation of induction variables having constant
7011 // integer steps. The cost of these truncations is the same as the scalar
7012 // operation.
7013 if (isOptimizableIVTruncate(I, VF)) {
7014 auto *Trunc = cast<TruncInst>(I);
7015 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7016 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7017 }
7018
7019 // Detect reduction patterns
7020 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7021 return *RedCost;
7022
7023 Type *SrcScalarTy = I->getOperand(0)->getType();
7024 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7025 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7026 SrcScalarTy =
7027 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
7028 Type *SrcVecTy =
7029 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7030
7031 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7032 }
7033 case Instruction::Call:
7034 return getVectorCallCost(cast<CallInst>(I), VF);
7035 case Instruction::ExtractValue:
7037 case Instruction::Alloca:
7038 // We cannot easily widen alloca to a scalable alloca, as
7039 // the result would need to be a vector of pointers.
7040 if (VF.isScalable())
7042 [[fallthrough]];
7043 default:
7044 // This opcode is unknown. Assume that it is the same as 'mul'.
7045 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7046 } // end of switch.
7047}
7048
7050 // Ignore ephemeral values.
7052
7053 SmallSetVector<Value *, 4> DeadInterleavePointerOps;
7054 for (BasicBlock *BB : TheLoop->blocks())
7055 for (Instruction &I : *BB) {
7056 // Find all stores to invariant variables. Since they are going to sink
7057 // outside the loop, we do not need to calculate the cost for them.
7058 StoreInst *SI;
7059 if ((SI = dyn_cast<StoreInst>(&I)) &&
7060 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7061 ValuesToIgnore.insert(&I);
7062
7063 // For interleave groups, we only create a pointer for the start of the
7064 // interleave group. Queue up addresses of group members except the insert
7065 // position for further processing.
7066 if (isAccessInterleaved(&I)) {
7067 auto *Group = getInterleavedAccessGroup(&I);
7068 if (Group->getInsertPos() == &I)
7069 continue;
7070 Value *PointerOp = getLoadStorePointerOperand(&I);
7071 DeadInterleavePointerOps.insert(PointerOp);
7072 }
7073 }
7074
7075 // Mark ops feeding interleave group members as free, if they are only used
7076 // by other dead computations.
7077 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7078 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7079 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7080 Instruction *UI = cast<Instruction>(U);
7081 return !VecValuesToIgnore.contains(U) &&
7082 (!isAccessInterleaved(UI) ||
7083 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7084 }))
7085 continue;
7086 VecValuesToIgnore.insert(Op);
7087 DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end());
7088 }
7089
7090 // Ignore type-promoting instructions we identified during reduction
7091 // detection.
7092 for (const auto &Reduction : Legal->getReductionVars()) {
7093 const RecurrenceDescriptor &RedDes = Reduction.second;
7094 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7095 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7096 }
7097 // Ignore type-casting instructions we identified during induction
7098 // detection.
7099 for (const auto &Induction : Legal->getInductionVars()) {
7100 const InductionDescriptor &IndDes = Induction.second;
7101 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7102 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7103 }
7104}
7105
7107 for (const auto &Reduction : Legal->getReductionVars()) {
7108 PHINode *Phi = Reduction.first;
7109 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7110
7111 // We don't collect reductions that are type promoted (yet).
7112 if (RdxDesc.getRecurrenceType() != Phi->getType())
7113 continue;
7114
7115 // If the target would prefer this reduction to happen "in-loop", then we
7116 // want to record it as such.
7117 unsigned Opcode = RdxDesc.getOpcode();
7118 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7119 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7121 continue;
7122
7123 // Check that we can correctly put the reductions into the loop, by
7124 // finding the chain of operations that leads from the phi to the loop
7125 // exit value.
7126 SmallVector<Instruction *, 4> ReductionOperations =
7127 RdxDesc.getReductionOpChain(Phi, TheLoop);
7128 bool InLoop = !ReductionOperations.empty();
7129
7130 if (InLoop) {
7131 InLoopReductions.insert(Phi);
7132 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7133 Instruction *LastChain = Phi;
7134 for (auto *I : ReductionOperations) {
7135 InLoopReductionImmediateChains[I] = LastChain;
7136 LastChain = I;
7137 }
7138 }
7139 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7140 << " reduction for phi: " << *Phi << "\n");
7141 }
7142}
7143
7145 DebugLoc DL, const Twine &Name) {
7147 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7148 return tryInsertInstruction(
7149 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7150}
7151
7152// This function will select a scalable VF if the target supports scalable
7153// vectors and a fixed one otherwise.
7154// TODO: we could return a pair of values that specify the max VF and
7155// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7156 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7157// doesn't have a cost model that can choose which plan to execute if
7158// more than one is generated.
7161 unsigned WidestType;
7162 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7163
7168
7170 unsigned N = RegSize.getKnownMinValue() / WidestType;
7171 return ElementCount::get(N, RegSize.isScalable());
7172}
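// [Editorial sketch, not part of LoopVectorize.cpp] The VF heuristic above in
// plain arithmetic: divide the known-minimum vector register width by the
// widest scalar type used in the loop; a 128-bit register with a 32-bit
// widest type gives VF = 4, and a scalable target keeps the "scalable" flag.
static unsigned determineVFSketch(unsigned RegWidthBitsKnownMin,
                                  unsigned WidestTypeBits) {
  return RegWidthBitsKnownMin / WidestTypeBits; // e.g. 128 / 32 == 4
}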
7173
7176 ElementCount VF = UserVF;
7177 // Outer loop handling: outer loops may require CFG and instruction level
7178 // transformations before even evaluating whether vectorization is profitable.
7179 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7180 // the vectorization pipeline.
7181 if (!OrigLoop->isInnermost()) {
7182 // If the user doesn't provide a vectorization factor, determine a
7183 // reasonable one.
7184 if (UserVF.isZero()) {
7185 VF = determineVPlanVF(TTI, CM);
7186 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7187
7188 // Make sure we have a VF > 1 for stress testing.
7189 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7190 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7191 << "overriding computed VF.\n");
7192 VF = ElementCount::getFixed(4);
7193 }
7194 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7196 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7197 << "not supported by the target.\n");
7199 "Scalable vectorization requested but not supported by the target",
7200 "the scalable user-specified vectorization width for outer-loop "
7201 "vectorization cannot be used because the target does not support "
7202 "scalable vectors.",
7203 "ScalableVFUnfeasible", ORE, OrigLoop);
7205 }
7206 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7208 "VF needs to be a power of two");
7209 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7210 << "VF " << VF << " to build VPlans.\n");
7211 buildVPlans(VF, VF);
7212
7213 // For VPlan build stress testing, we bail out after VPlan construction.
7216
7217 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7218 }
7219
7220 LLVM_DEBUG(
7221 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7222 "VPlan-native path.\n");
7224}
7225
7226std::optional<VectorizationFactor>
7228 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7231
7232 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7233 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7234 return std::nullopt;
7235
7236 // Invalidate interleave groups if all blocks of the loop will be predicated.
7237 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7239 LLVM_DEBUG(
7240 dbgs()
7241 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7242 "which requires masked-interleaved support.\n");
7244 // Invalidating interleave groups also requires invalidating all decisions
7245 // based on them, which includes widening decisions and uniform and scalar
7246 // values.
7248 }
7249
7250 ElementCount MaxUserVF =
7251 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7252 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7253 if (!UserVF.isZero() && UserVFIsLegal) {
7255 "VF needs to be a power of two");
7256 // Collect the instructions (and their associated costs) that will be more
7257 // profitable to scalarize.
7259 if (CM.selectUserVectorizationFactor(UserVF)) {
7260 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7261 buildVPlansWithVPRecipes(UserVF, UserVF);
7262 if (!hasPlanWithVF(UserVF)) {
7263 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7264 << ".\n");
7265 return std::nullopt;
7266 }
7267
7269 return {{UserVF, 0, 0}};
7270 } else
7271 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7272 "InvalidCost", ORE, OrigLoop);
7273 }
7274
7275 // Collect the Vectorization Factor Candidates.
7276 SmallVector<ElementCount> VFCandidates;
7277 for (auto VF = ElementCount::getFixed(1);
7278 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7279 VFCandidates.push_back(VF);
7280 for (auto VF = ElementCount::getScalable(1);
7281 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7282 VFCandidates.push_back(VF);
7283
7285 for (const auto &VF : VFCandidates) {
7286 // Collect Uniform and Scalar instructions after vectorization with VF.
7288
7289 // Collect the instructions (and their associated costs) that will be more
7290 // profitable to scalarize.
7291 if (VF.isVector())
7293 }
7294
7295 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7296 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7297
7299 if (VPlans.empty())
7300 return std::nullopt;
7301 if (all_of(VPlans,
7302 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
7304
7305 // Select the optimal vectorization factor.
7306 VectorizationFactor VF = selectVectorizationFactor();
7307 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7308 if (!hasPlanWithVF(VF.Width)) {
7309 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7310 << ".\n");
7311 return std::nullopt;
7312 }
7313 return VF;
7314}
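// [Editorial sketch, not part of LoopVectorize.cpp] How the VF candidate set
// above is enumerated: successive powers of two from 1 up to the maximum
// feasible fixed VF, and again (as scalable multiples) up to the maximum
// scalable VF. The plain-integer version below covers only the fixed half.
#include <vector>

static std::vector<unsigned> fixedVFCandidatesSketch(unsigned MaxFixedVF) {
  std::vector<unsigned> Candidates;
  for (unsigned VF = 1; VF <= MaxFixedVF; VF *= 2)
    Candidates.push_back(VF); // e.g. MaxFixedVF == 8 -> {1, 2, 4, 8}
  return Candidates;
}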
7315
7317 assert(count_if(VPlans,
7318 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7319 1 &&
7320 "Best VF has not a single VPlan.");
7321
7322 for (const VPlanPtr &Plan : VPlans) {
7323 if (Plan->hasVF(VF))
7324 return *Plan.get();
7325 }
7326 llvm_unreachable("No plan found!");
7327}
7328
7331 // Reserve first location for self reference to the LoopID metadata node.
7332 MDs.push_back(nullptr);
7333 bool IsUnrollMetadata = false;
7334 MDNode *LoopID = L->getLoopID();
7335 if (LoopID) {
7336 // First find existing loop unrolling disable metadata.
7337 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7338 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7339 if (MD) {
7340 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7341 IsUnrollMetadata =
7342 S && S->getString().starts_with("llvm.loop.unroll.disable");
7343 }
7344 MDs.push_back(LoopID->getOperand(i));
7345 }
7346 }
7347
7348 if (!IsUnrollMetadata) {
7349 // Add runtime unroll disable metadata.
7350 LLVMContext &Context = L->getHeader()->getContext();
7351 SmallVector<Metadata *, 1> DisableOperands;
7352 DisableOperands.push_back(
7353 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7354 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7355 MDs.push_back(DisableNode);
7356 MDNode *NewLoopID = MDNode::get(Context, MDs);
7357 // Set operand 0 to refer to the loop id itself.
7358 NewLoopID->replaceOperandWith(0, NewLoopID);
7359 L->setLoopID(NewLoopID);
7360 }
7361}
7362
7363 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7364// create a merge phi node for it and add it to \p ReductionResumeValues.
7366 VPInstruction *RedResult,
7368 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7369 bool VectorizingEpilogue) {
7370 if (!RedResult ||
7372 return;
7373
7374 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7375 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7376
7377 Value *FinalValue =
7378 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7379 auto *ResumePhi =
7380 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7381 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7382 RdxDesc.getRecurrenceKind())) {
7383 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7384 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7385 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7386 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7387 }
7388 assert((!VectorizingEpilogue || ResumePhi) &&
7389 "when vectorizing the epilogue loop, we need a resume phi from main "
7390 "vector loop");
7391
7392 // TODO: bc.merge.rdx should not be created here, instead it should be
7393 // modeled in VPlan.
7394 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7395 // Create a phi node that merges control-flow from the backedge-taken check
7396 // block and the middle block.
7397 auto *BCBlockPhi =
7398 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7399 LoopScalarPreHeader->getTerminator()->getIterator());
7400
7401 // If we are fixing reductions in the epilogue loop then we should already
7402 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7403 // we carry over the incoming values correctly.
7404 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7405 if (Incoming == LoopMiddleBlock)
7406 BCBlockPhi->addIncoming(FinalValue, Incoming);
7407 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7408 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7409 Incoming);
7410 else
7411 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7412 }
7413
7414 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7415 // TODO: This fixup should instead be modeled in VPlan.
7416 // Fix the scalar loop reduction variable with the incoming reduction sum
7417 // from the vector body and from the backedge value.
7418 int IncomingEdgeBlockIdx =
7419 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7420 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7421 // Pick the other block.
7422 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7423 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7424 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7425 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7426
7427 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7428}
7429
7430std::pair<DenseMap<const SCEV *, Value *>,
7433 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7434 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7435 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7436 assert(BestVPlan.hasVF(BestVF) &&
7437 "Trying to execute plan with unsupported VF");
7438 assert(BestVPlan.hasUF(BestUF) &&
7439 "Trying to execute plan with unsupported UF");
7440 assert(
7441 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7442 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7443 (void)IsEpilogueVectorization;
7444
7445 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7446
7447 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7448 << ", UF=" << BestUF << '\n');
7449 BestVPlan.setName("Final VPlan");
7450 LLVM_DEBUG(BestVPlan.dump());
7451
7452 // Perform the actual loop transformation.
7453 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7454 OrigLoop->getHeader()->getContext());
7455
7456 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7457 // before making any changes to the CFG.
7458 if (!BestVPlan.getPreheader()->empty()) {
7459 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7461 BestVPlan.getPreheader()->execute(&State);
7462 }
7463 if (!ILV.getTripCount())
7464 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7465 else
7466 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7467 "count during epilogue vectorization");
7468
7469 // 1. Set up the skeleton for vectorization, including vector pre-header and
7470 // middle block. The vector loop is created during VPlan execution.
7471 Value *CanonicalIVStartValue;
7472 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7473 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7474 : State.ExpandedSCEVs);
7475
7476 // Only use noalias metadata when using memory checks guaranteeing no overlap
7477 // across all iterations.
7478 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7479 std::unique_ptr<LoopVersioning> LVer = nullptr;
7480 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7482
7483 // We currently don't use LoopVersioning for the actual loop cloning but we
7484 // still use it to add the noalias metadata.
7485 // TODO: Find a better way to re-use LoopVersioning functionality to add
7486 // metadata.
7487 LVer = std::make_unique<LoopVersioning>(
7488 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7489 PSE.getSE());
7490 State.LVer = &*LVer;
7492 }
7493
7495
7496 //===------------------------------------------------===//
7497 //
7498 // Notice: any optimization or new instruction that goes
7499 // into the code below should also be implemented in
7500 // the cost-model.
7501 //
7502 //===------------------------------------------------===//
7503
7504 // 2. Copy and widen instructions from the old loop into the new loop.
7505 BestVPlan.prepareToExecute(ILV.getTripCount(),
7506 ILV.getOrCreateVectorTripCount(nullptr),
7507 CanonicalIVStartValue, State);
7508
7509 BestVPlan.execute(&State);
7510
7511 // 2.5 Collect reduction resume values.
7513 auto *ExitVPBB =
7514 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7515 for (VPRecipeBase &R : *ExitVPBB) {
7517 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7518 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7519 }
7520
7521 // 2.6. Maintain Loop Hints
7522 // Keep all loop hints from the original loop on the vector loop (we'll
7523 // replace the vectorizer-specific hints below).
7524 MDNode *OrigLoopID = OrigLoop->getLoopID();
7525
7526 std::optional<MDNode *> VectorizedLoopID =
7529
7530 VPBasicBlock *HeaderVPBB =
7532 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7533 if (VectorizedLoopID)
7534 L->setLoopID(*VectorizedLoopID);
7535 else {
7536 // Keep all loop hints from the original loop on the vector loop (we'll
7537 // replace the vectorizer-specific hints below).
7538 if (MDNode *LID = OrigLoop->getLoopID())
7539 L->setLoopID(LID);
7540
7541 LoopVectorizeHints Hints(L, true, *ORE);
7542 Hints.setAlreadyVectorized();
7543 }
7545 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7546 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7548
7549 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7550 // predication, updating analyses.
7551 ILV.fixVectorizedLoop(State, BestVPlan);
7552
7554
7555 return {State.ExpandedSCEVs, ReductionResumeValues};
7556}
7557
7558#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7560 for (const auto &Plan : VPlans)
7562 Plan->printDOT(O);
7563 else
7564 Plan->print(O);
7565}
7566#endif
7567
7568//===--------------------------------------------------------------------===//
7569// EpilogueVectorizerMainLoop
7570//===--------------------------------------------------------------------===//
7571
7572/// This function is partially responsible for generating the control flow
7573/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7574std::pair<BasicBlock *, Value *>
7576 const SCEV2ValueTy &ExpandedSCEVs) {
7578
7579 // Generate the code to check the minimum iteration count of the vector
7580 // epilogue (see below).
7584
7585 // Generate the code to check any assumptions that we've made for SCEV
7586 // expressions.
7588
7589 // Generate the code that checks at runtime if arrays overlap. We put the
7590 // checks into a separate block to make the more common case of few elements
7591 // faster.
7593
7594 // Generate the iteration count check for the main loop, *after* the check
7595 // for the epilogue loop, so that the path-length is shorter for the case
7596 // that goes directly through the vector epilogue. The longer-path length for
7597 // the main loop is compensated for by the gain from vectorizing the larger
7598 // trip count. Note: the branch will get updated later on when we vectorize
7599 // the epilogue.
7602
7603 // Generate the induction variable.
7605
7606 // Skip induction resume value creation here because they will be created in
7607 // the second pass for the scalar loop. The induction resume values for the
7608 // inductions in the epilogue loop are created before executing the plan for
7609 // the epilogue loop.
7610
7611 return {completeLoopSkeleton(), nullptr};
7612}
7613
7615 LLVM_DEBUG({
7616 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7617 << "Main Loop VF:" << EPI.MainLoopVF
7618 << ", Main Loop UF:" << EPI.MainLoopUF
7619 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7620 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7621 });
7622}
7623
7626 dbgs() << "intermediate fn:\n"
7627 << *OrigLoop->getHeader()->getParent() << "\n";
7628 });
7629}
7630
7631BasicBlock *
7633 bool ForEpilogue) {
7634 assert(Bypass && "Expected valid bypass basic block.");
7635 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7636 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7637 Value *Count = getTripCount();
7638 // Reuse existing vector loop preheader for TC checks.
7639 // Note that a new preheader block is generated for the vector loop.
7640 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7641 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7642
7643 // Generate code to check if the loop's trip count is less than VF * UF of the
7644 // main vector loop.
7645 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7646 : VF.isVector())
7649
7650 Value *CheckMinIters = Builder.CreateICmp(
7651 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7652 "min.iters.check");
7653
7654 if (!ForEpilogue)
7655 TCCheckBlock->setName("vector.main.loop.iter.check");
7656
7657 // Create new preheader for vector loop.
7658 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7659 DT, LI, nullptr, "vector.ph");
7660
7661 if (ForEpilogue) {
7662 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7663 DT->getNode(Bypass)->getIDom()) &&
7664 "TC check is expected to dominate Bypass");
7665
7666 // Update dominator for Bypass & LoopExit.
7667 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7668 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7669 // For loops with multiple exits, there's no edge from the middle block
7670 // to exit blocks (as the epilogue must run) and thus no need to update
7671 // the immediate dominator of the exit blocks.
7673
7674 LoopBypassBlocks.push_back(TCCheckBlock);
7675
7676 // Save the trip count so we don't have to regenerate it in the
7677 // vec.epilog.iter.check. This is safe to do because the trip count
7678 // generated here dominates the vector epilog iter check.
7679 EPI.TripCount = Count;
7680 }
7681
7682 BranchInst &BI =
7683 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7685 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7686 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7687
7688 return TCCheckBlock;
7689}
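// Illustrative sketch only (not from this file; the helper name below is
// hypothetical, and it assumes <cstdint> via the existing includes): the
// essence of the "min.iters.check" emitted above for a fixed-width
// (non-scalable) configuration. The ULE vs. ULT choice mirrors the predicate
// selection above: when a scalar epilogue is required, at least one scalar
// iteration must remain, so the stricter ULE form is used.
static bool wouldBypassVectorLoop(uint64_t TripCount, unsigned VF, unsigned UF,
                                  bool RequiresScalarEpilogue) {
  uint64_t Step = (uint64_t)VF * UF; // what createStepForVF computes here
  return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
}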
7690
7691//===--------------------------------------------------------------------===//
7692// EpilogueVectorizerEpilogueLoop
7693//===--------------------------------------------------------------------===//
7694
7695/// This function is partially responsible for generating the control flow
7696/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7697std::pair<BasicBlock *, Value *>
7699 const SCEV2ValueTy &ExpandedSCEVs) {
7700 createVectorLoopSkeleton("vec.epilog.");
7701
7702 // Now, compare the remaining count and if there aren't enough iterations to
7703 // execute the vectorized epilogue, skip to the scalar part.
7704 LoopVectorPreHeader->setName("vec.epilog.ph");
7705 BasicBlock *VecEpilogueIterationCountCheck =
7707 nullptr, "vec.epilog.iter.check", true);
7709 VecEpilogueIterationCountCheck);
7710
7711 // Adjust the control flow taking the state info from the main loop
7712 // vectorization into account.
7714 "expected this to be saved from the previous pass.");
7716 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7717
7720
7722 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7723
7724 if (EPI.SCEVSafetyCheck)
7726 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7727 if (EPI.MemSafetyCheck)
7729 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7730
7732 VecEpilogueIterationCountCheck,
7733 VecEpilogueIterationCountCheck->getSinglePredecessor());
7734
7737 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7738 // If there is an epilogue which must run, there's no edge from the
7739 // middle block to exit blocks and thus no need to update the immediate
7740 // dominator of the exit blocks.
7743
7744 // Keep track of bypass blocks, as they feed start values to the induction and
7745 // reduction phis in the scalar loop preheader.
7746 if (EPI.SCEVSafetyCheck)
7748 if (EPI.MemSafetyCheck)
7751
7752 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7753 // reductions which merge control-flow from the latch block and the middle
7754 // block. Update the incoming values here and move the Phi into the preheader.
7755 SmallVector<PHINode *, 4> PhisInBlock;
7756 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7757 PhisInBlock.push_back(&Phi);
7758
7759 for (PHINode *Phi : PhisInBlock) {
7760 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7761 Phi->replaceIncomingBlockWith(
7762 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7763 VecEpilogueIterationCountCheck);
7764
7765 // If the phi doesn't have an incoming value from the
7766 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7767 // value and also those from other check blocks. This is needed for
7768 // reduction phis only.
7769 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7770 return EPI.EpilogueIterationCountCheck == IncB;
7771 }))
7772 continue;
7773 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7774 if (EPI.SCEVSafetyCheck)
7775 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7776 if (EPI.MemSafetyCheck)
7777 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7778 }
7779
7780 // Generate a resume induction for the vector epilogue and put it in the
7781 // vector epilogue preheader.
7782 Type *IdxTy = Legal->getWidestInductionType();
7783 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7785 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7786 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7788
7789 // Generate induction resume values. These variables save the new starting
7790 // indexes for the scalar loop. They are used to test if there are any tail
7791 // iterations left once the vector loop has completed.
7792 // Note that when the vectorized epilogue is skipped due to the iteration count
7793 // check, the resume value for the induction variable comes from
7794 // the trip count of the main vector loop, hence passing the AdditionalBypass
7795 // argument.
7796 createInductionResumeValues(ExpandedSCEVs,
7797 {VecEpilogueIterationCountCheck,
7798 EPI.VectorTripCount} /* AdditionalBypass */);
7799
7800 return {completeLoopSkeleton(), EPResumeVal};
7801}
7802
7803BasicBlock *
7805 BasicBlock *Bypass, BasicBlock *Insert) {
7806
7808 "Expected trip count to have been saved in the first pass.");
7809 assert(
7810 (!isa<Instruction>(EPI.TripCount) ||
7811 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7812 "saved trip count does not dominate insertion point.");
7813 Value *TC = EPI.TripCount;
7814 IRBuilder<> Builder(Insert->getTerminator());
7815 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7816
7817 // Generate code to check if the loop's trip count is less than VF * UF of the
7818 // vector epilogue loop.
7819 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7822
7823 Value *CheckMinIters =
7824 Builder.CreateICmp(P, Count,
7827 "min.epilog.iters.check");
7828
7829 BranchInst &BI =
7830 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7832 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7833 unsigned EpilogueLoopStep =
7835 // We assume the remaining `Count` is equally distributed in
7836 // [0, MainLoopStep)
7837 // So the probability for `Count < EpilogueLoopStep` should be
7838 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7839 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7840 const uint32_t Weights[] = {EstimatedSkipCount,
7841 MainLoopStep - EstimatedSkipCount};
7842 setBranchWeights(BI, Weights, /*IsExpected=*/false);
7843 }
7844 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7845
7846 LoopBypassBlocks.push_back(Insert);
7847 return Insert;
7848}
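// Illustrative sketch only (not from this file; the helper below is
// hypothetical and assumes <algorithm>/<utility> via the existing includes):
// the branch-weight estimate used above, assuming the remaining trip count is
// uniformly distributed in [0, MainLoopStep). E.g. MainLoopStep = 16 (VF=8,
// UF=2) and EpilogueLoopStep = 4 yields weights {4, 12}, i.e. an estimated
// 4/16 chance of skipping the vector epilogue.
static std::pair<uint32_t, uint32_t>
sketchEpilogueSkipWeights(uint32_t MainLoopStep, uint32_t EpilogueLoopStep) {
  uint32_t EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  return {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};
}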
7849
7851 LLVM_DEBUG({
7852 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7853 << "Epilogue Loop VF:" << EPI.EpilogueVF
7854 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7855 });
7856}
7857
7860 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7861 });
7862}
7863
7865 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7866 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7867 bool PredicateAtRangeStart = Predicate(Range.Start);
7868
7869 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7870 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7871 Range.End = TmpVF;
7872 break;
7873 }
7874
7875 return PredicateAtRangeStart;
7876}
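// Illustrative sketch only (not from this file; the helper below is
// hypothetical): the same clamp-to-first-mismatch idea over plain power-of-two
// integers, for intuition about how getDecisionAndClampRange shrinks
// [Start, End).
static bool clampRangeSketch(bool (*Pred)(unsigned), unsigned &Start,
                             unsigned &End) {
  bool AtStart = Pred(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    if (Pred(VF) != AtStart) {
      End = VF; // The range now ends at the first VF disagreeing with Start.
      break;
    }
  return AtStart;
}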
7877
7878/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7879/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7880/// of VF's starting at a given VF and extending it as much as possible. Each
7881/// vectorization decision can potentially shorten this sub-range during
7882/// buildVPlan().
7884 ElementCount MaxVF) {
7885 auto MaxVFTimes2 = MaxVF * 2;
7886 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7887 VFRange SubRange = {VF, MaxVFTimes2};
7888 VPlans.push_back(buildVPlan(SubRange));
7889 VF = SubRange.End;
7890 }
7891}
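// Illustrative example (hypothetical numbers): with MinVF = 2 and MaxVF = 16
// the loop above first tries SubRange = [2, 32); if buildVPlan clamps it to
// [2, 8), the next iteration continues from VF = 8 with SubRange = [8, 32),
// and so on, so each power-of-two VF up to MaxVF is covered by exactly one
// VPlan.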
7892
7893iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7895 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7896 if (auto *I = dyn_cast<Instruction>(Op)) {
7897 if (auto *R = Ingredient2Recipe.lookup(I))
7898 return R->getVPSingleValue();
7899 }
7900 return Plan.getOrAddLiveIn(Op);
7901 };
7902 return map_range(Operands, Fn);
7903}
7904
7906 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7907
7908 // Look for cached value.
7909 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7910 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7911 if (ECEntryIt != EdgeMaskCache.end())
7912 return ECEntryIt->second;
7913
7914 VPValue *SrcMask = getBlockInMask(Src);
7915
7916 // The terminator has to be a branch inst!
7917 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7918 assert(BI && "Unexpected terminator found");
7919
7920 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7921 return EdgeMaskCache[Edge] = SrcMask;
7922
7923 // If source is an exiting block, we know the exit edge is dynamically dead
7924 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7925 // adding uses of an otherwise potentially dead instruction.
7926 if (OrigLoop->isLoopExiting(Src))
7927 return EdgeMaskCache[Edge] = SrcMask;
7928
7929 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
7930 assert(EdgeMask && "No Edge Mask found for condition");
7931
7932 if (BI->getSuccessor(0) != Dst)
7933 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7934
7935 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7936 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7937 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7938 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7939 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
7940 }
7941
7942 return EdgeMaskCache[Edge] = EdgeMask;
7943}
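// Illustrative only: for a conditional edge, the mask computed above is
// emitted as a logical and, conceptually
//   %edge.mask = select i1 %src.mask, i1 %edge.cond, i1 false
// rather than 'and i1 %src.mask, %edge.cond', so a poison %edge.cond cannot
// leak through lanes where %src.mask is false.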
7944
7946 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7947
7948 // Look for cached value.
7949 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7950 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7951 assert(ECEntryIt != EdgeMaskCache.end() &&
7952 "looking up mask for edge which has not been created");
7953 return ECEntryIt->second;
7954}
7955
7957 BasicBlock *Header = OrigLoop->getHeader();
7958
7959 // When not folding the tail, use nullptr to model all-true mask.
7960 if (!CM.foldTailByMasking()) {
7961 BlockMaskCache[Header] = nullptr;
7962 return;
7963 }
7964
7965 // Introduce the early-exit compare IV <= BTC to form header block mask.
7966 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7967 // constructing the desired canonical IV in the header block as its first
7968 // non-phi instructions.
7969
7970 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7971 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7972 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7973 HeaderVPBB->insert(IV, NewInsertionPoint);
7974
7975 VPBuilder::InsertPointGuard Guard(Builder);
7976 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7977 VPValue *BlockMask = nullptr;
7979 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7980 BlockMaskCache[Header] = BlockMask;
7981}
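// Illustrative only (conceptual IR, assuming VF = 4 and an i64 IV): the header
// mask built above compares a widened canonical IV against the backedge-taken
// count,
//   %iv.vec = <%iv, %iv + 1, %iv + 2, %iv + 3>
//   %mask   = icmp ule <4 x i64> %iv.vec, %btc.splat
// so lanes past the trip count are disabled instead of requiring a scalar
// tail.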
7982
7984 // Return the cached value.
7985 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7986 assert(BCEntryIt != BlockMaskCache.end() &&
7987 "Trying to access mask for block without one.");
7988 return BCEntryIt->second;
7989}
7990
7992 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7993 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7994 assert(OrigLoop->getHeader() != BB &&
7995 "Loop header must have cached block mask");
7996
7997 // All-one mask is modelled as no-mask following the convention for masked
7998 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7999 VPValue *BlockMask = nullptr;
8000 // This is the block mask. We OR all incoming edges.
8001 for (auto *Predecessor : predecessors(BB)) {
8002 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8003 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8004 BlockMaskCache[BB] = EdgeMask;
8005 return;
8006 }
8007
8008     if (!BlockMask) { // BlockMask still has its initial nullptr value.
8009 BlockMask = EdgeMask;
8010 continue;
8011 }
8012
8013 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8014 }
8015
8016 BlockMaskCache[BB] = BlockMask;
8017}
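// Illustrative only: for a block with two masked incoming edges the result is
// conceptually
//   %block.mask = or <VF x i1> %edge.mask.pred0, %edge.mask.pred1
// and it collapses back to the all-one (nullptr) mask as soon as any incoming
// edge mask is all-one.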
8018
8020VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8021 VFRange &Range) {
8022 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8023 "Must be called with either a load or store");
8024
8025 auto willWiden = [&](ElementCount VF) -> bool {
8027 CM.getWideningDecision(I, VF);
8029 "CM decision should be taken at this point.");
8031 return true;
8032 if (CM.isScalarAfterVectorization(I, VF) ||
8033 CM.isProfitableToScalarize(I, VF))
8034 return false;
8036 };
8037
8039 return nullptr;
8040
8041 VPValue *Mask = nullptr;
8042 if (Legal->isMaskRequired(I))
8043 Mask = getBlockInMask(I->getParent());
8044
8045 // Determine if the pointer operand of the access is either consecutive or
8046 // reverse consecutive.
8048 CM.getWideningDecision(I, Range.Start);
8050 bool Consecutive =
8052
8053 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8054 if (Consecutive) {
8055 auto *GEP = dyn_cast<GetElementPtrInst>(
8056 Ptr->getUnderlyingValue()->stripPointerCasts());
8057 auto *VectorPtr = new VPVectorPointerRecipe(
8058 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8059 I->getDebugLoc());
8060 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8061 Ptr = VectorPtr;
8062 }
8063 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8064 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8065 I->getDebugLoc());
8066
8067 StoreInst *Store = cast<StoreInst>(I);
8068 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8069 Reverse, I->getDebugLoc());
8070}
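// Illustrative only (conceptual IR for an i32 element type): a consecutive
// load handled above becomes a single wide access,
//   %vec.ptr   = getelementptr i32, ptr %base, i64 %index
//   %wide.load = load <VF x i32>, ptr %vec.ptr
// (with the pointer adjusted and the result reversed for Reverse accesses),
// while non-consecutive accesses keep per-lane addresses and are emitted as
// gathers/scatters instead.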
8071
8072/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8073/// insert a recipe to expand the step for the induction recipe.
8076 VPValue *Start, const InductionDescriptor &IndDesc,
8077 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8078 assert(IndDesc.getStartValue() ==
8079 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8080 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8081 "step must be loop invariant");
8082
8083 VPValue *Step =
8085 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8086 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8087 }
8088 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8089 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8090}
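// Illustrative only (conceptual IR for an i32 induction with start %start and
// step %step): the recipe created above materializes
//   %vec.ind      = phi <VF x i32> [ <%start, %start + %step, ...>, %vector.ph ],
//                                  [ %vec.ind.next, %vector.body ]
//   %vec.ind.next = add <VF x i32> %vec.ind, %step.times.vf.splat
// with the step expanded up front by the step-expansion recipe mentioned in
// the comment above.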
8091
8092VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8094
8095 // Check if this is an integer or fp induction. If so, build the recipe that
8096 // produces its scalar and vector values.
8097 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8098 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8099 *PSE.getSE(), *OrigLoop);
8100
8101 // Check if this is pointer induction. If so, build the recipe for it.
8102 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8103 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8104 *PSE.getSE());
8106 Phi, Operands[0], Step, *II,
8108 [&](ElementCount VF) {
8109 return CM.isScalarAfterVectorization(Phi, VF);
8110 },
8111 Range));
8112 }
8113 return nullptr;
8114}
8115
8116VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8118 // Optimize the special case where the source is a constant integer
8119 // induction variable. Notice that we can only optimize the 'trunc' case
8120 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8121 // (c) other casts depend on pointer size.
8122
8123 // Determine whether \p K is a truncation based on an induction variable that
8124 // can be optimized.
8125 auto isOptimizableIVTruncate =
8126 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8127 return [=](ElementCount VF) -> bool {
8128 return CM.isOptimizableIVTruncate(K, VF);
8129 };
8130 };
8131
8133 isOptimizableIVTruncate(I), Range)) {
8134
8135 auto *Phi = cast<PHINode>(I->getOperand(0));
8137 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8138 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8139 *OrigLoop);
8140 }
8141 return nullptr;
8142}
8143
8144VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8146 unsigned NumIncoming = Phi->getNumIncomingValues();
8147
8148 // We know that all PHIs in non-header blocks are converted into selects, so
8149 // we don't have to worry about the insertion order and we can just use the
8150 // builder. At this point we generate the predication tree. There may be
8151 // duplications since this is a simple recursive scan, but future
8152 // optimizations will clean it up.
8153 // TODO: At the moment the first mask is always skipped, but it would be
8154 // better to skip the most expensive mask.
8155 SmallVector<VPValue *, 2> OperandsWithMask;
8156
8157 for (unsigned In = 0; In < NumIncoming; In++) {
8158 OperandsWithMask.push_back(Operands[In]);
8159 VPValue *EdgeMask =
8160 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8161 if (!EdgeMask) {
8162 assert(In == 0 && "Both null and non-null edge masks found");
8164 "Distinct incoming values with one having a full mask");
8165 break;
8166 }
8167 if (In == 0)
8168 continue;
8169 OperandsWithMask.push_back(EdgeMask);
8170 }
8171 return new VPBlendRecipe(Phi, OperandsWithMask);
8172}
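// Illustrative only: a blend of three incoming values built above later lowers
// to a select chain, conceptually
//   %s1 = select <VF x i1> %mask1, %v1, %v0
//   %r  = select <VF x i1> %mask2, %v2, %s1
// where %v0 is the first incoming value, whose mask is skipped (see the TODO
// above about skipping the most expensive mask instead).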
8173
8174VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8176 VFRange &Range) {
8178 [this, CI](ElementCount VF) {
8179 return CM.isScalarWithPredication(CI, VF);
8180 },
8181 Range);
8182
8183 if (IsPredicated)
8184 return nullptr;
8185
8187 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8188 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8189 ID == Intrinsic::pseudoprobe ||
8190 ID == Intrinsic::experimental_noalias_scope_decl))
8191 return nullptr;
8192
8193 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8194 Ops.push_back(Operands.back());
8195
8196 // Is it beneficial to perform an intrinsic call compared to a lib call?
8197 bool ShouldUseVectorIntrinsic =
8199 [&](ElementCount VF) -> bool {
8200 return CM.getCallWideningDecision(CI, VF).Kind ==
8202 },
8203 Range);
8204 if (ShouldUseVectorIntrinsic)
8205 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8206 CI->getDebugLoc());
8207
8208 Function *Variant = nullptr;
8209 std::optional<unsigned> MaskPos;
8210 // Is it better to call a vectorized version of the function than to scalarize
8211 // the call?
8212 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8213 [&](ElementCount VF) -> bool {
8214 // The following case may be scalarized depending on the VF.
8215 // The flag shows whether we can use a usual Call for the vectorized
8216 // version of the instruction.
8217
8218 // If we've found a variant at a previous VF, then stop looking. A
8219 // vectorized variant of a function expects input in a certain shape
8220 // -- basically the number of input registers, the number of lanes
8221 // per register, and whether there's a mask required.
8222 // We store a pointer to the variant in the VPWidenCallRecipe, so
8223 // once we have an appropriate variant it's only valid for that VF.
8224 // This will force a different vplan to be generated for each VF that
8225 // finds a valid variant.
8226 if (Variant)
8227 return false;
8229 CM.getCallWideningDecision(CI, VF);
8231 Variant = Decision.Variant;
8232 MaskPos = Decision.MaskPos;
8233 return true;
8234 }
8235
8236 return false;
8237 },
8238 Range);
8239 if (ShouldUseVectorCall) {
8240 if (MaskPos.has_value()) {
8241 // We have 2 cases that would require a mask:
8242 // 1) The block needs to be predicated, either due to a conditional
8243 // in the scalar loop or use of an active lane mask with
8244 // tail-folding, and we use the appropriate mask for the block.
8245 // 2) No mask is required for the block, but the only available
8246 // vector variant at this VF requires a mask, so we synthesize an
8247 // all-true mask.
8248 VPValue *Mask = nullptr;
8249 if (Legal->isMaskRequired(CI))
8250 Mask = getBlockInMask(CI->getParent());
8251 else
8253 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8254
8255 Ops.insert(Ops.begin() + *MaskPos, Mask);
8256 }
8257
8258 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8260 Variant);
8261 }
8262
8263 return nullptr;
8264}
8265
8266bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8267 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8268 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8269 // Instruction should be widened, unless it is scalar after vectorization,
8270 // scalarization is profitable or it is predicated.
8271 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8272 return CM.isScalarAfterVectorization(I, VF) ||
8273 CM.isProfitableToScalarize(I, VF) ||
8274 CM.isScalarWithPredication(I, VF);
8275 };
8277 Range);
8278}
8279
8280VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8282 VPBasicBlock *VPBB) {
8283 switch (I->getOpcode()) {
8284 default:
8285 return nullptr;
8286 case Instruction::SDiv:
8287 case Instruction::UDiv:
8288 case Instruction::SRem:
8289 case Instruction::URem: {
8290 // If not provably safe, use a select to form a safe divisor before widening the
8291 // div/rem operation itself. Otherwise fall through to general handling below.
8292 if (CM.isPredicatedInst(I)) {
8293 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8294 VPValue *Mask = getBlockInMask(I->getParent());
8295 VPValue *One =
8296 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8297 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8298 Ops[1] = SafeRHS;
8299 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8300 }
8301 [[fallthrough]];
8302 }
8303 case Instruction::Add:
8304 case Instruction::And:
8305 case Instruction::AShr:
8306 case Instruction::FAdd:
8307 case Instruction::FCmp:
8308 case Instruction::FDiv:
8309 case Instruction::FMul:
8310 case Instruction::FNeg:
8311 case Instruction::FRem:
8312 case Instruction::FSub:
8313 case Instruction::ICmp:
8314 case Instruction::LShr:
8315 case Instruction::Mul:
8316 case Instruction::Or:
8317 case Instruction::Select:
8318 case Instruction::Shl:
8319 case Instruction::Sub:
8320 case Instruction::Xor:
8321 case Instruction::Freeze:
8322 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8323 };
8324}
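// Illustrative only (conceptual IR): for a predicated udiv the safe-divisor
// path above rewrites
//   %r = udiv <VF x i32> %a, %b
// into
//   %safe.b = select <VF x i1> %block.mask, <VF x i32> %b, %one.splat
//   %r      = udiv <VF x i32> %a, %safe.b
// so masked-off lanes never divide by a possibly-zero %b.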
8325
8327 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8328 for (VPHeaderPHIRecipe *R : PhisToFix) {
8329 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8330 VPRecipeBase *IncR =
8331 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8332 R->addOperand(IncR->getVPSingleValue());
8333 }
8334}
8335
8337 VFRange &Range) {
8339 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8340 Range);
8341
8342 bool IsPredicated = CM.isPredicatedInst(I);
8343
8344 // Even if the instruction is not marked as uniform, there are certain
8345 // intrinsic calls that can be effectively treated as such, so we check for
8346 // them here. Conservatively, we only do this for scalable vectors, since
8347 // for fixed-width VFs we can always fall back on full scalarization.
8348 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8349 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8350 case Intrinsic::assume:
8351 case Intrinsic::lifetime_start:
8352 case Intrinsic::lifetime_end:
8353     // For scalable vectors, if one of the operands is variant then we still
8354     // want to mark it as uniform, which will generate one instruction for just
8355 // the first lane of the vector. We can't scalarize the call in the same
8356 // way as for fixed-width vectors because we don't know how many lanes
8357 // there are.
8358 //
8359 // The reasons for doing it this way for scalable vectors are:
8360 // 1. For the assume intrinsic generating the instruction for the first
8361     //    lane is still better than not generating any at all. For
8362 // example, the input may be a splat across all lanes.
8363 // 2. For the lifetime start/end intrinsics the pointer operand only
8364 // does anything useful when the input comes from a stack object,
8365 // which suggests it should always be uniform. For non-stack objects
8366 // the effect is to poison the object, which still allows us to
8367 // remove the call.
8368 IsUniform = true;
8369 break;
8370 default:
8371 break;
8372 }
8373 }
8374 VPValue *BlockInMask = nullptr;
8375 if (!IsPredicated) {
8376 // Finalize the recipe for Instr, first if it is not predicated.
8377 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8378 } else {
8379 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8380 // Instructions marked for predication are replicated and a mask operand is
8381 // added initially. Masked replicate recipes will later be placed under an
8382 // if-then construct to prevent side-effects. Generate recipes to compute
8383 // the block mask for this region.
8384 BlockInMask = getBlockInMask(I->getParent());
8385 }
8386
8387 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8388 IsUniform, BlockInMask);
8389 return Recipe;
8390}
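// Illustrative only: a predicated replicate recipe produced above is later
// placed under an if-then region, conceptually
//   if (%block.mask[lane]) { <scalar copy of I for this lane> }
// per unrolled part and lane, whereas a uniform, unpredicated replicate emits
// a scalar copy only for the first lane.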
8391
8395 VFRange &Range, VPBasicBlock *VPBB) {
8396 // First, check for specific widening recipes that deal with inductions, Phi
8397 // nodes, calls and memory operations.
8398 VPRecipeBase *Recipe;
8399 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8400 if (Phi->getParent() != OrigLoop->getHeader())
8401 return tryToBlend(Phi, Operands);
8402
8403 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8404 return Recipe;
8405
8406 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8407 assert((Legal->isReductionVariable(Phi) ||
8408 Legal->isFixedOrderRecurrence(Phi)) &&
8409 "can only widen reductions and fixed-order recurrences here");
8410 VPValue *StartV = Operands[0];
8411 if (Legal->isReductionVariable(Phi)) {
8412 const RecurrenceDescriptor &RdxDesc =
8413 Legal->getReductionVars().find(Phi)->second;
8414 assert(RdxDesc.getRecurrenceStartValue() ==
8415 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8416 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8417 CM.isInLoopReduction(Phi),
8418 CM.useOrderedReductions(RdxDesc));
8419 } else {
8420 // TODO: Currently fixed-order recurrences are modeled as chains of
8421 // first-order recurrences. If there are no users of the intermediate
8422 // recurrences in the chain, the fixed order recurrence should be modeled
8423 // directly, enabling more efficient codegen.
8424 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8425 }
8426
8427 PhisToFix.push_back(PhiRecipe);
8428 return PhiRecipe;
8429 }
8430
8431 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8432 cast<TruncInst>(Instr), Operands, Range)))
8433 return Recipe;
8434
8435 // All widen recipes below deal only with VF > 1.
8437 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8438 return nullptr;
8439
8440 if (auto *CI = dyn_cast<CallInst>(Instr))
8441 return tryToWidenCall(CI, Operands, Range);
8442
8443 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8444 return tryToWidenMemory(Instr, Operands, Range);
8445
8446 if (!shouldWiden(Instr, Range))
8447 return nullptr;
8448
8449 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8450 return new VPWidenGEPRecipe(GEP,
8451 make_range(Operands.begin(), Operands.end()));
8452
8453 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8454 return new VPWidenSelectRecipe(
8455 *SI, make_range(Operands.begin(), Operands.end()));
8456 }
8457
8458 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8459 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8460 *CI);
8461 }
8462
8463 return tryToWiden(Instr, Operands, VPBB);
8464}
8465
8466void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8467 ElementCount MaxVF) {
8468 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8469
8470 auto MaxVFTimes2 = MaxVF * 2;
8471 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8472 VFRange SubRange = {VF, MaxVFTimes2};
8473 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8474 // Now optimize the initial VPlan.
8475 if (!Plan->hasVF(ElementCount::getFixed(1)))
8477 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8478 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8479 // TODO: try to put it close to addActiveLaneMask().
8480 // Discard the plan if it is not EVL-compatible
8481 if (CM.foldTailWithEVL() &&
8483 break;
8484 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8485 VPlans.push_back(std::move(Plan));
8486 }
8487 VF = SubRange.End;
8488 }
8489}
8490
8491// Add the necessary canonical IV and branch recipes required to control the
8492// loop.
8493static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8494 DebugLoc DL) {
8495 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8496 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8497
8498 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8499 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8500 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8501 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8502 Header->insert(CanonicalIVPHI, Header->begin());
8503
8504 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8505 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8506 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8507 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8508 "index.next");
8509 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8510
8511 // Add the BranchOnCount VPInstruction to the latch.
8513 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8514}
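// Illustrative only (conceptual form of the recipes added above):
//   %index.next = add (nuw) i64 %index, VF * UF
//   branch-on-count %index.next, %vector.trip.count
// with the nuw flag present only when HasNUW is true, which the callers below
// derive from whether the tail is folded.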
8515
8516// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8517// original exit block.
8518static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8519 VPRecipeBuilder &Builder, VPlan &Plan) {
8520 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8521 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8522 // Only handle single-exit loops with unique exit blocks for now.
8523 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8524 return;
8525
8526 // Introduce VPUsers modeling the exit values.
8527 for (PHINode &ExitPhi : ExitBB->phis()) {
8528 Value *IncomingValue =
8529 ExitPhi.getIncomingValueForBlock(ExitingBB);
8530 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8531 Plan.addLiveOut(&ExitPhi, V);
8532 }
8533}
8534
8536LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8537
8539
8540 // ---------------------------------------------------------------------------
8541 // Build initial VPlan: Scan the body of the loop in a topological order to
8542 // visit each basic block after having visited its predecessor basic blocks.
8543 // ---------------------------------------------------------------------------
8544
8545 // Create initial VPlan skeleton, having a basic block for the pre-header
8546 // which contains SCEV expansions that need to happen before the CFG is
8547 // modified; a basic block for the vector pre-header, followed by a region for
8548 // the vector loop, followed by the middle basic block. The skeleton vector
8549 // loop region contains a header and latch basic blocks.
8551 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8552 *PSE.getSE(), OrigLoop->getLoopPreheader());
8553 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8554 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8555 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8556 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8557 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8558
8559   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8560   // so it is better to be conservative here rather than to split this
8561   // up into different VPlans.
8562 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8563 bool IVUpdateMayOverflow = false;
8564 for (ElementCount VF : Range)
8565 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8566
8568 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8569 // When not folding the tail, we know that the induction increment will not
8570 // overflow.
8571 bool HasNUW = Style == TailFoldingStyle::None;
8572 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8573
8574 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8575
8576 // ---------------------------------------------------------------------------
8577 // Pre-construction: record ingredients whose recipes we'll need to further
8578 // process after constructing the initial VPlan.
8579 // ---------------------------------------------------------------------------
8580
8581 // For each interleave group which is relevant for this (possibly trimmed)
8582 // Range, add it to the set of groups to be later applied to the VPlan and add
8583 // placeholders for its members' Recipes which we'll be replacing with a
8584 // single VPInterleaveRecipe.
8586 auto applyIG = [IG, this](ElementCount VF) -> bool {
8587 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8588 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8590 // For scalable vectors, the only interleave factor currently supported
8591 // is 2 since we require the (de)interleave2 intrinsics instead of
8592 // shufflevectors.
8593 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8594 "Unsupported interleave factor for scalable vectors");
8595 return Result;
8596 };
8597 if (!getDecisionAndClampRange(applyIG, Range))
8598 continue;
8599 InterleaveGroups.insert(IG);
8600 };
8601
8602 // ---------------------------------------------------------------------------
8603 // Construct recipes for the instructions in the loop
8604 // ---------------------------------------------------------------------------
8605
8606 // Scan the body of the loop in a topological order to visit each basic block
8607 // after having visited its predecessor basic blocks.
8608 LoopBlocksDFS DFS(OrigLoop);
8609 DFS.perform(LI);
8610
8611 VPBasicBlock *VPBB = HeaderVPBB;
8612 BasicBlock *HeaderBB = OrigLoop->getHeader();
8613 bool NeedsMasks =
8614 CM.foldTailByMasking() ||
8615 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8616 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8617 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8618 });
8619 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8620 // Relevant instructions from basic block BB will be grouped into VPRecipe
8621 // ingredients and fill a new VPBasicBlock.
8622 if (VPBB != HeaderVPBB)
8623 VPBB->setName(BB->getName());
8624 Builder.setInsertPoint(VPBB);
8625
8626 if (VPBB == HeaderVPBB)
8627 RecipeBuilder.createHeaderMask();
8628 else if (NeedsMasks)
8629 RecipeBuilder.createBlockInMask(BB);
8630
8631 // Introduce each ingredient into VPlan.
8632 // TODO: Model and preserve debug intrinsics in VPlan.
8633 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8634 Instruction *Instr = &I;
8636 auto *Phi = dyn_cast<PHINode>(Instr);
8637 if (Phi && Phi->getParent() == HeaderBB) {
8638 Operands.push_back(Plan->getOrAddLiveIn(
8639 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8640 } else {
8641 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8642 Operands = {OpRange.begin(), OpRange.end()};
8643 }
8644
8645       // Invariant stores inside the loop will be deleted and a single store
8646       // with the final reduction value will be added to the exit block.
8647 StoreInst *SI;
8648 if ((SI = dyn_cast<StoreInst>(&I)) &&
8649 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8650 continue;
8651
8652 VPRecipeBase *Recipe =
8653 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8654 if (!Recipe)
8655 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8656
8657 RecipeBuilder.setRecipe(Instr, Recipe);
8658 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8659 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8660 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8661 // recipes and need to be moved to the phi section of HeaderVPBB:
8662 // * tail-folding (non-phi recipes computing the header mask are
8663 // introduced earlier than regular header phi recipes, and should appear
8664 // after them)
8665 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8666
8667 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8668 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8669 "unexpected recipe needs moving");
8670 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8671 } else
8672 VPBB->appendRecipe(Recipe);
8673 }
8674
8676 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8677 }
8678
8679 // After here, VPBB should not be used.
8680 VPBB = nullptr;
8681
8682 if (CM.requiresScalarEpilogue(Range)) {
8683 // No edge from the middle block to the unique exit block has been inserted
8684     // and there is nothing to fix from the vector loop; phis should have
8685     // incoming values from the scalar loop only.
8686 } else
8687 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8688
8689 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8690 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8691 "entry block must be set to a VPRegionBlock having a non-empty entry "
8692 "VPBasicBlock");
8693 RecipeBuilder.fixHeaderPhis();
8694
8695 // ---------------------------------------------------------------------------
8696 // Transform initial VPlan: Apply previously taken decisions, in order, to
8697 // bring the VPlan to its final state.
8698 // ---------------------------------------------------------------------------
8699
8700 // Adjust the recipes for any inloop reductions.
8701 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8702
8703 // Interleave memory: for each Interleave Group we marked earlier as relevant
8704 // for this VPlan, replace the Recipes widening its memory instructions with a
8705 // single VPInterleaveRecipe at its insertion point.
8706 for (const auto *IG : InterleaveGroups) {
8707 auto *Recipe =
8708 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8709 SmallVector<VPValue *, 4> StoredValues;
8710 for (unsigned i = 0; i < IG->getFactor(); ++i)
8711 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8712 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8713 StoredValues.push_back(StoreR->getStoredValue());
8714 }
8715
8716 bool NeedsMaskForGaps =
8717 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8718 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8719 Recipe->getMask(), NeedsMaskForGaps);
8720 VPIG->insertBefore(Recipe);
8721 unsigned J = 0;
8722 for (unsigned i = 0; i < IG->getFactor(); ++i)
8723 if (Instruction *Member = IG->getMember(i)) {
8724 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8725 if (!Member->getType()->isVoidTy()) {
8726 VPValue *OriginalV = MemberR->getVPSingleValue();
8727 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8728 J++;
8729 }
8730 MemberR->eraseFromParent();
8731 }
8732 }
8733
8734 for (ElementCount VF : Range)
8735 Plan->addVF(VF);
8736 Plan->setName("Initial VPlan");
8737
8738   // Replace VPValues for known constant strides guaranteed by predicated
8739   // scalar evolution.
8740 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8741 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8742 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8743 // Only handle constant strides for now.
8744 if (!ScevStride)
8745 continue;
8746
8747 auto *CI = Plan->getOrAddLiveIn(
8748 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
8749 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
8750 StrideVPV->replaceAllUsesWith(CI);
8751
8752 // The versioned value may not be used in the loop directly but through a
8753 // sext/zext. Add new live-ins in those cases.
8754 for (Value *U : StrideV->users()) {
8755 if (!isa<SExtInst, ZExtInst>(U))
8756 continue;
8757 VPValue *StrideVPV = Plan->getLiveIn(U);
8758 if (!StrideVPV)
8759 continue;
8760 unsigned BW = U->getType()->getScalarSizeInBits();
8761 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
8762 : ScevStride->getAPInt().zext(BW);
8763 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
8764 StrideVPV->replaceAllUsesWith(CI);
8765 }
8766 }
8767
8769 return Legal->blockNeedsPredication(BB);
8770 });
8771
8772 // Sink users of fixed-order recurrence past the recipe defining the previous
8773 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8775 return nullptr;
8776
8777 if (useActiveLaneMask(Style)) {
8778 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8779 // TailFoldingStyle is visible there.
8780 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8781 bool WithoutRuntimeCheck =
8783 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8784 WithoutRuntimeCheck);
8785 }
8786 return Plan;
8787}
8788
8789VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8790 // Outer loop handling: They may require CFG and instruction level
8791 // transformations before even evaluating whether vectorization is profitable.
8792 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8793 // the vectorization pipeline.
8794 assert(!OrigLoop->isInnermost());
8795 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8796
8797 // Create new empty VPlan
8798 auto Plan = VPlan::createInitialVPlan(
8799 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8800 *PSE.getSE(), OrigLoop->getLoopPreheader());
8801
8802 // Build hierarchical CFG
8803 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8804 HCFGBuilder.buildHierarchicalCFG();
8805
8806 for (ElementCount VF : Range)
8807 Plan->addVF(VF);
8808
8810 Plan,
8811 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8812 *PSE.getSE(), *TLI);
8813
8814 // Remove the existing terminator of the exiting block of the top-most region.
8815 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8816 auto *Term =
8817 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8818 Term->eraseFromParent();
8819
8820 // Tail folding is not supported for outer loops, so the induction increment
8821 // is guaranteed to not wrap.
8822 bool HasNUW = true;
8823 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8824 DebugLoc());
8825 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8826 return Plan;
8827}
8828
8829// Adjust the recipes for reductions. For in-loop reductions the chain of
8830// instructions leading from the loop exit instr to the phi needs to be converted
8831// to reductions, with one operand being vector and the other being the scalar
8832// reduction chain. For other reductions, a select is introduced between the phi
8833// and live-out recipes when folding the tail.
8834//
8835// A ComputeReductionResult recipe is added to the middle block, also for
8836// in-loop reductions which compute their result in-loop, because generating
8837// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8838//
8839// Adjust AnyOf reductions; replace the reduction phi for the selected value
8840// with a boolean reduction phi node to check if the condition is true in any
8841// iteration. The final value is selected by the final ComputeReductionResult.
8842void LoopVectorizationPlanner::adjustRecipesForReductions(
8843 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8844 ElementCount MinVF) {
8845 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8846 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8847   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8848   // sunk outside of the loop keep the same order as they had in the
8849   // original loop.
8850 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8851 for (VPRecipeBase &R : Header->phis()) {
8852 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8853 ReductionPHIList.emplace_back(ReductionPhi);
8854 }
8855 bool HasIntermediateStore = false;
8856 stable_sort(ReductionPHIList,
8857 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8858 const VPReductionPHIRecipe *R2) {
8859 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8860 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8861 HasIntermediateStore |= IS1 || IS2;
8862
8863 // If neither of the recipes has an intermediate store, keep the
8864 // order the same.
8865 if (!IS1 && !IS2)
8866 return false;
8867
8868 // If only one of the recipes has an intermediate store, then
8869 // move it towards the beginning of the list.
8870 if (IS1 && !IS2)
8871 return true;
8872
8873 if (!IS1 && IS2)
8874 return false;
8875
8876 // If both recipes have an intermediate store, then the recipe
8877 // with the later store should be processed earlier. So it
8878 // should go to the beginning of the list.
8879 return DT->dominates(IS2, IS1);
8880 });
8881
8882 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8883 for (VPRecipeBase *R : ReductionPHIList)
8884 R->moveBefore(*Header, Header->getFirstNonPhi());
8885
8886 for (VPRecipeBase &R : Header->phis()) {
8887 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8888 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8889 continue;
8890
8891 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8892 RecurKind Kind = RdxDesc.getRecurrenceKind();
8894 "AnyOf reductions are not allowed for in-loop reductions");
8895
8896 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8898 Worklist.insert(PhiR);
8899 for (unsigned I = 0; I != Worklist.size(); ++I) {
8900 VPSingleDefRecipe *Cur = Worklist[I];
8901 for (VPUser *U : Cur->users()) {
8902 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8903 if (!UserRecipe) {
8904 assert(isa<VPLiveOut>(U) &&
8905 "U must either be a VPSingleDef or VPLiveOut");
8906 continue;
8907 }
8908 Worklist.insert(UserRecipe);
8909 }
8910 }
8911
8912 // Visit operation "Links" along the reduction chain top-down starting from
8913 // the phi until LoopExitValue. We keep track of the previous item
8914 // (PreviousLink) to tell which of the two operands of a Link will remain
8915 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8916 // the select instruction. Blend recipes of in-loop reduction phis will
8917 // get folded to their non-phi operand, as the reduction recipe handles the
8918 // condition directly.
8919 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8920 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8921 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8922
8923 // Index of the first operand which holds a non-mask vector operand.
8924 unsigned IndexOfFirstOperand;
8925 // Recognize a call to the llvm.fmuladd intrinsic.
8926 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8927 VPValue *VecOp;
8928 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8929 if (IsFMulAdd) {
8930 assert(
8932 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8933 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8934 isa<VPWidenCallRecipe>(CurrentLink)) &&
8935 CurrentLink->getOperand(2) == PreviousLink &&
8936 "expected a call where the previous link is the added operand");
8937
8938 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8939 // need to create an fmul recipe (multiplying the first two operands of
8940 // the fmuladd together) to use as the vector operand for the fadd
8941 // reduction.
8942 VPInstruction *FMulRecipe = new VPInstruction(
8943 Instruction::FMul,
8944 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8945 CurrentLinkI->getFastMathFlags());
8946 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8947 VecOp = FMulRecipe;
8948 } else {
8949 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8950 if (PhiR->isInLoop() && Blend) {
8951 assert(Blend->getNumIncomingValues() == 2 &&
8952 "Blend must have 2 incoming values");
8953 if (Blend->getIncomingValue(0) == PhiR)
8954 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8955 else {
8956 assert(Blend->getIncomingValue(1) == PhiR &&
8957 "PhiR must be an operand of the blend");
8958 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8959 }
8960 continue;
8961 }
8962
8964 if (isa<VPWidenRecipe>(CurrentLink)) {
8965 assert(isa<CmpInst>(CurrentLinkI) &&
8966 "need to have the compare of the select");
8967 continue;
8968 }
8969 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8970 "must be a select recipe");
8971 IndexOfFirstOperand = 1;
8972 } else {
8973 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8974 "Expected to replace a VPWidenSC");
8975 IndexOfFirstOperand = 0;
8976 }
8977 // Note that for non-commutable operands (cmp-selects), the semantics of
8978 // the cmp-select are captured in the recurrence kind.
8979 unsigned VecOpId =
8980 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8981 ? IndexOfFirstOperand + 1
8982 : IndexOfFirstOperand;
8983 VecOp = CurrentLink->getOperand(VecOpId);
8984 assert(VecOp != PreviousLink &&
8985 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8986 (VecOpId - IndexOfFirstOperand)) ==
8987 PreviousLink &&
8988 "PreviousLink must be the operand other than VecOp");
8989 }
8990
8991 BasicBlock *BB = CurrentLinkI->getParent();
8992 VPValue *CondOp = nullptr;
8994 CondOp = RecipeBuilder.getBlockInMask(BB);
8995
8996 VPReductionRecipe *RedRecipe =
8997 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
8998 CondOp, CM.useOrderedReductions(RdxDesc));
8999 // Append the recipe to the end of the VPBasicBlock because we need to
9000 // ensure that it comes after all of its inputs, including CondOp.
9001 // Note that this transformation may leave over dead recipes (including
9002 // CurrentLink), which will be cleaned by a later VPlan transform.
9003 LinkVPBB->appendRecipe(RedRecipe);
9004 CurrentLink->replaceAllUsesWith(RedRecipe);
9005 PreviousLink = RedRecipe;
9006 }
9007 }
9008 Builder.setInsertPoint(&*LatchVPBB->begin());
9009 for (VPRecipeBase &R :
9010 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9011 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9012 if (!PhiR)
9013 continue;
9014
9015 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9016 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9017 // with a boolean reduction phi node to check if the condition is true in
9018 // any iteration. The final value is selected by the final
9019 // ComputeReductionResult.
9021 RdxDesc.getRecurrenceKind())) {
9022 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9023 return isa<VPWidenSelectRecipe>(U) ||
9024 (isa<VPReplicateRecipe>(U) &&
9025 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9026 Instruction::Select);
9027 }));
9028 VPValue *Cmp = Select->getOperand(0);
9029 // If the compare is checking the reduction PHI node, adjust it to check
9030 // the start value.
9031 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9032 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9033 if (CmpR->getOperand(I) == PhiR)
9034 CmpR->setOperand(I, PhiR->getStartValue());
9035 }
9036 VPBuilder::InsertPointGuard Guard(Builder);
9037 Builder.setInsertPoint(Select);
9038
9039 // If the true value of the select is the reduction phi, the new value is
9040 // selected if the negated condition is true in any iteration.
9041 if (Select->getOperand(1) == PhiR)
9042 Cmp = Builder.createNot(Cmp);
9043 VPValue *Or = Builder.createOr(PhiR, Cmp);
9044 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9045
9046 // Convert the reduction phi to operate on bools.
9047 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9048 OrigLoop->getHeader()->getContext())));
9049 }
9050
9051 // If the tail is folded by masking, introduce selects between the phi
9052 // and the live-out instruction of each reduction, at the beginning of the
9053 // dedicated latch block.
9054 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9055 auto *NewExitingVPV = PhiR->getBackedgeValue();
9056 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9057 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9058 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9059 "reduction recipe must be defined before latch");
9060 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9061 std::optional<FastMathFlags> FMFs =
9062 PhiTy->isFloatingPointTy()
9063 ? std::make_optional(RdxDesc.getFastMathFlags())
9064 : std::nullopt;
9065 NewExitingVPV =
9066 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9067 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9068 return isa<VPInstruction>(&U) &&
9069 cast<VPInstruction>(&U)->getOpcode() ==
9071 });
9074 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9076 PhiR->setOperand(1, NewExitingVPV);
9077 }
9078
9079 // If the vector reduction can be performed in a smaller type, we truncate
9080 // then extend the loop exit value to enable InstCombine to evaluate the
9081 // entire expression in the smaller type.
9082 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9083 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9085 RdxDesc.getRecurrenceKind())) {
9086 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9087 Type *RdxTy = RdxDesc.getRecurrenceType();
9088 auto *Trunc =
9089 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9090 auto *Extnd =
9091 RdxDesc.isSigned()
9092 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9093 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9094
9095 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9096 Extnd->insertAfter(Trunc);
9097 if (PhiR->getOperand(1) == NewExitingVPV)
9098 PhiR->setOperand(1, Extnd->getVPSingleValue());
9099 NewExitingVPV = Extnd;
9100 }
9101
9102 // We want code in the middle block to appear to execute on the location of
9103 // the scalar loop's latch terminator because: (a) it is all compiler
9104 // generated, (b) these instructions are always executed after evaluating
9105 // the latch conditional branch, and (c) other passes may add new
9106 // predecessors which terminate on this line. This is the easiest way to
9107 // ensure we don't accidentally cause an extra step back into the loop while
9108 // debugging.
9109 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9110
9111 // TODO: At the moment ComputeReductionResult also drives creation of the
9112 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9113 // even for in-loop reductions, until the reduction resume value handling is
9114 // also modeled in VPlan.
9115 auto *FinalReductionResult = new VPInstruction(
9116 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9117 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9118 ->appendRecipe(FinalReductionResult);
9119 OrigExitingVPV->replaceUsesWithIf(
9120 FinalReductionResult,
9121 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9122 }
9123
9125}
9126
9127#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9128 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9129 VPSlotTracker &SlotTracker) const {
9130 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9131 IG->getInsertPos()->printAsOperand(O, false);
9132 O << ", ";
9133 getAddr()->printAsOperand(O, SlotTracker);
9134 VPValue *Mask = getMask();
9135 if (Mask) {
9136 O << ", ";
9137 Mask->printAsOperand(O, SlotTracker);
9138 }
9139
9140 unsigned OpIdx = 0;
9141 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9142 if (!IG->getMember(i))
9143 continue;
9144 if (getNumStoreOperands() > 0) {
9145 O << "\n" << Indent << " store ";
9146 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9147 O << " to index " << i;
9148 } else {
9149 O << "\n" << Indent << " ";
9150 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9151 O << " = load from index " << i;
9152 }
9153 ++OpIdx;
9154 }
9155}
9156#endif
9157
9158 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9159 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9160 "Not a pointer induction according to InductionDescriptor!");
9161 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9162 "Unexpected type.");
9163 assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9164 "Recipe should have been replaced");
9165
9166 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9167 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9168 Type *PhiType = IndDesc.getStep()->getType();
9169
9170 // Build a pointer phi
9171 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9172 Type *ScStValueType = ScalarStartValue->getType();
9173 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9174 CanonicalIV->getIterator());
9175
9176 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9177 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9178
9179 // A pointer induction, performed by using a gep
9180 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9181
9182 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9183 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9184 Value *NumUnrolledElems =
9185 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9186 Value *InductionGEP = GetElementPtrInst::Create(
9187 State.Builder.getInt8Ty(), NewPointerPhi,
9188 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9189 InductionLoc);
9190 // Add induction update using an incorrect block temporarily. The phi node
9191 // will be fixed after VPlan execution. Note that at this point the latch
9192 // block cannot be used, as it does not exist yet.
9193 // TODO: Model increment value in VPlan, by turning the recipe into a
9194 // multi-def and a subclass of VPHeaderPHIRecipe.
9195 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9196
9197 // Create UF many actual address geps that use the pointer
9198 // phi as base and a vectorized version of the step value
9199 // (<step*0, ..., step*N>) as offset.
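// For example (hypothetical values): with UF = 2, VF = 4 and scalar step S,
// part 0 uses offsets <0, S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>
// relative to the pointer phi.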
9200 for (unsigned Part = 0; Part < State.UF; ++Part) {
9201 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9202 Value *StartOffsetScalar =
9203 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9204 Value *StartOffset =
9205 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9206 // Create a vector of consecutive numbers from zero to VF.
9207 StartOffset = State.Builder.CreateAdd(
9208 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9209
9210 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9211 "scalar step must be the same across all parts");
9212 Value *GEP = State.Builder.CreateGEP(
9213 State.Builder.getInt8Ty(), NewPointerPhi,
9214 State.Builder.CreateMul(
9215 StartOffset,
9216 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9217 "vector.gep"));
9218 State.set(this, GEP, Part);
9219 }
9220}
9221
9222 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9223 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9224
9225 // Fast-math-flags propagate from the original induction instruction.
9226 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9227 if (FPBinOp)
9228 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9229
9230 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9231 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9232 Value *DerivedIV = emitTransformedIndex(
9233 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9234 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9235 DerivedIV->setName("offset.idx");
9236 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9237
9238 State.set(this, DerivedIV, VPIteration(0, 0));
9239}
9240
9241 void VPInterleaveRecipe::execute(VPTransformState &State) {
9242 assert(!State.Instance && "Interleave group being replicated.");
9243 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9244 getStoredValues(), getMask(),
9245 NeedsMaskForGaps);
9246}
9247
9248 void VPReplicateRecipe::execute(VPTransformState &State) {
9249 Instruction *UI = getUnderlyingInstr();
9250 if (State.Instance) { // Generate a single instance.
9251 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9252 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9253 // Insert scalar instance packing it into a vector.
9254 if (State.VF.isVector() && shouldPack()) {
9255 // If we're constructing lane 0, initialize to start from poison.
9256 if (State.Instance->Lane.isFirstLane()) {
9257 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9258 Value *Poison = PoisonValue::get(
9259 VectorType::get(UI->getType(), State.VF));
9260 State.set(this, Poison, State.Instance->Part);
9261 }
9262 State.packScalarIntoVectorValue(this, *State.Instance);
9263 }
9264 return;
9265 }
9266
9267 if (IsUniform) {
9268 // If the recipe is uniform across all parts (instead of just per VF), only
9269 // generate a single instance.
9270 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9271 all_of(operands(), [](VPValue *Op) {
9272 return Op->isDefinedOutsideVectorRegions();
9273 })) {
9274 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9275 if (user_begin() != user_end()) {
9276 for (unsigned Part = 1; Part < State.UF; ++Part)
9277 State.set(this, State.get(this, VPIteration(0, 0)),
9278 VPIteration(Part, 0));
9279 }
9280 return;
9281 }
9282
9283 // Uniform within VL means we need to generate lane 0 only for each
9284 // unrolled copy.
9285 for (unsigned Part = 0; Part < State.UF; ++Part)
9286 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9287 return;
9288 }
9289
9290 // A store of a loop varying value to a uniform address only needs the last
9291 // copy of the store.
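// E.g. (illustrative) for "for (i) *p = a[i];" with a loop-invariant p, only
// the value stored by the last part and last lane is visible after the loop.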
9292 if (isa<StoreInst>(UI) &&
9293 vputils::isUniformAfterVectorization(getOperand(1))) {
9294 auto Lane = VPLane::getLastLaneForVF(State.VF);
9295 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9296 State);
9297 return;
9298 }
9299
9300 // Generate scalar instances for all VF lanes of all UF parts.
9301 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9302 const unsigned EndLane = State.VF.getKnownMinValue();
9303 for (unsigned Part = 0; Part < State.UF; ++Part)
9304 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9305 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9306}
9307
9308 void VPWidenLoadRecipe::execute(VPTransformState &State) {
9309 auto *LI = cast<LoadInst>(&Ingredient);
9310
9311 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9312 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9313 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9314 bool CreateGather = !isConsecutive();
9315
9316 auto &Builder = State.Builder;
9318 for (unsigned Part = 0; Part < State.UF; ++Part) {
9319 Value *NewLI;
9320 Value *Mask = nullptr;
9321 if (auto *VPMask = getMask()) {
9322 // Mask reversal is only needed for non-all-one masks. A null mask stands for
9323 // an all-one mask, and the reverse of an all-one mask is itself all-one.
9324 Mask = State.get(VPMask, Part);
9325 if (isReverse())
9326 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9327 }
9328
9329 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9330 if (CreateGather) {
9331 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9332 "wide.masked.gather");
9333 } else if (Mask) {
9334 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9335 PoisonValue::get(DataTy),
9336 "wide.masked.load");
9337 } else {
9338 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9339 }
9340 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9341 State.addMetadata(NewLI, LI);
9342 if (Reverse)
9343 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9344 State.set(this, NewLI, Part);
9345 }
9346}
9347
9348/// Use all-true mask for reverse rather than actual mask, as it avoids a
9349/// dependence w/o affecting the result.
9350 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
9351 Value *EVL, const Twine &Name) {
9352 VectorType *ValTy = cast<VectorType>(Operand->getType());
9353 Value *AllTrueMask =
9354 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9355 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9356 {Operand, AllTrueMask, EVL}, nullptr, Name);
9357}
9358
9359 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9360 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9361 "explicit vector length.");
9362 auto *LI = cast<LoadInst>(&Ingredient);
9363
9364 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9365 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9366 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9367 bool CreateGather = !isConsecutive();
9368
9369 auto &Builder = State.Builder;
9371 CallInst *NewLI;
9372 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9373 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9374 Value *Mask = nullptr;
9375 if (VPValue *VPMask = getMask()) {
9376 Mask = State.get(VPMask, 0);
9377 if (isReverse())
9378 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9379 } else {
9380 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9381 }
9382
9383 if (CreateGather) {
9384 NewLI =
9385 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9386 nullptr, "wide.masked.gather");
9387 } else {
9388 VectorBuilder VBuilder(Builder);
9389 VBuilder.setEVL(EVL).setMask(Mask);
9390 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9391 Instruction::Load, DataTy, Addr, "vp.op.load"));
9392 }
9393 NewLI->addParamAttr(
9394 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9395 State.addMetadata(NewLI, LI);
9396 Instruction *Res = NewLI;
9397 if (isReverse())
9398 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9399 State.set(this, Res, 0);
9400}
9401
9402 void VPWidenStoreRecipe::execute(VPTransformState &State) {
9403 auto *SI = cast<StoreInst>(&Ingredient);
9404
9405 VPValue *StoredVPValue = getStoredValue();
9406 bool CreateScatter = !isConsecutive();
9407 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9408
9409 auto &Builder = State.Builder;
9411
9412 for (unsigned Part = 0; Part < State.UF; ++Part) {
9413 Instruction *NewSI = nullptr;
9414 Value *Mask = nullptr;
9415 if (auto *VPMask = getMask()) {
9416 // Mask reversal is only needed for non-all-one masks. A null mask stands for
9417 // an all-one mask, and the reverse of an all-one mask is itself all-one.
9418 Mask = State.get(VPMask, Part);
9419 if (isReverse())
9420 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9421 }
9422
9423 Value *StoredVal = State.get(StoredVPValue, Part);
9424 if (isReverse()) {
9425 // If we store to reverse consecutive memory locations, then we need
9426 // to reverse the order of elements in the stored value.
9427 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9428 // We don't want to update the value in the map as it might be used in
9429 // another expression. So don't call resetVectorValue(StoredVal).
9430 }
9431 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9432 if (CreateScatter)
9433 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9434 else if (Mask)
9435 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9436 else
9437 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9438 State.addMetadata(NewSI, SI);
9439 }
9440}
9441
9442 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9443 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9444 "explicit vector length.");
9445 auto *SI = cast<StoreInst>(&Ingredient);
9446
9447 VPValue *StoredValue = getStoredValue();
9448 bool CreateScatter = !isConsecutive();
9449 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9450
9451 auto &Builder = State.Builder;
9453
9454 CallInst *NewSI = nullptr;
9455 Value *StoredVal = State.get(StoredValue, 0);
9456 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9457 if (isReverse())
9458 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9459 Value *Mask = nullptr;
9460 if (VPValue *VPMask = getMask()) {
9461 Mask = State.get(VPMask, 0);
9462 if (isReverse())
9463 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9464 } else {
9465 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9466 }
9467 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9468 if (CreateScatter) {
9469 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9470 Intrinsic::vp_scatter,
9471 {StoredVal, Addr, Mask, EVL});
9472 } else {
9473 VectorBuilder VBuilder(Builder);
9474 VBuilder.setEVL(EVL).setMask(Mask);
9475 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9476 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9477 {StoredVal, Addr}));
9478 }
9479 NewSI->addParamAttr(
9480 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9481 State.addMetadata(NewSI, SI);
9482}
9483
9484// Determine how to lower the scalar epilogue, which depends on 1) optimising
9485// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9486// predication, and 4) a TTI hook that analyses whether the loop is suitable
9487// for predication.
9488 static ScalarEpilogueLowering getScalarEpilogueLowering(
9489 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9490 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9491 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9492 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9493 // don't look at hints or options, and don't request a scalar epilogue.
9494 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9495 // LoopAccessInfo (due to code dependency and not being able to reliably get
9496 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9497 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9498 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9499 // back to the old way and vectorize with versioning when forced. See D81345.)
9500 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9501 PGSOQueryType::IRPass) &&
9502 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9503 return CM_ScalarEpilogueNotAllowedOptSize;
9504
9505 // 2) If set, obey the directives
9506 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9507 switch (PreferPredicateOverEpilogue) {
9508 case PreferPredicateTy::ScalarEpilogue:
9509 return CM_ScalarEpilogueAllowed;
9510 case PreferPredicateTy::PredicateElseScalarEpilogue:
9511 return CM_ScalarEpilogueNotNeededUsePredicate;
9512 case PreferPredicateTy::PredicateOrDontVectorize:
9513 return CM_ScalarEpilogueNotAllowedUsePredicate;
9514 };
9515 }
9516
9517 // 3) If set, obey the hints
9518 switch (Hints.getPredicate()) {
9519 case LoopVectorizeHints::FK_Enabled:
9520 return CM_ScalarEpilogueNotNeededUsePredicate;
9521 case LoopVectorizeHints::FK_Disabled:
9522 return CM_ScalarEpilogueAllowed;
9523 };
9524
9525 // 4) if the TTI hook indicates this is profitable, request predication.
9526 TailFoldingInfo TFI(TLI, &LVL, IAI);
9527 if (TTI->preferPredicateOverEpilogue(&TFI))
9528 return CM_ScalarEpilogueNotNeededUsePredicate;
9529
9530 return CM_ScalarEpilogueAllowed;
9531 }
9532
9533// Process the loop in the VPlan-native vectorization path. This path builds
9534// VPlan upfront in the vectorization pipeline, which allows to apply
9535// VPlan-to-VPlan transformations from the very beginning without modifying the
9536// input LLVM IR.
9537 static bool processLoopInVPlanNativePath(
9538 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9539 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9540 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9541 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9542 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9543 LoopVectorizationRequirements &Requirements) {
9544
9545 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9546 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9547 return false;
9548 }
9549 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9550 Function *F = L->getHeader()->getParent();
9551 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9552
9553 ScalarEpilogueLowering SEL =
9554 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9555
9556 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9557 &Hints, IAI);
9558 // Use the planner for outer loop vectorization.
9559 // TODO: CM is not used at this point inside the planner. Turn CM into an
9560 // optional argument if we don't need it in the future.
9561 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9562 ORE);
9563
9564 // Get user vectorization factor.
9565 ElementCount UserVF = Hints.getWidth();
9566
9568
9569 // Plan how to best vectorize, return the best VF and its cost.
9570 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9571
9572 // If we are stress testing VPlan builds, do not attempt to generate vector
9573 // code. Masked vector code generation support will follow soon.
9574 // Also, do not attempt to vectorize if no vector code will be produced.
9575 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9576 return false;
9577
9578 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9579
9580 {
9581 bool AddBranchWeights =
9582 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9583 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9584 F->getDataLayout(), AddBranchWeights);
9585 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9586 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9587 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9588 << L->getHeader()->getParent()->getName() << "\"\n");
9589 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9590 }
9591
9592 reportVectorization(ORE, L, VF, 1);
9593
9594 // Mark the loop as already vectorized to avoid vectorizing again.
9595 Hints.setAlreadyVectorized();
9596 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9597 return true;
9598}
9599
9600 // Emit a remark if there are stores to floats that required a floating point
9601 // extension. If the vectorized loop contains such conversions, there will be a
9602 // performance penalty from the conversion overhead and from the change in the
9603 // vector width.
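// For example (illustrative C, not from the source): with float f[], g[],
// "f[i] = g[i] * 1.2;" promotes g[i] to double, so the vector loop mixes
// double-width and float-width vectors and needs fpext/fptrunc casts around
// the multiply; this function walks up from such stores to find the fpext.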
9604 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9605 SmallVector<Instruction *, 4> Worklist;
9606 for (BasicBlock *BB : L->getBlocks()) {
9607 for (Instruction &Inst : *BB) {
9608 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9609 if (S->getValueOperand()->getType()->isFloatTy())
9610 Worklist.push_back(S);
9611 }
9612 }
9613 }
9614
9615 // Traverse the floating point stores upwards, searching for floating point
9616 // conversions.
9617 SmallPtrSet<const Instruction *, 4> Visited;
9618 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9619 while (!Worklist.empty()) {
9620 auto *I = Worklist.pop_back_val();
9621 if (!L->contains(I))
9622 continue;
9623 if (!Visited.insert(I).second)
9624 continue;
9625
9626 // Emit a remark if the floating point store required a floating
9627 // point conversion.
9628 // TODO: More work could be done to identify the root cause such as a
9629 // constant or a function return type and point the user to it.
9630 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9631 ORE->emit([&]() {
9632 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9633 I->getDebugLoc(), L->getHeader())
9634 << "floating point conversion changes vector width. "
9635 << "Mixed floating point precision requires an up/down "
9636 << "cast that will negatively impact performance.";
9637 });
9638
9639 for (Use &Op : I->operands())
9640 if (auto *OpI = dyn_cast<Instruction>(Op))
9641 Worklist.push_back(OpI);
9642 }
9643}
9644
9645static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9646 VectorizationFactor &VF,
9647 std::optional<unsigned> VScale, Loop *L,
9648 ScalarEvolution &SE,
9649 ScalarEpilogueLowering SEL) {
9650 InstructionCost CheckCost = Checks.getCost();
9651 if (!CheckCost.isValid())
9652 return false;
9653
9654 // When only interleaving, the scalar and vector costs are equal, which in
9655 // turn would lead to a divide by 0 below. Fall back to the hard threshold.
9656 if (VF.Width.isScalar()) {
9657 if (CheckCost > VectorizeMemoryCheckThreshold) {
9658 LLVM_DEBUG(
9659 dbgs()
9660 << "LV: Interleaving only is not profitable due to runtime checks\n");
9661 return false;
9662 }
9663 return true;
9664 }
9665
9666 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9667 uint64_t ScalarC = *VF.ScalarCost.getValue();
9668 if (ScalarC == 0)
9669 return true;
9670
9671 // First, compute the minimum iteration count required so that the vector
9672 // loop outperforms the scalar loop.
9673 // The total cost of the scalar loop is
9674 // ScalarC * TC
9675 // where
9676 // * TC is the actual trip count of the loop.
9677 // * ScalarC is the cost of a single scalar iteration.
9678 //
9679 // The total cost of the vector loop is
9680 // RtC + VecC * (TC / VF) + EpiC
9681 // where
9682 // * RtC is the cost of the generated runtime checks
9683 // * VecC is the cost of a single vector iteration.
9684 // * TC is the actual trip count of the loop
9685 // * VF is the vectorization factor
9686 // * EpiCost is the cost of the generated epilogue, including the cost
9687 // of the remaining scalar operations.
9688 //
9689 // Vectorization is profitable once the total vector cost is less than the
9690 // total scalar cost:
9691 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9692 //
9693 // Now we can compute the minimum required trip count TC as
9694 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9695 //
9696 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9697 // the computations are performed on doubles, not integers and the result
9698 // is rounded up, hence we get an upper estimate of the TC.
9699 unsigned IntVF = VF.Width.getKnownMinValue();
9700 if (VF.Width.isScalable()) {
9701 unsigned AssumedMinimumVscale = 1;
9702 if (VScale)
9703 AssumedMinimumVscale = *VScale;
9704 IntVF *= AssumedMinimumVscale;
9705 }
9706 uint64_t RtC = *CheckCost.getValue();
9707 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9708 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9709
9710 // Second, compute a minimum iteration count so that the cost of the
9711 // runtime checks is only a fraction of the total scalar loop cost. This
9712 // adds a loop-dependent bound on the overhead incurred if the runtime
9713 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9714 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9715 // cost, compute
9716 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9717 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
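// Worked example with hypothetical costs: ScalarC = 4, VecC = 8, RtC = 20 and
// IntVF = 4 give MinTC1 = ceil(20 * 4 / (4 * 4 - 8)) = 10 and
// MinTC2 = ceil(20 * 10 / 4) = 50, so at least 50 iterations are needed
// before the runtime checks pay for themselves.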
9718
9719 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9720 // epilogue is allowed, choose the next closest multiple of VF. This should
9721 // partly compensate for ignoring the epilogue cost.
9722 uint64_t MinTC = std::max(MinTC1, MinTC2);
9723 if (SEL == CM_ScalarEpilogueAllowed)
9724 MinTC = alignTo(MinTC, IntVF);
9725 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9726
9727 LLVM_DEBUG(
9728 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9729 << VF.MinProfitableTripCount << "\n");
9730
9731 // Skip vectorization if the expected trip count is less than the minimum
9732 // required trip count.
9733 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9734 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9735 VF.MinProfitableTripCount)) {
9736 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9737 "trip count < minimum profitable VF ("
9738 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9739 << ")\n");
9740
9741 return false;
9742 }
9743 }
9744 return true;
9745}
9746
9747 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9748 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9749 !EnableLoopInterleaving),
9750 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9751 !EnableLoopVectorization) {}
9752
9753 bool LoopVectorizePass::processLoop(Loop *L) {
9754 assert((EnableVPlanNativePath || L->isInnermost()) &&
9755 "VPlan-native path is not enabled. Only process inner loops.");
9756
9757 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9758 << L->getHeader()->getParent()->getName() << "' from "
9759 << L->getLocStr() << "\n");
9760
9761 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9762
9763 LLVM_DEBUG(
9764 dbgs() << "LV: Loop hints:"
9765 << " force="
9767 ? "disabled"
9769 ? "enabled"
9770 : "?"))
9771 << " width=" << Hints.getWidth()
9772 << " interleave=" << Hints.getInterleave() << "\n");
9773
9774 // Function containing loop
9775 Function *F = L->getHeader()->getParent();
9776
9777 // Looking at the diagnostic output is the only way to determine if a loop
9778 // was vectorized (other than looking at the IR or machine code), so it
9779 // is important to generate an optimization remark for each loop. Most of
9780 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9781 // generated as OptimizationRemark and OptimizationRemarkMissed are
9782 // less verbose reporting vectorized loops and unvectorized loops that may
9783 // benefit from vectorization, respectively.
9784
9785 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9786 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9787 return false;
9788 }
9789
9790 PredicatedScalarEvolution PSE(*SE, *L);
9791
9792 // Check if it is legal to vectorize the loop.
9793 LoopVectorizationRequirements Requirements;
9794 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9795 &Requirements, &Hints, DB, AC, BFI, PSI);
9796 if (!LVL.canVectorize(EnableVPlanNativePath)) {
9797 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9798 Hints.emitRemarkWithHints();
9799 return false;
9800 }
9801
9802 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9803 // here. They may require CFG and instruction level transformations before
9804 // even evaluating whether vectorization is profitable. Since we cannot modify
9805 // the incoming IR, we need to build VPlan upfront in the vectorization
9806 // pipeline.
9807 if (!L->isInnermost())
9808 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9809 ORE, BFI, PSI, Hints, Requirements);
9810
9811 assert(L->isInnermost() && "Inner loop expected.");
9812
9813 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9814 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9815
9816 // If an override option has been passed in for interleaved accesses, use it.
9817 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9818 UseInterleaved = EnableInterleavedMemAccesses;
9819
9820 // Analyze interleaved memory accesses.
9821 if (UseInterleaved)
9822 IAI.analyzeInterleavedAccesses(useMaskedInterleavedAccesses(*TTI));
9823
9824 // Check the function attributes and profiles to find out if this function
9825 // should be optimized for size.
9826 ScalarEpilogueLowering SEL =
9827 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9828
9829 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9830 // count by optimizing for size, to minimize overheads.
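// E.g. with the default vectorizer-min-trip-count of 16 (see
// TinyTripCountVectorThreshold above), a loop known to run 8 iterations only
// proceeds under the size-minimizing policies chosen below.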
9831 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9832 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9833 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9834 << "This loop is worth vectorizing only if no scalar "
9835 << "iteration overheads are incurred.");
9836 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9837 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9838 else {
9839 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9840 LLVM_DEBUG(dbgs() << "\n");
9841 // Predicate tail-folded loops are efficient even when the loop
9842 // iteration count is low. However, setting the epilogue policy to
9843 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9844 // with runtime checks. It's more effective to let
9845 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9846 // for the loop.
9847 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9848 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9849 } else {
9850 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9851 "small to consider vectorizing.\n");
9852 reportVectorizationFailure(
9853 "The trip count is below the minimal threshold value.",
9854 "loop trip count is too low, avoiding vectorization",
9855 "LowTripCount", ORE, L);
9856 Hints.emitRemarkWithHints();
9857 return false;
9858 }
9859 }
9860 }
9861
9862 // Check the function attributes to see if implicit floats or vectors are
9863 // allowed.
9864 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9865 reportVectorizationFailure(
9866 "Can't vectorize when the NoImplicitFloat attribute is used",
9867 "loop not vectorized due to NoImplicitFloat attribute",
9868 "NoImplicitFloat", ORE, L);
9869 Hints.emitRemarkWithHints();
9870 return false;
9871 }
9872
9873 // Check if the target supports potentially unsafe FP vectorization.
9874 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9875 // for the target we're vectorizing for, to make sure none of the
9876 // additional fp-math flags can help.
9877 if (Hints.isPotentiallyUnsafe() &&
9878 TTI->isFPVectorizationPotentiallyUnsafe()) {
9879 reportVectorizationFailure(
9880 "Potentially unsafe FP op prevents vectorization",
9881 "loop not vectorized due to unsafe FP support.",
9882 "UnsafeFP", ORE, L);
9883 Hints.emitRemarkWithHints();
9884 return false;
9885 }
9886
9887 bool AllowOrderedReductions;
9888 // If the flag is set, use that instead and override the TTI behaviour.
9889 if (ForceOrderedReductions.getNumOccurrences() > 0)
9890 AllowOrderedReductions = ForceOrderedReductions;
9891 else
9892 AllowOrderedReductions = TTI->enableOrderedReductions();
9893 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9894 ORE->emit([&]() {
9895 auto *ExactFPMathInst = Requirements.getExactFPInst();
9896 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9897 ExactFPMathInst->getDebugLoc(),
9898 ExactFPMathInst->getParent())
9899 << "loop not vectorized: cannot prove it is safe to reorder "
9900 "floating-point operations";
9901 });
9902 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9903 "reorder floating-point operations\n");
9904 Hints.emitRemarkWithHints();
9905 return false;
9906 }
9907
9908 // Use the cost model.
9909 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9910 F, &Hints, IAI);
9911 // Use the planner for vectorization.
9912 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9913 ORE);
9914
9915 // Get user vectorization factor and interleave count.
9916 ElementCount UserVF = Hints.getWidth();
9917 unsigned UserIC = Hints.getInterleave();
9918
9919 // Plan how to best vectorize, return the best VF and its cost.
9920 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9921
9922 VectorizationFactor VF = VectorizationFactor::Disabled();
9923 unsigned IC = 1;
9924
9925 bool AddBranchWeights =
9926 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9927 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9928 F->getDataLayout(), AddBranchWeights);
9929 if (MaybeVF) {
9930 VF = *MaybeVF;
9931 // Select the interleave count.
9932 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9933
9934 unsigned SelectedIC = std::max(IC, UserIC);
9935 // Optimistically generate runtime checks if they are needed. Drop them if
9936 // they turn out to not be profitable.
9937 if (VF.Width.isVector() || SelectedIC > 1)
9938 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9939
9940 // Check if it is profitable to vectorize with runtime checks.
9941 bool ForceVectorization =
9942 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9943 if (!ForceVectorization &&
9944 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
9945 *PSE.getSE(), SEL)) {
9946 ORE->emit([&]() {
9947 return OptimizationRemarkAnalysisAliasing(
9948 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9949 L->getHeader())
9950 << "loop not vectorized: cannot prove it is safe to reorder "
9951 "memory operations";
9952 });
9953 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9954 Hints.emitRemarkWithHints();
9955 return false;
9956 }
9957 }
9958
9959 // Identify the diagnostic messages that should be produced.
9960 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9961 bool VectorizeLoop = true, InterleaveLoop = true;
9962 if (VF.Width.isScalar()) {
9963 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9964 VecDiagMsg = std::make_pair(
9965 "VectorizationNotBeneficial",
9966 "the cost-model indicates that vectorization is not beneficial");
9967 VectorizeLoop = false;
9968 }
9969
9970 if (!MaybeVF && UserIC > 1) {
9971 // Tell the user interleaving was avoided up-front, despite being explicitly
9972 // requested.
9973 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9974 "interleaving should be avoided up front\n");
9975 IntDiagMsg = std::make_pair(
9976 "InterleavingAvoided",
9977 "Ignoring UserIC, because interleaving was avoided up front");
9978 InterleaveLoop = false;
9979 } else if (IC == 1 && UserIC <= 1) {
9980 // Tell the user interleaving is not beneficial.
9981 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9982 IntDiagMsg = std::make_pair(
9983 "InterleavingNotBeneficial",
9984 "the cost-model indicates that interleaving is not beneficial");
9985 InterleaveLoop = false;
9986 if (UserIC == 1) {
9987 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9988 IntDiagMsg.second +=
9989 " and is explicitly disabled or interleave count is set to 1";
9990 }
9991 } else if (IC > 1 && UserIC == 1) {
9992 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9993 LLVM_DEBUG(
9994 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9995 IntDiagMsg = std::make_pair(
9996 "InterleavingBeneficialButDisabled",
9997 "the cost-model indicates that interleaving is beneficial "
9998 "but is explicitly disabled or interleave count is set to 1");
9999 InterleaveLoop = false;
10000 }
10001
10002 // Override IC if user provided an interleave count.
10003 IC = UserIC > 0 ? UserIC : IC;
10004
10005 // Emit diagnostic messages, if any.
10006 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10007 if (!VectorizeLoop && !InterleaveLoop) {
10008 // Do not vectorize or interleave the loop.
10009 ORE->emit([&]() {
10010 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10011 L->getStartLoc(), L->getHeader())
10012 << VecDiagMsg.second;
10013 });
10014 ORE->emit([&]() {
10015 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10016 L->getStartLoc(), L->getHeader())
10017 << IntDiagMsg.second;
10018 });
10019 return false;
10020 } else if (!VectorizeLoop && InterleaveLoop) {
10021 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10022 ORE->emit([&]() {
10023 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10024 L->getStartLoc(), L->getHeader())
10025 << VecDiagMsg.second;
10026 });
10027 } else if (VectorizeLoop && !InterleaveLoop) {
10028 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10029 << ") in " << L->getLocStr() << '\n');
10030 ORE->emit([&]() {
10031 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10032 L->getStartLoc(), L->getHeader())
10033 << IntDiagMsg.second;
10034 });
10035 } else if (VectorizeLoop && InterleaveLoop) {
10036 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10037 << ") in " << L->getLocStr() << '\n');
10038 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10039 }
10040
10041 bool DisableRuntimeUnroll = false;
10042 MDNode *OrigLoopID = L->getLoopID();
10043 {
10044 using namespace ore;
10045 if (!VectorizeLoop) {
10046 assert(IC > 1 && "interleave count should not be 1 or 0");
10047 // If we decided that it is not profitable to vectorize the loop, then
10048 // interleave it.
10049 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10050 &CM, BFI, PSI, Checks);
10051
10052 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10053 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10054
10055 ORE->emit([&]() {
10056 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10057 L->getHeader())
10058 << "interleaved loop (interleaved count: "
10059 << NV("InterleaveCount", IC) << ")";
10060 });
10061 } else {
10062 // If we decided that it is profitable to vectorize the loop, then do it.
10063
10064 // Consider vectorizing the epilogue too if it's profitable.
10065 VectorizationFactor EpilogueVF =
10066 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10067 if (EpilogueVF.Width.isVector()) {
10068
10069 // The first pass vectorizes the main loop and creates a scalar epilogue
10070 // to be vectorized by executing the plan (potentially with a different
10071 // factor) again shortly afterwards.
10072 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10073 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10074 EPI, &LVL, &CM, BFI, PSI, Checks);
10075
10076 std::unique_ptr<VPlan> BestMainPlan(
10077 LVP.getBestPlanFor(EPI.MainLoopVF).duplicate());
10078 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10079 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10080 ++LoopsVectorized;
10081
10082 // Second pass vectorizes the epilogue and adjusts the control flow
10083 // edges from the first pass.
10084 EPI.MainLoopVF = EPI.EpilogueVF;
10085 EPI.MainLoopUF = EPI.EpilogueUF;
10086 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10087 ORE, EPI, &LVL, &CM, BFI, PSI,
10088 Checks);
10089
10090 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10091 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10092 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10093 Header->setName("vec.epilog.vector.body");
10094
10095 // Re-use the trip count and steps expanded for the main loop, as
10096 // skeleton creation needs it as a value that dominates both the scalar
10097 // and vector epilogue loops
10098 // TODO: This is a workaround needed for epilogue vectorization and it
10099 // should be removed once induction resume value creation is done
10100 // directly in VPlan.
10101 EpilogILV.setTripCount(MainILV.getTripCount());
10102 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10103 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10104 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10105 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10106 ExpandR->replaceAllUsesWith(ExpandedVal);
10107 if (BestEpiPlan.getTripCount() == ExpandR)
10108 BestEpiPlan.resetTripCount(ExpandedVal);
10109 ExpandR->eraseFromParent();
10110 }
10111
10112 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10113 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10114 // before vectorizing the epilogue loop.
10115 for (VPRecipeBase &R : Header->phis()) {
10116 if (isa<VPCanonicalIVPHIRecipe>(&R))
10117 continue;
10118
10119 Value *ResumeV = nullptr;
10120 // TODO: Move setting of resume values to prepareToExecute.
10121 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10122 const RecurrenceDescriptor &RdxDesc =
10123 ReductionPhi->getRecurrenceDescriptor();
10124 RecurKind RK = RdxDesc.getRecurrenceKind();
10125 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10126 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10127 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10128 // start value; compare the final value from the main vector loop
10129 // to the start value.
10130 IRBuilder<> Builder(
10131 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10132 ResumeV = Builder.CreateICmpNE(ResumeV,
10133 RdxDesc.getRecurrenceStartValue());
10134 }
10135 } else {
10136 // Create induction resume values for both widened pointer and
10137 // integer/fp inductions and update the start value of the induction
10138 // recipes to use the resume value.
10139 PHINode *IndPhi = nullptr;
10140 const InductionDescriptor *ID;
10141 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10142 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10143 ID = &Ind->getInductionDescriptor();
10144 } else {
10145 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10146 IndPhi = WidenInd->getPHINode();
10147 ID = &WidenInd->getInductionDescriptor();
10148 }
10149
10150 ResumeV = MainILV.createInductionResumeValue(
10151 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10153 }
10154 assert(ResumeV && "Must have a resume value");
10155 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10156 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10157 }
10158
10159 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10160 DT, true, &ExpandedSCEVs);
10161 ++LoopsEpilogueVectorized;
10162
10163 if (!MainILV.areSafetyChecksAdded())
10164 DisableRuntimeUnroll = true;
10165 } else {
10166 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10167 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10168 PSI, Checks);
10169
10170 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10171 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10172 ++LoopsVectorized;
10173
10174 // Add metadata to disable runtime unrolling a scalar loop when there
10175 // are no runtime checks about strides and memory. A scalar loop that is
10176 // rarely used is not worth unrolling.
10177 if (!LB.areSafetyChecksAdded())
10178 DisableRuntimeUnroll = true;
10179 }
10180 // Report the vectorization decision.
10181 reportVectorization(ORE, L, VF, IC);
10182 }
10183
10184 if (ORE->allowExtraAnalysis(LV_NAME))
10185 checkMixedPrecision(L, ORE);
10186 }
10187
10188 std::optional<MDNode *> RemainderLoopID =
10189 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10190 LLVMLoopVectorizeFollowupEpilogue});
10191 if (RemainderLoopID) {
10192 L->setLoopID(*RemainderLoopID);
10193 } else {
10194 if (DisableRuntimeUnroll)
10195 AddRuntimeUnrollDisableMetaData(L);
10196
10197 // Mark the loop as already vectorized to avoid vectorizing again.
10198 Hints.setAlreadyVectorized();
10199 }
10200
10201 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10202 return true;
10203}
10204
10205 LoopVectorizeResult LoopVectorizePass::runImpl(
10206 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10207 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10208 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10209 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10210 SE = &SE_;
10211 LI = &LI_;
10212 TTI = &TTI_;
10213 DT = &DT_;
10214 BFI = BFI_;
10215 TLI = TLI_;
10216 AC = &AC_;
10217 LAIs = &LAIs_;
10218 DB = &DB_;
10219 ORE = &ORE_;
10220 PSI = PSI_;
10221
10222 // Don't attempt if
10223 // 1. the target claims to have no vector registers, and
10224 // 2. interleaving won't help ILP.
10225 //
10226 // The second condition is necessary because, even if the target has no
10227 // vector registers, loop vectorization may still enable scalar
10228 // interleaving.
10229 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10230 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10231 return LoopVectorizeResult(false, false);
10232
10233 bool Changed = false, CFGChanged = false;
10234
10235 // The vectorizer requires loops to be in simplified form.
10236 // Since simplification may add new inner loops, it has to run before the
10237 // legality and profitability checks. This means running the loop vectorizer
10238 // will simplify all loops, regardless of whether anything ends up being
10239 // vectorized.
10240 for (const auto &L : *LI)
10241 Changed |= CFGChanged |=
10242 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10243
10244 // Build up a worklist of inner-loops to vectorize. This is necessary as
10245 // the act of vectorizing or partially unrolling a loop creates new loops
10246 // and can invalidate iterators across the loops.
10247 SmallVector<Loop *, 8> Worklist;
10248
10249 for (Loop *L : *LI)
10250 collectSupportedLoops(*L, LI, ORE, Worklist);
10251
10252 LoopsAnalyzed += Worklist.size();
10253
10254 // Now walk the identified inner loops.
10255 while (!Worklist.empty()) {
10256 Loop *L = Worklist.pop_back_val();
10257
10258 // For the inner loops we actually process, form LCSSA to simplify the
10259 // transform.
10260 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10261
10262 Changed |= CFGChanged |= processLoop(L);
10263
10264 if (Changed) {
10265 LAIs->clear();
10266
10267#ifndef NDEBUG
10268 if (VerifySCEV)
10269 SE->verify();
10270#endif
10271 }
10272 }
10273
10274 // Process each loop nest in the function.
10275 return LoopVectorizeResult(Changed, CFGChanged);
10276}
10277
10278 PreservedAnalyses LoopVectorizePass::run(Function &F,
10279 FunctionAnalysisManager &AM) {
10280 auto &LI = AM.getResult<LoopAnalysis>(F);
10281 // There are no loops in the function. Return before computing other expensive
10282 // analyses.
10283 if (LI.empty())
10284 return PreservedAnalyses::all();
10285 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10286 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10287 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10288 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10289 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10290 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10291 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10292 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10293
10294 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10295 ProfileSummaryInfo *PSI =
10296 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10297 BlockFrequencyInfo *BFI = nullptr;
10298 if (PSI && PSI->hasProfileSummary())
10299 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10300 LoopVectorizeResult Result =
10301 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10302 if (!Result.MadeAnyChange)
10303 return PreservedAnalyses::all();
10304 PreservedAnalyses PA;
10305
10306 if (isAssignmentTrackingEnabled(*F.getParent())) {
10307 for (auto &BB : F)
10308 RemoveRedundantDbgInstrs(&BB);
10309 }
10310
10311 PA.preserve<LoopAnalysis>();
10312 PA.preserve<DominatorTreeAnalysis>();
10313 PA.preserve<ScalarEvolutionAnalysis>();
10314
10315 if (Result.MadeCFGChange) {
10316 // Making CFG changes likely means a loop got vectorized. Indicate that
10317 // extra simplification passes should be run.
10318 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10319 // be run if runtime checks have been added.
10322 } else {
10323 PA.preserveSet<CFGAnalyses>();
10324 }
10325 return PA;
10326}
10327
10328 void LoopVectorizePass::printPipeline(
10329 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10330 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10331 OS, MapClassName2PassName);
10332
10333 OS << '<';
10334 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10335 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10336 OS << '>';
10337}
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform CSE of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
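As a rough, hedged sketch of how such a reciprocal is typically consumed (the names below are hypothetical, and a 50% block-execution heuristic is assumed rather than taken from the pass):
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Hypothetical stand-in for the helper: assume a predicated block executes on
// roughly half of the iterations, so the reciprocal probability is 2.
static unsigned getReciprocalPredBlockProbSketch() { return 2; }

// Scale the cost of a predicated block by its assumed execution probability,
// i.e. divide the block cost by the reciprocal.
static InstructionCost scalePredicatedBlockCost(InstructionCost BlockCost) {
  return BlockCost / getReciprocalPredBlockProbSketch();
}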
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
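A short sketch (hypothetical helper, not the pass's own code) of how a cl::boolOrDefault override such as this one is usually consumed: unset defers to the cost model, true/false force the decision.
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;

static bool shouldUseSafeDivisor(cl::boolOrDefault Forced,
                                 bool CostModelPrefersSafeDivisor) {
  switch (Forced) {
  case cl::BOU_UNSET:
    return CostModelPrefersSafeDivisor; // no override: follow the cost model
  case cl::BOU_TRUE:
    return true;                        // forced on
  case cl::BOU_FALSE:
    return false;                       // forced off
  }
  llvm_unreachable("covered switch");
}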
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
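For example (a standalone snippet, not taken from this file), the two APInt helpers above behave as follows:
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

int main() {
  APInt AllOnes = APInt::getAllOnes(8);   // 8-bit value with every bit set
  assert(AllOnes.getZExtValue() == 0xFF);
  assert(AllOnes.getSExtValue() == -1);   // sign-extended: all-ones is -1
  return 0;
}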
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:459
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:372
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:487
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
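An illustrative helper (assumption: not part of LoopVectorize) using the BasicBlock queries listed above, the kind of precondition checked when splicing blocks during skeleton construction:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if BB has exactly one predecessor, no PHI nodes, and a terminator.
static bool isSimpleJoinBlock(const BasicBlock &BB) {
  return BB.getSinglePredecessor() && BB.phis().empty() &&
         BB.getTerminator() != nullptr;
}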
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:788
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
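A self-contained example (illustrative only) of the DenseMap operations documented above: lookup, contains, insert, and at.
#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

int main() {
  DenseMap<int, int> Widths;
  Widths.insert({1, 8});
  assert(Widths.contains(1) && Widths.count(1) == 1);
  assert(Widths.lookup(2) == 0);  // missing key: default-constructed value
  assert(Widths.at(1) == 8);      // at() on a missing key would abort
  assert(!Widths.empty());
  return 0;
}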
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
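A minimal sketch (hypothetical helper) of the dominator-tree update pattern described above, mirroring how a freshly created block is registered:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include <cassert>
using namespace llvm;

// Register NewBB (e.g. a new preheader) with IDom as its immediate dominator.
static void registerNewBlock(DominatorTree &DT, BasicBlock *NewBB,
                             BasicBlock *IDom) {
  DT.addNewBlock(NewBB, IDom);
  assert(DT.verify() && "dominator tree is out of date");
}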
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:323
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:314
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
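A quick standalone illustration (not from the pass) of the ElementCount factory functions and predicates listed above:
#include "llvm/Support/TypeSize.h"
#include <cassert>
using namespace llvm;

int main() {
  ElementCount Fixed4 = ElementCount::getFixed(4);     // <4 x Ty>
  ElementCount Scal4  = ElementCount::getScalable(4);  // <vscale x 4 x Ty>
  ElementCount One    = ElementCount::getFixed(1);
  assert(Fixed4.isVector() && Scal4.isVector());
  assert(One.isScalar() && !One.isVector());
  assert(ElementCount::get(4, /*Scalable=*/true) == Scal4);
  return 0;
}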
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:716
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:690
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Definition: Instructions.h:937
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:92
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:509
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1805
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of V (a splat).
Definition: IRBuilder.cpp:1192
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2514
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:464
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1090
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:173
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:524
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:309
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1150
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2243
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1864
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1719
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:484
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2203
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2395
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2239
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:143
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1342
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1325
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:469
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1664
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1824
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2349
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:514
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1402
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1359
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
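A compact sketch (hypothetical function, simplified relative to the widening code paths) exercising several of the IRBuilder helpers listed above: CreateAdd, CreateICmpEQ, and CreateSelect.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emit "next = iv + step; sel = (next == limit) ? a : b" at the end of BB and
// return the select.
static Value *emitSelectOnLimit(BasicBlock *BB, Value *IV, Value *Step,
                                Value *Limit, Value *A, Value *B) {
  IRBuilder<> Builder(BB); // instructions are appended to the end of BB
  Value *Next = Builder.CreateAdd(IV, Step, "iv.next");
  Value *AtLimit = Builder.CreateICmpEQ(Next, Limit, "at.limit");
  return Builder.CreateSelect(AtLimit, A, B, "sel");
}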
A struct for saving information about induction variables.
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
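A hedged sketch (illustrative helper, not the pass's own logic) of dispatching on InductionDescriptor::getKind(), the enum documented above:
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void describeInduction(const InductionDescriptor &ID) {
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction:
    errs() << "integer induction, start " << *ID.getStartValue() << "\n";
    break;
  case InductionDescriptor::IK_PtrInduction:
    errs() << "pointer induction\n";
    break;
  case InductionDescriptor::IK_FpInduction:
    errs() << "floating-point induction\n";
    break;
  case InductionDescriptor::IK_NoInduction:
    errs() << "not an induction\n";
    break;
  }
}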
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created for it.
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitability analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void fixFixedOrderRecurrence(VPLiveOut *LO, VPTransformState &State)
Create the phi node for the resume value of first order recurrences in the scalar preheader and updat...
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:476
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:473
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:453
uint32_t getFactor() const
Definition: VectorUtils.h:469
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:523
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:530
bool isReverse() const
Definition: VectorUtils.h:468
InstTy * getInsertPos() const
Definition: VectorUtils.h:539
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
Align getAlign() const
Definition: VectorUtils.h:470
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:595
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:640
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:651
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:632
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:615
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:645
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
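An illustrative walk over an interleave group using the accessors above (getFactor, getMember, getIndex); the helper name is hypothetical.
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void dumpGroup(const InterleaveGroup<Instruction> &Group) {
  errs() << "factor " << Group.getFactor()
         << (Group.isReverse() ? " (reverse)\n" : "\n");
  for (uint32_t I = 0, F = Group.getFactor(); I < F; ++I) {
    // A null member marks a gap in the group.
    if (Instruction *Member = Group.getMember(I))
      errs() << "  member " << Group.getIndex(Member) << ": " << *Member
             << "\n";
  }
}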
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:173
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
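A minimal sketch (assumed helper) combining the loop-shape queries above, the same kind of checks a vectorizer needs before building a skeleton around a loop:
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// Innermost loop with a preheader, a single latch, a single exiting block,
// and a unique exit block.
static bool hasVectorizableShape(const Loop &L) {
  return L.isInnermost() && L.getLoopPreheader() && L.getLoopLatch() &&
         L.getExitingBlock() && L.getUniqueExitBlock();
}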
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for both cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
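A hedged sketch of building loop metadata with the MDNode/MDString APIs above, in the spirit of AddRuntimeUnrollDisableMetaData (simplified; the real helper also preserves the existing operands of the llvm.loop node).
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Build the node !{!"llvm.loop.unroll.runtime.disable"}.
static MDNode *makeUnrollRuntimeDisableNode(LLVMContext &Ctx) {
  Metadata *Ops[] = {MDString::get(Ctx, "llvm.loop.unroll.runtime.disable")};
  return MDNode::get(Ctx, Ops);
}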
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:688
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
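A small illustration (hypothetical helper) of PHINode::Create and addIncoming, the pattern used when building resume values in the scalar preheader:
#include "llvm/ADT/Twine.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *createTwoWayPhi(BasicBlock *BB, Value *FromA, BasicBlock *A,
                                Value *FromB, BasicBlock *B,
                                const Twine &Name) {
  // PHIs must appear at the top of the block, so insert at BB->begin().
  PHINode *Phi = PHINode::Create(FromA->getType(), /*NumReservedValues=*/2,
                                 Name, BB->begin());
  Phi->addIncoming(FromA, A);
  Phi->addIncoming(FromB, B);
  return Phi;
}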
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
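A minimal sketch (not from this file) showing how the ScalarEvolution entry points listed above fit together; the helper name and the trip-count threshold are hypothetical.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

using namespace llvm;

// Illustrative check: the step value must be invariant in L, and either the
// loop has a small known trip count or the step is the constant 1.
static bool hasSmallInvariantStep(ScalarEvolution &SE, Loop *L, Value *StepV) {
  const SCEV *Step = SE.getSCEV(StepV);              // SCEV for the IR value
  if (!SE.isLoopInvariant(Step, L))                  // varies inside L
    return false;
  if (unsigned TC = SE.getSmallConstantTripCount(L)) // 0 when unknown
    return TC < 16;                                  // hypothetical threshold
  return Step->isOne();                              // constant step of one
}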
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
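A minimal sketch of the SetVector operations listed above: insertion deduplicates while preserving order, which keeps worklist traversal deterministic. The example is illustrative and not taken from this file.

#include "llvm/ADT/SetVector.h"

using namespace llvm;

void worklistDemo() {
  SmallSetVector<int, 8> Worklist;       // no heap allocation below 8 elements
  Worklist.insert(3);
  Worklist.insert(5);
  Worklist.insert(3);                    // duplicate: insert() returns false
  while (!Worklist.empty()) {
    int Item = Worklist.pop_back_val();  // LIFO processing order
    (void)Item;
  }
}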
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:696
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:479
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
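A minimal sketch of the SmallPtrSet and SmallVector operations listed above, combined into the usual visited-set-plus-worklist idiom; the function is illustrative only.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static void visitOnce(ArrayRef<Instruction *> Seeds) {
  SmallPtrSet<Instruction *, 8> Visited;
  SmallVector<Instruction *, 16> Worklist(Seeds.begin(), Seeds.end());
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    if (!Visited.insert(I).second)   // insert() reports whether I was new
      continue;                      // already processed
    // ... process I and push its operands/users onto Worklist ...
  }
}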
An instruction for storing to memory.
Definition: Instructions.h:289
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
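A minimal sketch (not the cost model itself) of the TargetTransformInfo cost queries listed above: combining an arithmetic cost with either a masked or a plain store cost, depending on legality. The helper name and the way the costs are summed are illustrative assumptions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static InstructionCost addAndStoreCost(const TargetTransformInfo &TTI,
                                       Type *VecTy, Align Alignment,
                                       unsigned AddrSpace, bool NeedsMask) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;    // reciprocal throughput
  InstructionCost Cost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  if (NeedsMask && TTI.isLegalMaskedStore(VecTy, Alignment))
    Cost += TTI.getMaskedMemoryOpCost(Instruction::Store, VecTy, Alignment,
                                      AddrSpace, CostKind);
  else
    Cost += TTI.getMemoryOpCost(Instruction::Store, VecTy, Alignment,
                                AddrSpace, CostKind);
  return Cost;
}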
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
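A minimal sketch of the Type queries listed above: choosing an integer type with the same bit width as a value's scalar element type. The helper is illustrative; it returns nullptr when the width is unknown.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static Type *getEquivalentIntTy(Type *Ty) {
  Type *ScalarTy = Ty->getScalarType();          // element type for vectors
  if (ScalarTy->isIntegerTy())
    return ScalarTy;                             // already an integer type
  unsigned Bits = ScalarTy->getScalarSizeInBits();
  if (Bits == 0)                                 // e.g. pointer types
    return nullptr;
  return Type::getIntNTy(ScalarTy->getContext(), Bits);
}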
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2844
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2916
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:476
iterator end()
Definition: VPlan.h:2878
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2876
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:211
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2907
bool empty() const
Definition: VPlan.h:2887
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1967
VPRegionBlock * getParent()
Definition: VPlan.h:497
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:176
void setName(const Twine &newName)
Definition: VPlan.h:490
VPlan * getPlan()
Definition: VPlan.cpp:149
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:154
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:532
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3444
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
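A minimal sketch of the VPBuilder calls listed above, mirroring the IRBuilder-style pattern inside VPlan. These are internal LLVM APIs; the helper, its parameters, and the value names are illustrative only.

// Assumes the internal VPlan.h header from this directory is available.
static VPValue *emitGuardedMask(VPBuilder &Builder, VPBasicBlock *VPBB,
                                VPValue *IV, VPValue *BTC,
                                VPValue *BlockMask) {
  Builder.setInsertPoint(VPBB);                 // append new recipes to VPBB
  VPValue *Cmp =
      Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC, {}, "active.lane");
  return Builder.createLogicalAnd(BlockMask, Cmp, {}, "guarded.mask");
}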
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2583
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:418
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:396
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:408
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2782
VPValue * getStartValue() const
Definition: VPlan.h:2781
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1653
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1697
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1686
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1180
unsigned getOpcode() const
Definition: VPlan.h:1287
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2024
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2065
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2071
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2078
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2098
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:184
static VPLane getFirstLane()
Definition: VPlan.h:168
A value that is used outside the VPlan.
Definition: VPlan.h:686
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:726
VPBasicBlock * getParent()
Definition: VPlan.h:751
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:817
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1100
A recipe for handling reduction phis.
Definition: VPlan.h:1908
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1962
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1954
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2113
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3019
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3090
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2161
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:843
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:906
This class can be used to assign names to VPValues.
Definition: VPlanValue.h:449
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:203
operand_range operands()
Definition: VPlanValue.h:273
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:258
unsigned getNumOperands() const
Definition: VPlanValue.h:252
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:253
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:247
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1320
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1288
user_iterator user_begin()
Definition: VPlanValue.h:129
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:173
user_iterator user_end()
Definition: VPlanValue.h:131
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:168
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1292
user_range users()
Definition: VPlanValue.h:133
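A minimal sketch of the VPValue/VPUser def-use API listed above: redirecting all uses of one VPValue to another, except the uses made by a given user. The helper is illustrative and these are VPlan-internal interfaces.

static void redirectUses(VPValue *OldV, VPValue *NewV, VPUser *Skip) {
  // Redirect every use of OldV to NewV, except uses owned by Skip.
  OldV->replaceUsesWithIf(NewV, [Skip](VPUser &U, unsigned /*OpIdx*/) {
    return &U != Skip;
  });
}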
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1597
A recipe for widening Call instructions.
Definition: VPlan.h:1468
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2708
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1379
A recipe for handling GEP instructions.
Definition: VPlan.h:1555
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1710
A common base class for widening memory operations.
Definition: VPlan.h:2318
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2326
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2365
Instruction & Ingredient
Definition: VPlan.h:2320
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2379
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2372
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2369
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1836
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1875
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1872
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a widened (vector) copy of its ingredient.
Definition: VPlan.h:1347
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3120
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:821
VPBasicBlock * getEntry()
Definition: VPlan.h:3215
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3240
void setName(const Twine &newName)
Definition: VPlan.h:3277
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3243
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3219
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3233
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3334
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1049
VPBasicBlock * getPreheader()
Definition: VPlan.h:3353
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3315
bool hasVF(ElementCount VF)
Definition: VPlan.h:3253
bool hasUF(unsigned UF) const
Definition: VPlan.h:3266
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3226
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3281
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1046
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:876
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3323
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3339
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3343
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE, BasicBlock *PH)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header PH...
Definition: VPlan.cpp:806
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1092
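A minimal sketch of the VPlan accessors listed above: the vector loop is modeled as a region, and its entry VPBasicBlock is where the header phi recipes (including the canonical IV) live. The helper name is illustrative.

static const VPBasicBlock *getVectorLoopHeader(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  return LoopRegion->getEntryBasicBlock();   // header block of the region
}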
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
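A minimal sketch of the Value API listed above: carrying a name over to a replacement value and rewriting all uses to point at it. The helper is illustrative; replaceAllUsesWith requires matching types.

#include "llvm/IR/Value.h"

using namespace llvm;

static void renameAndForward(Value *Old, Value *New) {
  if (Old == New || Old->getType() != New->getType())
    return;                          // RAUW needs identical types
  New->setName(Old->getName());      // keep the readable name
  Old->replaceAllUsesWith(New);      // every use now points at New
}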
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:77
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:73
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:229
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:215
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:255
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:222
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:236
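A minimal sketch of the FixedOrScalableQuantity helpers listed above, using ElementCount: scaling a VF by a coefficient and comparing two candidates without assuming they are fixed-width. The function names are illustrative.

#include "llvm/Support/TypeSize.h"

using namespace llvm;

static ElementCount halveVF(ElementCount VF) {
  // E.g. <vscale x 8> becomes <vscale x 4>; fixed 8 becomes fixed 4.
  return VF.divideCoefficientBy(2);
}

static bool notWiderThan(ElementCount A, ElementCount B) {
  // Only meaningful when both are fixed or both are scalable.
  return ElementCount::isKnownLE(A, B);
}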
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
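A minimal sketch of the PatternMatch helpers listed above: recognizing a single-use multiply of two (possibly zero/sign-extended) operands, a shape related to the extended multiply-accumulate reduction costs queried elsewhere in this file. The helper name is illustrative.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool isSingleUseExtMul(Value *V) {
  Value *A, *B;
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}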
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1477
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3668
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1809
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
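A minimal sketch of the range helper listed above: all_of applied to an instruction's operands, a common idiom in this pass when checking uniformity or invariance. The helper is illustrative.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static bool allOperandsInvariant(const Instruction &I, const Loop &L) {
  return all_of(I.operands(),
                [&L](const Value *Op) { return L.isLoopInvariant(Op); });
}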
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7095
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:54
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:135
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Definition: SmallVector.h:1312
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:135
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2367
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
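A minimal sketch of the shuffle-mask helpers listed above, with the masks they produce for small factors written out in comments; the wrapper function is illustrative only.

#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

void maskExamples() {
  // {0, 2, 4, 6}: every second element starting at 0 (stride-2, lane 0).
  SmallVector<int, 16> Stride = createStrideMask(/*Start=*/0, /*Stride=*/2,
                                                 /*VF=*/4);
  // {0, 0, 1, 1, 2, 2}: each of 3 elements replicated twice.
  SmallVector<int, 16> Repl = createReplicatedMask(/*ReplicationFactor=*/2,
                                                   /*VF=*/3);
  // {0, 4, 1, 5, 2, 6, 3, 7}: interleave two 4-element vectors.
  SmallVector<int, 16> Ileave = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  (void)Stride; (void)Repl; (void)Ileave;
}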
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1616
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1868
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:86
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:88
ElementCount End
Definition: VPlan.h:93
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1881
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:226
bool isFirstIteration() const
Definition: VPlan.h:238
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:372
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:380
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:376
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:354
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:243
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:253
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:417
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:420
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:367
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:413
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:359
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:399
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:295
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:255
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:397
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:403
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:400
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:249
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:378
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2445
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2394
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1521
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2521
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2524
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2468
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2485
static bool tryAddExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.