1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
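// As an editor's illustration (not part of the upstream comment), a scalar
// loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is, for a vectorization factor of 4, rewritten so that each iteration
// processes a[i..i+3], b[i..i+3] and c[i..i+3] with vector instructions and
// the induction variable advances by 4; the remaining n % 4 iterations run
// in a scalar remainder loop.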
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
81#include "llvm/Analysis/CFG.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
127#include "llvm/IR/Verifier.h"
128#include "llvm/Support/Casting.h"
131#include "llvm/Support/Debug.h"
144#include <algorithm>
145#include <cassert>
146#include <cmath>
147#include <cstdint>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <map>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
170 "llvm.loop.vectorize.followup_vectorized";
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
190 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
203 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks"));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and lists the available options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
212 enum Option {
216 };
217} // namespace PreferPredicateTy
218
220 "prefer-predicate-over-epilogue",
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
238 "force-tail-folding-style", cl::desc("Force the tail folding style"),
239 cl::init(TailFoldingStyle::None),
241 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 TailFoldingStyle::Data, "data",
244 "Create lane mask for data only, using active.lane.mask intrinsic"),
245 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
246 "data-without-lane-mask",
247 "Create lane mask with compare/stepvector"),
248 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
249 "Create lane mask using active.lane.mask intrinsic, and use "
250 "it for both data and control flow"),
252 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check")));
255
257 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
258 cl::desc("Maximize bandwidth when selecting vectorization factor which "
259 "will be determined by the smallest type in loop."));
260
262 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
263 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
264
265/// An interleave-group may need masking if it resides in a block that needs
266/// predication, or in order to mask away gaps.
268 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
269 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
270
272 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
273 cl::desc("A flag that overrides the target's number of scalar registers."));
274
276 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's number of vector registers."));
278
280 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
281 cl::desc("A flag that overrides the target's max interleave factor for "
282 "scalar loops."));
283
285 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
286 cl::desc("A flag that overrides the target's max interleave factor for "
287 "vectorized loops."));
288
290 "force-target-instruction-cost", cl::init(0), cl::Hidden,
291 cl::desc("A flag that overrides the target's expected cost for "
292 "an instruction to a single constant value. Mostly "
293 "useful for getting consistent testing."));
294
296 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
297 cl::desc(
298 "Pretend that scalable vectors are supported, even if the target does "
299 "not support them. This flag should only be used for testing."));
300
302 "small-loop-cost", cl::init(20), cl::Hidden,
303 cl::desc(
304 "The cost of a loop that is considered 'small' by the interleaver."));
305
307 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
308 cl::desc("Enable the use of the block frequency analysis to access PGO "
309 "heuristics minimizing code growth in cold regions and being more "
310 "aggressive in hot regions."));
311
312// Runtime interleave loops for load/store throughput.
314 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
315 cl::desc(
316 "Enable runtime interleaving until load/store ports are saturated"));
317
318/// The number of stores in a loop that are allowed to need predication.
320 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
321 cl::desc("Max number of stores to be predicated behind an if."));
322
324 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
325 cl::desc("Count the induction variable only once when interleaving"));
326
328 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
329 cl::desc("Enable if predication of stores during vectorization."));
330
332 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
333 cl::desc("The maximum interleave count to use when interleaving a scalar "
334 "reduction in a nested loop."));
335
336static cl::opt<bool>
337 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
339 cl::desc("Prefer in-loop vector reductions, "
340 "overriding the targets preference."));
341
343 "force-ordered-reductions", cl::init(false), cl::Hidden,
344 cl::desc("Enable the vectorisation of loops with in-order (strict) "
345 "FP reductions"));
346
348 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
349 cl::desc(
350 "Prefer predicating a reduction operation over an after loop select."));
351
352namespace llvm {
354 "enable-vplan-native-path", cl::Hidden,
355 cl::desc("Enable VPlan-native vectorization path with "
356 "support for outer loop vectorization."));
357}
358
359// This flag enables the stress testing of the VPlan H-CFG construction in the
360// VPlan-native vectorization path. It must be used in conjunction with
361// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
362// verification of the H-CFGs built.
364 "vplan-build-stress-test", cl::init(false), cl::Hidden,
365 cl::desc(
366 "Build VPlan for every supported loop nest in the function and bail "
367 "out right after the build (stress test the VPlan H-CFG construction "
368 "in the VPlan-native vectorization path)."));
369
371 "interleave-loops", cl::init(true), cl::Hidden,
372 cl::desc("Enable loop interleaving in Loop vectorization passes"));
374 "vectorize-loops", cl::init(true), cl::Hidden,
375 cl::desc("Run the Loop vectorization passes"));
376
378 "vplan-print-in-dot-format", cl::Hidden,
379 cl::desc("Use dot format instead of plain text when dumping VPlans"));
380
382 "force-widen-divrem-via-safe-divisor", cl::Hidden,
383 cl::desc(
384 "Override cost based safe divisor widening for div/rem instructions"));
385
387 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
389 cl::desc("Try wider VFs if they enable the use of vector variants"));
390
391// Likelihood of bypassing the vectorized loop because assumptions about SCEV
392// variables not overflowing do not hold. See `emitSCEVChecks`.
393static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
394// Likelihood of bypassing the vectorized loop because pointers overlap. See
395// `emitMemRuntimeChecks`.
396static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
397// Likelihood of bypassing the vectorized loop because there are zero trips left
398// after prolog. See `emitIterationCountCheck`.
399static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
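// Editorial note: with branch_weights semantics, each {1, 127} pair models
// the corresponding bypass as being taken roughly once in every 128
// executions, i.e. the vector loop is expected to be reached almost always.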
400
401/// A helper function that returns true if the given type is irregular. The
402/// type is irregular if its allocated size doesn't equal the store size of an
403/// element of the corresponding vector type.
404static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
405 // Determine if an array of N elements of type Ty is "bitcast compatible"
406 // with a <N x Ty> vector.
407 // This is only true if there is no padding between the array elements.
408 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
409}
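// A minimal sketch (editor's addition; Ctx and DL stand for an assumed
// LLVMContext and the module's DataLayout):
//   hasIrregularType(Type::getInt32Ty(Ctx), DL); // false: alloc size and
//                                                // type size are both 32.
//   hasIrregularType(Type::getInt1Ty(Ctx), DL);  // true: i1 is allocated
//                                                // as 8 bits but is 1 bit.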
410
411/// A helper function that returns the reciprocal of the block probability of
412/// predicated blocks. If we return X, we are assuming the predicated block
413/// will execute once for every X iterations of the loop header.
414///
415/// TODO: We should use actual block probability here, if available. Currently,
416/// we always assume predicated blocks have a 50% chance of executing.
417static unsigned getReciprocalPredBlockProb() { return 2; }
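// For instance (illustrative arithmetic only), a predicated block whose
// instructions cost 8 is counted as roughly 8 / getReciprocalPredBlockProb()
// == 4 when the cost model sums up the loop body.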
418
419/// Returns "best known" trip count for the specified loop \p L as defined by
420/// the following procedure:
421/// 1) Returns exact trip count if it is known.
422/// 2) Returns expected trip count according to profile data if any.
423/// 3) Returns upper bound estimate if it is known.
424/// 4) Returns std::nullopt if all of the above failed.
425static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
426 Loop *L) {
427 // Check if exact trip count is known.
428 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
429 return ExpectedTC;
430
431 // Check if there is an expected trip count available from profile data.
433 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
434 return *EstimatedTC;
435
436 // Check if upper bound estimate is known.
437 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
438 return ExpectedTC;
439
440 return std::nullopt;
441}
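// Usage sketch (editor's illustration; 'SomeThreshold' is hypothetical and
// not a flag defined in this file):
//   if (std::optional<unsigned> BestKnownTC = getSmallBestKnownTC(SE, L))
//     if (*BestKnownTC < SomeThreshold)
//       ; // treat L as a small-trip-count loop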
442
443/// Return a vector containing interleaved elements from multiple
444/// smaller input vectors.
446 const Twine &Name) {
447 unsigned Factor = Vals.size();
448 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
449
450 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
451#ifndef NDEBUG
452 for (Value *Val : Vals)
453 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
454#endif
455
456 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
457 // must use intrinsics to interleave.
458 if (VecTy->isScalableTy()) {
459 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
460 return Builder.CreateIntrinsic(
461 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
462 /*FMFSource=*/nullptr, Name);
463 }
464
465 // Fixed length. Start by concatenating all vectors into a wide vector.
466 Value *WideVec = concatenateVectors(Builder, Vals);
467
468 // Interleave the elements into the wide vector.
469 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
470 return Builder.CreateShuffleVector(
471 WideVec, createInterleaveMask(NumElts, Factor), Name);
472}
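// Example (editorial note): in the fixed-width case, interleaving the inputs
// <a0, a1> and <b0, b1> (Factor == 2) yields <a0, b0, a1, b1>; for scalable
// vectors the same result comes from llvm.experimental.vector.interleave2
// instead of a shufflevector.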
473
474namespace {
475// Forward declare GeneratedRTChecks.
476class GeneratedRTChecks;
477
478using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
479} // namespace
480
481namespace llvm {
482
484
485/// InnerLoopVectorizer vectorizes loops which contain only one basic
486/// block to a specified vectorization factor (VF).
487/// This class performs the widening of scalars into vectors, or multiple
488/// scalars. This class also implements the following features:
489/// * It inserts an epilogue loop for handling loops that don't have iteration
490/// counts that are known to be a multiple of the vectorization factor.
491/// * It handles the code generation for reduction variables.
492/// * Scalarization (implementation using scalars) of un-vectorizable
493/// instructions.
494/// InnerLoopVectorizer does not perform any vectorization-legality
495/// checks, and relies on the caller to check for the different legality
496/// aspects. The InnerLoopVectorizer relies on the
497/// LoopVectorizationLegality class to provide information about the induction
498/// and reduction variables that were found for a given vectorization factor.
500public:
503 const TargetLibraryInfo *TLI,
507 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
509 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
510 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
511 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
512 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
514 // Query this against the original loop and save it here because the profile
515 // of the original loop header may change as the transformation happens.
518
520 this->MinProfitableTripCount = VecWidth;
521 else
522 this->MinProfitableTripCount = MinProfitableTripCount;
523 }
524
525 virtual ~InnerLoopVectorizer() = default;
526
527 /// Create a new empty loop that will contain vectorized instructions later
528 /// on, while the old loop will be used as the scalar remainder. Control flow
529 /// is generated around the vectorized (and scalar epilogue) loops consisting
530 /// of various checks and bypasses. Return the pre-header block of the new
531 /// loop and the start value for the canonical induction, if it is != 0. The
532 /// latter is the case when vectorizing the epilogue loop. In the case of
533/// epilogue vectorization, this function is overridden to handle the more
534 /// complex control flow around the loops. \p ExpandedSCEVs is used to
535 /// look up SCEV expansions for expressions needed during skeleton creation.
536 virtual std::pair<BasicBlock *, Value *>
537 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
538
539 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
540 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
541
542 // Return true if any runtime check is added.
544
545 /// A type for vectorized values in the new loop. Each value from the
546 /// original loop, when vectorized, is represented by UF vector values in the
547 /// new unrolled loop, where UF is the unroll factor.
549
550 /// A helper function to scalarize a single Instruction in the innermost loop.
551 /// Generates a sequence of scalar instances for each lane between \p MinLane
552 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
553 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
554 /// Instr's operands.
555 void scalarizeInstruction(const Instruction *Instr,
556 VPReplicateRecipe *RepRecipe,
557 const VPIteration &Instance,
558 VPTransformState &State);
559
560 /// Try to vectorize interleaved access group \p Group with the base address
561 /// given in \p Addr, optionally masking the vector operations if \p
562 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
563 /// values in the vectorized loop.
565 ArrayRef<VPValue *> VPDefs,
567 ArrayRef<VPValue *> StoredValues,
568 VPValue *BlockInMask, bool NeedsMaskForGaps);
569
570 /// Fix the non-induction PHIs in \p Plan.
571 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
572
573 /// Returns true if the reordering of FP operations is not allowed, but we are
574 /// able to vectorize with strict in-order reductions for the given RdxDesc.
575 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
576
577 /// Create a new phi node for the induction variable \p OrigPhi to resume
578 /// iteration count in the scalar epilogue, from where the vectorized loop
579 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
580 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
581 /// and the resume values can come from an additional bypass block, the \p
582 /// AdditionalBypass pair provides information about the bypass block and the
583 /// end value on the edge from bypass to this loop.
585 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
586 ArrayRef<BasicBlock *> BypassBlocks,
587 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
588
589 /// Returns the original loop trip count.
590 Value *getTripCount() const { return TripCount; }
591
592 /// Used to set the trip count after ILV's construction and after the
593 /// preheader block has been executed. Note that this always holds the trip
594 /// count of the original loop for both main loop and epilogue vectorization.
595 void setTripCount(Value *TC) { TripCount = TC; }
596
597protected:
599
600 /// A small list of PHINodes.
602
603 /// A type for scalarized values in the new loop. Each value from the
604 /// original loop, when scalarized, is represented by UF x VF scalar values
605 /// in the new unrolled loop, where UF is the unroll factor and VF is the
606 /// vectorization factor.
608
609 /// Set up the values of the IVs correctly when exiting the vector loop.
610 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
611 Value *VectorTripCount, Value *EndValue,
612 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
613 VPlan &Plan, VPTransformState &State);
614
615 /// Create the exit value of first order recurrences in the middle block and
616 /// update their users.
618 VPTransformState &State);
619
620 /// Create code for the loop exit value of the reduction.
622
623 /// Iteratively sink the scalarized operands of a predicated instruction into
624 /// the block that was created for it.
625 void sinkScalarOperands(Instruction *PredInst);
626
627 /// Returns (and creates if needed) the trip count of the widened loop.
629
630 /// Returns a bitcasted value to the requested vector type.
631 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
633 const DataLayout &DL);
634
635 /// Emit a bypass check to see if the vector trip count is zero, including if
636 /// it overflows.
638
639 /// Emit a bypass check to see if all of the SCEV assumptions we've
640 /// had to make are correct. Returns the block containing the checks or
641 /// nullptr if no checks have been added.
643
644 /// Emit bypass checks to check any memory assumptions we may have made.
645 /// Returns the block containing the checks or nullptr if no checks have been
646 /// added.
648
649 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
650 /// vector loop preheader, middle block and scalar preheader.
652
653 /// Create new phi nodes for the induction variables to resume iteration count
654 /// in the scalar epilogue, from where the vectorized loop left off.
655 /// In cases where the loop skeleton is more complicated (e.g., epilogue
656 /// vectorization) and the resume values can come from an additional bypass
657 /// block, the \p AdditionalBypass pair provides information about the bypass
658 /// block and the end value on the edge from bypass to this loop.
660 const SCEV2ValueTy &ExpandedSCEVs,
661 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
662
663 /// Complete the loop skeleton by adding debug MDs, creating appropriate
664 /// conditional branches in the middle block, preparing the builder and
665 /// running the verifier. Return the preheader of the completed vector loop.
667
668 /// Allow subclasses to override and print debug traces before/after vplan
669 /// execution, when trace information is requested.
670 virtual void printDebugTracesAtStart(){};
671 virtual void printDebugTracesAtEnd(){};
672
673 /// The original loop.
675
676 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
677 /// dynamic knowledge to simplify SCEV expressions and converts them to a
678 /// more usable form.
680
681 /// Loop Info.
683
684 /// Dominator Tree.
686
687 /// Target Library Info.
689
690 /// Target Transform Info.
692
693 /// Assumption Cache.
695
696 /// Interface to emit optimization remarks.
698
699 /// The vectorization SIMD factor to use. Each vector will have this many
700 /// vector elements.
702
704
705 /// The vectorization unroll factor to use. Each scalar is vectorized to this
706 /// many different vector instructions.
707 unsigned UF;
708
709 /// The builder that we use
711
712 // --- Vectorization state ---
713
714 /// The vector-loop preheader.
716
717 /// The scalar-loop preheader.
719
720 /// Middle Block between the vector and the scalar.
722
723 /// The unique ExitBlock of the scalar loop if one exists. Note that
724 /// there can be multiple exiting edges reaching this block.
726
727 /// The scalar loop body.
729
730 /// A list of all bypass blocks. The first block is the entry of the loop.
732
733 /// Store instructions that were predicated.
735
736 /// Trip count of the original loop.
737 Value *TripCount = nullptr;
738
739 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
741
742 /// The legality analysis.
744
745 /// The profitability analysis.
747
748 // Record whether runtime checks are added.
749 bool AddedSafetyChecks = false;
750
751 // Holds the end values for each induction variable. We save the end values
752 // so we can later fix-up the external users of the induction variables.
754
755 /// BFI and PSI are used to check for profile guided size optimizations.
758
759 // Whether this loop should be optimized for size based on profile guided size
760 // optimizations.
762
763 /// Structure to hold information about generated runtime checks, responsible
764 /// for cleaning the checks, if vectorization turns out unprofitable.
765 GeneratedRTChecks &RTChecks;
766
767 // Holds the resume values for reductions in the loops, used to set the
768 // correct start value of reduction PHIs when vectorizing the epilogue.
771};
772
774public:
777 const TargetLibraryInfo *TLI,
779 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
782 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
784 ElementCount::getFixed(1),
785 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
786 BFI, PSI, Check) {}
787};
788
789/// Encapsulate information regarding vectorization of a loop and its epilogue.
790/// This information is meant to be updated and used across two stages of
791/// epilogue vectorization.
794 unsigned MainLoopUF = 0;
796 unsigned EpilogueUF = 0;
801 Value *TripCount = nullptr;
803
805 ElementCount EVF, unsigned EUF)
806 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
807 assert(EUF == 1 &&
808 "A high UF for the epilogue loop is likely not beneficial.");
809 }
810};
811
812/// An extension of the inner loop vectorizer that creates a skeleton for a
813/// vectorized loop that has its epilogue (residual) also vectorized.
814/// The idea is to run the vplan on a given loop twice, firstly to set up the
815/// skeleton and vectorize the main loop, and secondly to complete the skeleton
816/// from the first step and vectorize the epilogue. This is achieved by
817/// deriving two concrete strategy classes from this base class and invoking
818/// them in succession from the loop vectorizer planner.
820public:
828 GeneratedRTChecks &Checks)
830 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
831 CM, BFI, PSI, Checks),
832 EPI(EPI) {}
833
834 // Override this function to handle the more complex control flow around the
835 // three loops.
836 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
837 const SCEV2ValueTy &ExpandedSCEVs) final {
838 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
839 }
840
841 /// The interface for creating a vectorized skeleton using one of two
842 /// different strategies, each corresponding to one execution of the vplan
843 /// as described above.
844 virtual std::pair<BasicBlock *, Value *>
845 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
846
847 /// Holds and updates state information required to vectorize the main loop
848 /// and its epilogue in two separate passes. This setup helps us avoid
849 /// regenerating and recomputing runtime safety checks. It also helps us to
850 /// shorten the iteration-count-check path length for the cases where the
851 /// iteration count of the loop is so small that the main vector loop is
852 /// completely skipped.
854};
855
856/// A specialized derived class of inner loop vectorizer that performs
857/// vectorization of *main* loops in the process of vectorizing loops and their
858/// epilogues.
860public:
868 GeneratedRTChecks &Check)
870 EPI, LVL, CM, BFI, PSI, Check) {}
871 /// Implements the interface for creating a vectorized skeleton using the
872 /// *main loop* strategy (ie the first pass of vplan execution).
873 std::pair<BasicBlock *, Value *>
874 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
875
876protected:
877 /// Emits an iteration count bypass check once for the main loop (when \p
878 /// ForEpilogue is false) and once for the epilogue loop (when \p
879 /// ForEpilogue is true).
880 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
881 void printDebugTracesAtStart() override;
882 void printDebugTracesAtEnd() override;
883};
884
885// A specialized derived class of inner loop vectorizer that performs
886// vectorization of *epilogue* loops in the process of vectorizing loops and
887// their epilogues.
889public:
897 GeneratedRTChecks &Checks)
899 EPI, LVL, CM, BFI, PSI, Checks) {
901 }
902 /// Implements the interface for creating a vectorized skeleton using the
903 /// *epilogue loop* strategy (ie the second pass of vplan execution).
904 std::pair<BasicBlock *, Value *>
905 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
906
907protected:
908 /// Emits an iteration count bypass check after the main vector loop has
909 /// finished to see if there are any iterations left to execute by either
910 /// the vector epilogue or the scalar epilogue.
912 BasicBlock *Bypass,
913 BasicBlock *Insert);
914 void printDebugTracesAtStart() override;
915 void printDebugTracesAtEnd() override;
916};
917} // end namespace llvm
918
919/// Look for a meaningful debug location on the instruction or its
920/// operands.
922 if (!I)
923 return DebugLoc();
924
926 if (I->getDebugLoc() != Empty)
927 return I->getDebugLoc();
928
929 for (Use &Op : I->operands()) {
930 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
931 if (OpInst->getDebugLoc() != Empty)
932 return OpInst->getDebugLoc();
933 }
934
935 return I->getDebugLoc();
936}
937
938/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
939/// is passed, the message relates to that particular instruction.
940#ifndef NDEBUG
941static void debugVectorizationMessage(const StringRef Prefix,
942 const StringRef DebugMsg,
943 Instruction *I) {
944 dbgs() << "LV: " << Prefix << DebugMsg;
945 if (I != nullptr)
946 dbgs() << " " << *I;
947 else
948 dbgs() << '.';
949 dbgs() << '\n';
950}
951#endif
952
953/// Create an analysis remark that explains why vectorization failed
954///
955/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
956/// RemarkName is the identifier for the remark. If \p I is passed it is an
957/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
958/// the location of the remark. \return the remark object that can be
959/// streamed to.
961 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
962 Value *CodeRegion = TheLoop->getHeader();
963 DebugLoc DL = TheLoop->getStartLoc();
964
965 if (I) {
966 CodeRegion = I->getParent();
967 // If there is no debug location attached to the instruction, fall back to
968 // using the loop's.
969 if (I->getDebugLoc())
970 DL = I->getDebugLoc();
971 }
972
973 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
974}
975
976namespace llvm {
977
978/// Return a value for Step multiplied by VF.
980 int64_t Step) {
981 assert(Ty->isIntegerTy() && "Expected an integer step");
982 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
983}
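// For instance (editor's illustration): with VF = ElementCount::getScalable(4)
// and Step = 2, this materializes the runtime value vscale * 8, while a fixed
// VF of 4 simply folds to the constant 8.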
984
985/// Return the runtime value for VF.
987 return B.CreateElementCount(Ty, VF);
988}
989
991 Loop *OrigLoop) {
992 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
993 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
994
995 ScalarEvolution &SE = *PSE.getSE();
996 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
997}
998
1000 const StringRef OREMsg, const StringRef ORETag,
1001 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1002 Instruction *I) {
1003 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1004 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1005 ORE->emit(
1006 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1007 << "loop not vectorized: " << OREMsg);
1008}
1009
1010void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1011 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1012 Instruction *I) {
1014 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1015 ORE->emit(
1016 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1017 << Msg);
1018}
1019
1020/// Report successful vectorization of the loop. In case an outer loop is
1021/// vectorized, prepend "outer" to the vectorization remark.
1023 VectorizationFactor VF, unsigned IC) {
1025 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1026 nullptr));
1027 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1028 ORE->emit([&]() {
1029 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1030 TheLoop->getHeader())
1031 << "vectorized " << LoopType << "loop (vectorization width: "
1032 << ore::NV("VectorizationFactor", VF.Width)
1033 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1034 });
1035}
1036
1037} // end namespace llvm
1038
1039#ifndef NDEBUG
1040/// \return string containing a file name and a line # for the given loop.
1041static std::string getDebugLocString(const Loop *L) {
1042 std::string Result;
1043 if (L) {
1044 raw_string_ostream OS(Result);
1045 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1046 LoopDbgLoc.print(OS);
1047 else
1048 // Just print the module name.
1049 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1050 OS.flush();
1051 }
1052 return Result;
1053}
1054#endif
1055
1056namespace llvm {
1057
1058// Loop vectorization cost-model hints how the scalar epilogue loop should be
1059// lowered.
1061
1062 // The default: allowing scalar epilogues.
1064
1065 // Vectorization with OptForSize: don't allow epilogues.
1067
1068 // A special case of vectorization with OptForSize: loops with a very small
1069 // trip count are considered for vectorization under OptForSize, thereby
1070 // making sure the cost of their loop body is dominant, free of runtime
1071 // guards and scalar iteration overheads.
1073
1074 // Loop hint predicate indicating an epilogue is undesired.
1076
1077 // Directive indicating we must either tail fold or not vectorize
1080
1081using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1082
1083/// LoopVectorizationCostModel - estimates the expected speedups due to
1084/// vectorization.
1085/// In many cases vectorization is not profitable. This can happen for
1086/// a number of reasons. In this class we mainly attempt to predict the
1087/// expected speedup/slowdowns due to the supported instruction set. We use the
1088/// TargetTransformInfo to query the different backends for the cost of
1089/// different operations.
1091public:
1095 const TargetTransformInfo &TTI,
1101 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1102 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1103 Hints(Hints), InterleaveInfo(IAI) {}
1104
1105 /// \return An upper bound for the vectorization factors (both fixed and
1106 /// scalable). If the factors are 0, vectorization and interleaving should be
1107 /// avoided up front.
1108 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1109
1110 /// \return True if runtime checks are required for vectorization, and false
1111 /// otherwise.
1112 bool runtimeChecksRequired();
1113
1114 /// Setup cost-based decisions for user vectorization factor.
1115 /// \return true if the UserVF is a feasible VF to be chosen.
1119 return expectedCost(UserVF).first.isValid();
1120 }
1121
1122 /// \return The size (in bits) of the smallest and widest types in the code
1123 /// that needs to be vectorized. We ignore values that remain scalar such as
1124 /// 64 bit loop indices.
1125 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1126
1127 /// \return The desired interleave count.
1128 /// If interleave count has been specified by metadata it will be returned.
1129 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1130 /// are the selected vectorization factor and the cost of the selected VF.
1131 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1132
1133 /// Memory access instruction may be vectorized in more than one way.
1134 /// Form of instruction after vectorization depends on cost.
1135 /// This function takes cost-based decisions for Load/Store instructions
1136 /// and collects them in a map. This decisions map is used for building
1137 /// the lists of loop-uniform and loop-scalar instructions.
1138 /// The calculated cost is saved with widening decision in order to
1139 /// avoid redundant calculations.
1141
1142 /// A call may be vectorized in different ways depending on whether we have
1143 /// vectorized variants available and whether the target supports masking.
1144 /// This function analyzes all calls in the function at the supplied VF,
1145 /// makes a decision based on the costs of available options, and stores that
1146 /// decision in a map for use in planning and plan execution.
1148
1149 /// A struct that represents some properties of the register usage
1150 /// of a loop.
1152 /// Holds the number of loop invariant values that are used in the loop.
1153 /// The key is ClassID of target-provided register class.
1155 /// Holds the maximum number of concurrent live intervals in the loop.
1156 /// The key is ClassID of target-provided register class.
1158 };
1159
1160 /// \return Returns information about the register usages of the loop for the
1161 /// given vectorization factors.
1164
1165 /// Collect values we want to ignore in the cost model.
1166 void collectValuesToIgnore();
1167
1168 /// Collect all element types in the loop for which widening is needed.
1170
1171 /// Split reductions into those that happen in the loop, and those that happen
1172 /// outside. In loop reductions are collected into InLoopReductions.
1174
1175 /// Returns true if we should use strict in-order reductions for the given
1176 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1177 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1178 /// of FP operations.
1179 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1180 return !Hints->allowReordering() && RdxDesc.isOrdered();
1181 }
1182
1183 /// \returns The smallest bitwidth each instruction can be represented with.
1184 /// The vector equivalents of these instructions should be truncated to this
1185 /// type.
1187 return MinBWs;
1188 }
1189
1190 /// \returns True if it is more profitable to scalarize instruction \p I for
1191 /// vectorization factor \p VF.
1193 assert(VF.isVector() &&
1194 "Profitable to scalarize relevant only for VF > 1.");
1195 assert(
1196 TheLoop->isInnermost() &&
1197 "cost-model should not be used for outer loops (in VPlan-native path)");
1198
1199 auto Scalars = InstsToScalarize.find(VF);
1200 assert(Scalars != InstsToScalarize.end() &&
1201 "VF not yet analyzed for scalarization profitability");
1202 return Scalars->second.contains(I);
1203 }
1204
1205 /// Returns true if \p I is known to be uniform after vectorization.
1207 assert(
1208 TheLoop->isInnermost() &&
1209 "cost-model should not be used for outer loops (in VPlan-native path)");
1210 // Pseudo probe needs to be duplicated for each unrolled iteration and
1211 // vector lane so that profiled loop trip count can be accurately
1212 // accumulated instead of being undercounted.
1213 if (isa<PseudoProbeInst>(I))
1214 return false;
1215
1216 if (VF.isScalar())
1217 return true;
1218
1219 auto UniformsPerVF = Uniforms.find(VF);
1220 assert(UniformsPerVF != Uniforms.end() &&
1221 "VF not yet analyzed for uniformity");
1222 return UniformsPerVF->second.count(I);
1223 }
1224
1225 /// Returns true if \p I is known to be scalar after vectorization.
1227 assert(
1228 TheLoop->isInnermost() &&
1229 "cost-model should not be used for outer loops (in VPlan-native path)");
1230 if (VF.isScalar())
1231 return true;
1232
1233 auto ScalarsPerVF = Scalars.find(VF);
1234 assert(ScalarsPerVF != Scalars.end() &&
1235 "Scalar values are not calculated for VF");
1236 return ScalarsPerVF->second.count(I);
1237 }
1238
1239 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1240 /// for vectorization factor \p VF.
1242 return VF.isVector() && MinBWs.contains(I) &&
1243 !isProfitableToScalarize(I, VF) &&
1245 }
1246
1247 /// Decision that was taken during cost calculation for memory instruction.
1250 CM_Widen, // For consecutive accesses with stride +1.
1251 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1258
1259 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1260 /// instruction \p I and vector width \p VF.
1263 assert(VF.isVector() && "Expected VF >=2");
1264 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1265 }
1266
1267 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1268 /// interleaving group \p Grp and vector width \p VF.
1272 assert(VF.isVector() && "Expected VF >=2");
1273 /// Broadcast this decision to all instructions inside the group.
1274 /// But the cost will be assigned to one instruction only.
1275 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1276 if (auto *I = Grp->getMember(i)) {
1277 if (Grp->getInsertPos() == I)
1278 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1279 else
1280 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1281 }
1282 }
1283 }
1284
1285 /// Return the cost model decision for the given instruction \p I and vector
1286 /// width \p VF. Return CM_Unknown if this instruction did not pass
1287 /// through the cost modeling.
1289 assert(VF.isVector() && "Expected VF to be a vector VF");
1290 assert(
1291 TheLoop->isInnermost() &&
1292 "cost-model should not be used for outer loops (in VPlan-native path)");
1293
1294 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1295 auto Itr = WideningDecisions.find(InstOnVF);
1296 if (Itr == WideningDecisions.end())
1297 return CM_Unknown;
1298 return Itr->second.first;
1299 }
1300
1301 /// Return the vectorization cost for the given instruction \p I and vector
1302 /// width \p VF.
1304 assert(VF.isVector() && "Expected VF >=2");
1305 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1306 assert(WideningDecisions.contains(InstOnVF) &&
1307 "The cost is not calculated");
1308 return WideningDecisions[InstOnVF].second;
1309 }
1310
1315 std::optional<unsigned> MaskPos;
1317 };
1318
1320 Function *Variant, Intrinsic::ID IID,
1321 std::optional<unsigned> MaskPos,
1323 assert(!VF.isScalar() && "Expected vector VF");
1324 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1325 MaskPos, Cost};
1326 }
1327
1329 ElementCount VF) const {
1330 assert(!VF.isScalar() && "Expected vector VF");
1331 return CallWideningDecisions.at(std::make_pair(CI, VF));
1332 }
1333
1334 /// Return True if instruction \p I is an optimizable truncate whose operand
1335 /// is an induction variable. Such a truncate will be removed by adding a new
1336 /// induction variable with the destination type.
1338 // If the instruction is not a truncate, return false.
1339 auto *Trunc = dyn_cast<TruncInst>(I);
1340 if (!Trunc)
1341 return false;
1342
1343 // Get the source and destination types of the truncate.
1344 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1345 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1346
1347 // If the truncate is free for the given types, return false. Replacing a
1348 // free truncate with an induction variable would add an induction variable
1349 // update instruction to each iteration of the loop. We exclude from this
1350 // check the primary induction variable since it will need an update
1351 // instruction regardless.
1352 Value *Op = Trunc->getOperand(0);
1353 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1354 return false;
1355
1356 // If the truncated value is not an induction variable, return false.
1357 return Legal->isInductionPhi(Op);
1358 }
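  // For example (editor's sketch, not tied to a specific upstream test), a
  // pattern such as
  //   %iv.trunc = trunc i64 %iv to i32
  // (with %iv an induction PHI and a non-free truncate) is handled by creating
  // a new i32 induction variable instead of emitting the truncate on every
  // vectorized iteration.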
1359
1360 /// Collects the instructions to scalarize for each predicated instruction in
1361 /// the loop.
1363
1364 /// Collect Uniform and Scalar values for the given \p VF.
1365 /// The sets depend on CM decision for Load/Store instructions
1366 /// that may be vectorized as interleave, gather-scatter or scalarized.
1367 /// Also make a decision on what to do about call instructions in the loop
1368 /// at that VF -- scalarize, call a known vector routine, or call a
1369 /// vector intrinsic.
1371 // Do the analysis once.
1372 if (VF.isScalar() || Uniforms.contains(VF))
1373 return;
1376 collectLoopUniforms(VF);
1377 collectLoopScalars(VF);
1378 }
1379
1380 /// Returns true if the target machine supports masked store operation
1381 /// for the given \p DataType and kind of access to \p Ptr.
1382 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1383 return Legal->isConsecutivePtr(DataType, Ptr) &&
1384 TTI.isLegalMaskedStore(DataType, Alignment);
1385 }
1386
1387 /// Returns true if the target machine supports masked load operation
1388 /// for the given \p DataType and kind of access to \p Ptr.
1389 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1390 return Legal->isConsecutivePtr(DataType, Ptr) &&
1391 TTI.isLegalMaskedLoad(DataType, Alignment);
1392 }
1393
1394 /// Returns true if the target machine can represent \p V as a masked gather
1395 /// or scatter operation.
1397 bool LI = isa<LoadInst>(V);
1398 bool SI = isa<StoreInst>(V);
1399 if (!LI && !SI)
1400 return false;
1401 auto *Ty = getLoadStoreType(V);
1403 if (VF.isVector())
1404 Ty = VectorType::get(Ty, VF);
1405 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1406 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1407 }
1408
1409 /// Returns true if the target machine supports all of the reduction
1410 /// variables found for the given VF.
1412 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1413 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1414 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1415 }));
1416 }
1417
1418 /// Given costs for both strategies, return true if the scalar predication
1419 /// lowering should be used for div/rem. This incorporates an override
1420 /// option so it is not simply a cost comparison.
1422 InstructionCost SafeDivisorCost) const {
1423 switch (ForceSafeDivisor) {
1424 case cl::BOU_UNSET:
1425 return ScalarCost < SafeDivisorCost;
1426 case cl::BOU_TRUE:
1427 return false;
1428 case cl::BOU_FALSE:
1429 return true;
1430 };
1431 llvm_unreachable("impossible case value");
1432 }
1433
1434 /// Returns true if \p I is an instruction which requires predication and
1435 /// for which our chosen predication strategy is scalarization (i.e. we
1436 /// don't have an alternate strategy such as masking available).
1437 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1439
1440 /// Returns true if \p I is an instruction that needs to be predicated
1441 /// at runtime. The result is independent of the predication mechanism.
1442 /// Superset of instructions that return true for isScalarWithPredication.
1443 bool isPredicatedInst(Instruction *I) const;
1444
1445 /// Return the costs for our two available strategies for lowering a
1446 /// div/rem operation which requires speculating at least one lane.
1447 /// First result is for scalarization (will be invalid for scalable
1448 /// vectors); second is for the safe-divisor strategy.
1449 std::pair<InstructionCost, InstructionCost>
1451 ElementCount VF) const;
1452
1453 /// Returns true if \p I is a memory instruction with consecutive memory
1454 /// access that can be widened.
1456
1457 /// Returns true if \p I is a memory instruction in an interleaved-group
1458 /// of memory accesses that can be vectorized with wide vector loads/stores
1459 /// and shuffles.
1461
1462 /// Check if \p Instr belongs to any interleaved access group.
1464 return InterleaveInfo.isInterleaved(Instr);
1465 }
1466
1467 /// Get the interleaved access group that \p Instr belongs to.
1470 return InterleaveInfo.getInterleaveGroup(Instr);
1471 }
1472
1473 /// Returns true if we're required to use a scalar epilogue for at least
1474 /// the final iteration of the original loop.
1475 bool requiresScalarEpilogue(bool IsVectorizing) const {
1477 return false;
1478 // If we might exit from anywhere but the latch, we must run the exiting
1479 // iteration in scalar form.
1481 return true;
1482 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1483 }
1484
1485 /// Returns true if we're required to use a scalar epilogue for at least
1486 /// the final iteration of the original loop for all VFs in \p Range.
1487 /// A scalar epilogue must either be required for all VFs in \p Range or for
1488 /// none.
1490 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1491 return requiresScalarEpilogue(VF.isVector());
1492 };
1493 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1494 assert(
1495 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1496 "all VFs in range must agree on whether a scalar epilogue is required");
1497 return IsRequired;
1498 }
1499
1500 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1501 /// loop hint annotation.
1503 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1504 }
1505
1506 /// Returns the TailFoldingStyle that is best for the current loop.
1507 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1508 return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
1509 : ChosenTailFoldingStyle.second;
1510 }
1511
1512 /// Selects and saves the TailFoldingStyle for both cases: whether the IV
1513 /// update may overflow or not.
1515 assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
1516 ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
1517 "Tail folding must not be selected yet.");
1519 return;
1520
1521 if (ForceTailFoldingStyle.getNumOccurrences()) {
1522 ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
1524 return;
1525 }
1526
1527 ChosenTailFoldingStyle.first =
1528 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
1529 ChosenTailFoldingStyle.second =
1530 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
1531 }
1532
1533 /// Returns true if all loop blocks should be masked to fold tail loop.
1534 bool foldTailByMasking() const {
1535 // TODO: check if it is possible to check for None style independent of
1536 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1538 }
1539
1540 /// Returns true if the instructions in this block requires predication
1541 /// for any reason, e.g. because tail folding now requires a predicate
1542 /// or because the block in the original loop was predicated.
1545 }
1546
1547 /// Returns true if the Phi is part of an inloop reduction.
1548 bool isInLoopReduction(PHINode *Phi) const {
1549 return InLoopReductions.contains(Phi);
1550 }
1551
1552 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1553 /// with factor VF. Return the cost of the instruction, including
1554 /// scalarization overhead if it's needed.
1556
1557 /// Estimate cost of a call instruction CI if it were vectorized with factor
1558 /// VF. Return the cost of the instruction, including scalarization overhead
1559 /// if it's needed.
1561
1562 /// Invalidates decisions already taken by the cost model.
1564 WideningDecisions.clear();
1565 CallWideningDecisions.clear();
1566 Uniforms.clear();
1567 Scalars.clear();
1568 }
1569
1570 /// The vectorization cost is a combination of the cost itself and a boolean
1571 /// indicating whether any of the contributing operations will actually
1572 /// operate on vector values after type legalization in the backend. If this
1573 /// latter value is false, then all operations will be scalarized (i.e. no
1574 /// vectorization has actually taken place).
1575 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1576
1577 /// Returns the expected execution cost. The unit of the cost does
1578 /// not matter because we use the 'cost' units to compare different
1579 /// vector widths. The cost that is returned is *not* normalized by
1580 /// the factor width. If \p Invalid is not nullptr, this function
1581 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1582 /// each instruction that has an Invalid cost for the given VF.
1586
1587 bool hasPredStores() const { return NumPredStores > 0; }
1588
1589 /// Returns true if epilogue vectorization is considered profitable, and
1590 /// false otherwise.
1591 /// \p VF is the vectorization factor chosen for the original loop.
1593
1594private:
1595 unsigned NumPredStores = 0;
1596
1597 /// \return An upper bound for the vectorization factors for both
1598 /// fixed and scalable vectorization, where the minimum-known number of
1599 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1600 /// disabled or unsupported, then the scalable part will be equal to
1601 /// ElementCount::getScalable(0).
1602 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1603 ElementCount UserVF,
1604 bool FoldTailByMasking);
1605
1606 /// \return the maximized element count based on the targets vector
1607 /// registers and the loop trip-count, but limited to a maximum safe VF.
1608 /// This is a helper function of computeFeasibleMaxVF.
1609 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1610 unsigned SmallestType,
1611 unsigned WidestType,
1612 ElementCount MaxSafeVF,
1613 bool FoldTailByMasking);
1614
1615 /// \return the maximum legal scalable VF, based on the safe max number
1616 /// of elements.
1617 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1618
1619 /// Returns the execution time cost of an instruction for a given vector
1620 /// width. Vector width of one means scalar.
1621 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1622
1623 /// The cost-computation logic from getInstructionCost which provides
1624 /// the vector type as an output parameter.
1625 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1626 Type *&VectorTy);
1627
1628 /// Return the cost of instructions in an inloop reduction pattern, if I is
1629 /// part of that pattern.
1630 std::optional<InstructionCost>
1631 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1633
1634 /// Calculate vectorization cost of memory instruction \p I.
1635 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1636
1637 /// The cost computation for a scalarized memory instruction.
1638 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1639
1640 /// The cost computation for an interleave group of memory instructions.
1641 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1642
1643 /// The cost computation for Gather/Scatter instruction.
1644 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1645
1646 /// The cost computation for widening instruction \p I with consecutive
1647 /// memory access.
1648 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1649
1650 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1651 /// Load: scalar load + broadcast.
1652 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1653 /// element)
1654 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
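// Illustrative sketch (not taken from this file) of the "scalar load +
// broadcast" pattern costed here, assuming VF = 4:
//   %s = load i32, ptr %p
//   %v = insertelement <4 x i32> poison, i32 %s, i64 0
//   %b = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer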
1655
1656 /// Estimate the overhead of scalarizing an instruction. This is a
1657 /// convenience wrapper for the type-based getScalarizationOverhead API.
1658 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1660
1661 /// Returns true if an artificially high cost for emulated masked memrefs
1662 /// should be used.
1663 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1664
1665 /// Map of scalar integer values to the smallest bitwidth they can be legally
1666 /// represented as. The vector equivalents of these values should be truncated
1667 /// to this type.
1669
1670 /// A type representing the costs for instructions if they were to be
1671 /// scalarized rather than vectorized. The entries are Instruction-Cost
1672 /// pairs.
1673 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1674
1675 /// A set containing all BasicBlocks that are known to be present after
1676 /// vectorization as predicated blocks.
1678 PredicatedBBsAfterVectorization;
1679
1680 /// Records whether it is allowed to have the original scalar loop execute at
1681 /// least once. This may be needed as a fallback loop in case runtime
1682 /// aliasing/dependence checks fail, or to handle the tail/remainder
1683 /// iterations when the trip count is unknown or doesn't divide by the VF,
1684 /// or as a peel-loop to handle gaps in interleave-groups.
1685 /// Under optsize and when the trip count is very small we don't allow any
1686 /// iterations to execute in the scalar loop.
1687 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1688
1689 /// Controls the tail folding style that is finally chosen. The first element
1690 /// is used if the IV update may overflow; the second element is used if it does not.
1691 std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
1693
1694 /// A map holding scalar costs for different vectorization factors. The
1695 /// presence of a cost for an instruction in the mapping indicates that the
1696 /// instruction will be scalarized when vectorizing with the associated
1697 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1699
1700 /// Holds the instructions known to be uniform after vectorization.
1701 /// The data is collected per VF.
1703
1704 /// Holds the instructions known to be scalar after vectorization.
1705 /// The data is collected per VF.
1707
1708 /// Holds the instructions (address computations) that are forced to be
1709 /// scalarized.
1711
1712 /// PHINodes of the reductions that should be expanded in-loop.
1713 SmallPtrSet<PHINode *, 4> InLoopReductions;
1714
1715 /// A Map of inloop reduction operations and their immediate chain operand.
1716 /// FIXME: This can be removed once reductions can be costed correctly in
1717 /// VPlan. This was added to allow quick lookup of the inloop operations.
1718 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1719
1720 /// Returns the expected difference in cost from scalarizing the expression
1721 /// feeding a predicated instruction \p PredInst. The instructions to
1722 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1723 /// non-negative return value implies the expression will be scalarized.
1724 /// Currently, only single-use chains are considered for scalarization.
1725 InstructionCost computePredInstDiscount(Instruction *PredInst,
1726 ScalarCostsTy &ScalarCosts,
1727 ElementCount VF);
1728
1729 /// Collect the instructions that are uniform after vectorization. An
1730 /// instruction is uniform if we represent it with a single scalar value in
1731 /// the vectorized loop corresponding to each vector iteration. Examples of
1732 /// uniform instructions include pointer operands of consecutive or
1733 /// interleaved memory accesses. Note that although uniformity implies an
1734 /// instruction will be scalar, the reverse is not true. In general, a
1735 /// scalarized instruction will be represented by VF scalar values in the
1736 /// vectorized loop, each corresponding to an iteration of the original
1737 /// scalar loop.
1738 void collectLoopUniforms(ElementCount VF);
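// Illustrative sketch (not taken from this file): the pointer operand of a
// consecutive load is uniform, so one scalar GEP per vector iteration
// suffices, e.g. with VF = 4:
//   %p = getelementptr inbounds i32, ptr %A, i64 %index
//   %wide.load = load <4 x i32>, ptr %p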
1739
1740 /// Collect the instructions that are scalar after vectorization. An
1741 /// instruction is scalar if it is known to be uniform or will be scalarized
1742 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1743 /// to the list if they are used by a load/store instruction that is marked as
1744 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1745 /// VF values in the vectorized loop, each corresponding to an iteration of
1746 /// the original scalar loop.
1747 void collectLoopScalars(ElementCount VF);
1748
1749 /// Keeps cost model vectorization decision and cost for instructions.
1750 /// Right now it is used for memory instructions only.
1752 std::pair<InstWidening, InstructionCost>>;
1753
1754 DecisionList WideningDecisions;
1755
1756 using CallDecisionList =
1757 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1758
1759 CallDecisionList CallWideningDecisions;
1760
1761 /// Returns true if \p V is expected to be vectorized and it needs to be
1762 /// extracted.
1763 bool needsExtract(Value *V, ElementCount VF) const {
1764 Instruction *I = dyn_cast<Instruction>(V);
1765 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1767 return false;
1768
1769 // Assume we can vectorize V (and hence we need extraction) if the
1770 // scalars are not computed yet. This can happen, because it is called
1771 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1772 // the scalars are collected. That should be a safe assumption in most
1773 // cases, because we check if the operands have vectorizable types
1774 // beforehand in LoopVectorizationLegality.
1775 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1776 };
1777
1778 /// Returns a range containing only operands needing to be extracted.
1779 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1780 ElementCount VF) const {
1782 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1783 }
1784
1785public:
1786 /// The loop that we evaluate.
1788
1789 /// Predicated scalar evolution analysis.
1791
1792 /// Loop Info analysis.
1794
1795 /// Vectorization legality.
1797
1798 /// Vector target information.
1800
1801 /// Target Library Info.
1803
1804 /// Demanded bits analysis.
1806
1807 /// Assumption cache.
1809
1810 /// Interface to emit optimization remarks.
1812
1814
1815 /// Loop Vectorize Hint.
1817
1818 /// The interleave access information contains groups of interleaved accesses
1819 /// with the same stride that are close to each other.
1821
1822 /// Values to ignore in the cost model.
1824
1825 /// Values to ignore in the cost model when VF > 1.
1827
1828 /// All element types found in the loop.
1830};
1831} // end namespace llvm
1832
1833namespace {
1834/// Helper struct to manage generating runtime checks for vectorization.
1835///
1836/// The runtime checks are created up-front in temporary blocks to allow better
1837/// estimating the cost and un-linked from the existing IR. After deciding to
1838/// vectorize, the checks are moved back. If deciding not to vectorize, the
1839/// temporary blocks are completely removed.
1840class GeneratedRTChecks {
1841 /// Basic block which contains the generated SCEV checks, if any.
1842 BasicBlock *SCEVCheckBlock = nullptr;
1843
1844 /// The value representing the result of the generated SCEV checks. If it is
1845 /// nullptr, either no SCEV checks have been generated or they have been used.
1846 Value *SCEVCheckCond = nullptr;
1847
1848 /// Basic block which contains the generated memory runtime checks, if any.
1849 BasicBlock *MemCheckBlock = nullptr;
1850
1851 /// The value representing the result of the generated memory runtime checks.
1852 /// If it is nullptr, either no memory runtime checks have been generated or
1853 /// they have been used.
1854 Value *MemRuntimeCheckCond = nullptr;
1855
1856 DominatorTree *DT;
1857 LoopInfo *LI;
1859
1860 SCEVExpander SCEVExp;
1861 SCEVExpander MemCheckExp;
1862
1863 bool CostTooHigh = false;
1864 const bool AddBranchWeights;
1865
1866 Loop *OuterLoop = nullptr;
1867
1868public:
1869 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1871 bool AddBranchWeights)
1872 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1873 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1874
1875 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1876 /// accurately estimate the cost of the runtime checks. The blocks are
1877 /// un-linked from the IR and added back during vector code generation. If
1878 /// there is no vector code generation, the check blocks are removed
1879 /// completely.
1880 void Create(Loop *L, const LoopAccessInfo &LAI,
1881 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1882
1883 // Hard cutoff to limit compile-time increase in case a very large number of
1884 // runtime checks needs to be generated.
1885 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1886 // profile info.
1887 CostTooHigh =
1889 if (CostTooHigh)
1890 return;
1891
1892 BasicBlock *LoopHeader = L->getHeader();
1893 BasicBlock *Preheader = L->getLoopPreheader();
1894
1895 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1896 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1897 // may be used by SCEVExpander. The blocks will be un-linked from their
1898 // predecessors and removed from LI & DT at the end of the function.
1899 if (!UnionPred.isAlwaysTrue()) {
1900 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1901 nullptr, "vector.scevcheck");
1902
1903 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1904 &UnionPred, SCEVCheckBlock->getTerminator());
1905 }
1906
1907 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1908 if (RtPtrChecking.Need) {
1909 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1910 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1911 "vector.memcheck");
1912
1913 auto DiffChecks = RtPtrChecking.getDiffChecks();
1914 if (DiffChecks) {
1915 Value *RuntimeVF = nullptr;
1916 MemRuntimeCheckCond = addDiffRuntimeChecks(
1917 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1918 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1919 if (!RuntimeVF)
1920 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1921 return RuntimeVF;
1922 },
1923 IC);
1924 } else {
1925 MemRuntimeCheckCond = addRuntimeChecks(
1926 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1928 }
1929 assert(MemRuntimeCheckCond &&
1930 "no RT checks generated although RtPtrChecking "
1931 "claimed checks are required");
1932 }
1933
1934 if (!MemCheckBlock && !SCEVCheckBlock)
1935 return;
1936
1937 // Unhook the temporary blocks with the checks and update various places
1938 // accordingly.
1939 if (SCEVCheckBlock)
1940 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1941 if (MemCheckBlock)
1942 MemCheckBlock->replaceAllUsesWith(Preheader);
1943
1944 if (SCEVCheckBlock) {
1945 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1946 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1947 Preheader->getTerminator()->eraseFromParent();
1948 }
1949 if (MemCheckBlock) {
1950 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1951 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1952 Preheader->getTerminator()->eraseFromParent();
1953 }
1954
1955 DT->changeImmediateDominator(LoopHeader, Preheader);
1956 if (MemCheckBlock) {
1957 DT->eraseNode(MemCheckBlock);
1958 LI->removeBlock(MemCheckBlock);
1959 }
1960 if (SCEVCheckBlock) {
1961 DT->eraseNode(SCEVCheckBlock);
1962 LI->removeBlock(SCEVCheckBlock);
1963 }
1964
1965 // Outer loop is used as part of the later cost calculations.
1966 OuterLoop = L->getParentLoop();
1967 }
1968
1969 InstructionCost getCost() {
1970 if (SCEVCheckBlock || MemCheckBlock)
1971 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1972
1973 if (CostTooHigh) {
1975 Cost.setInvalid();
1976 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1977 return Cost;
1978 }
1979
1980 InstructionCost RTCheckCost = 0;
1981 if (SCEVCheckBlock)
1982 for (Instruction &I : *SCEVCheckBlock) {
1983 if (SCEVCheckBlock->getTerminator() == &I)
1984 continue;
1987 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1988 RTCheckCost += C;
1989 }
1990 if (MemCheckBlock) {
1991 InstructionCost MemCheckCost = 0;
1992 for (Instruction &I : *MemCheckBlock) {
1993 if (MemCheckBlock->getTerminator() == &I)
1994 continue;
1997 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1998 MemCheckCost += C;
1999 }
2000
2001 // If the runtime memory checks are being created inside an outer loop
2002 // we should find out if these checks are outer loop invariant. If so,
2003 // the checks will likely be hoisted out and so the effective cost will be
2004 // reduced according to the outer loop trip count.
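// Illustrative example (assumed numbers): if the memory checks cost 20 and
// the outer loop is estimated to run 10 times, the hoisted checks execute
// once while the vectorized inner loop is entered about 10 times, so the
// effective per-entry cost becomes 20 / 10 = 2.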
2005 if (OuterLoop) {
2006 ScalarEvolution *SE = MemCheckExp.getSE();
2007 // TODO: If profitable, we could refine this further by analysing every
2008 // individual memory check, since there could be a mixture of loop
2009 // variant and invariant checks that mean the final condition is
2010 // variant.
2011 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2012 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2013 // It seems reasonable to assume that we can reduce the effective
2014 // cost of the checks even when we know nothing about the trip
2015 // count. Assume that the outer loop executes at least twice.
2016 unsigned BestTripCount = 2;
2017
2018 // If the exact trip count is known, use that.
2019 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2020 BestTripCount = SmallTC;
2022 // Else use profile data if available.
2023 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2024 BestTripCount = *EstimatedTC;
2025 }
2026
2027 BestTripCount = std::max(BestTripCount, 1U);
2028 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2029
2030 // Let's ensure the cost is always at least 1.
2031 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2033
2034 if (BestTripCount > 1)
2036 << "We expect runtime memory checks to be hoisted "
2037 << "out of the outer loop. Cost reduced from "
2038 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2039
2040 MemCheckCost = NewMemCheckCost;
2041 }
2042 }
2043
2044 RTCheckCost += MemCheckCost;
2045 }
2046
2047 if (SCEVCheckBlock || MemCheckBlock)
2048 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2049 << "\n");
2050
2051 return RTCheckCost;
2052 }
2053
2054 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2055 /// unused.
2056 ~GeneratedRTChecks() {
2057 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2058 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2059 if (!SCEVCheckCond)
2060 SCEVCleaner.markResultUsed();
2061
2062 if (!MemRuntimeCheckCond)
2063 MemCheckCleaner.markResultUsed();
2064
2065 if (MemRuntimeCheckCond) {
2066 auto &SE = *MemCheckExp.getSE();
2067 // Memory runtime check generation creates compares that use expanded
2068 // values. Remove them before running the SCEVExpanderCleaners.
2069 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2070 if (MemCheckExp.isInsertedInstruction(&I))
2071 continue;
2072 SE.forgetValue(&I);
2073 I.eraseFromParent();
2074 }
2075 }
2076 MemCheckCleaner.cleanup();
2077 SCEVCleaner.cleanup();
2078
2079 if (SCEVCheckCond)
2080 SCEVCheckBlock->eraseFromParent();
2081 if (MemRuntimeCheckCond)
2082 MemCheckBlock->eraseFromParent();
2083 }
2084
2085 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2086 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2087 /// depending on the generated condition.
2088 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2089 BasicBlock *LoopVectorPreHeader,
2090 BasicBlock *LoopExitBlock) {
2091 if (!SCEVCheckCond)
2092 return nullptr;
2093
2094 Value *Cond = SCEVCheckCond;
2095 // Mark the check as used, to prevent it from being removed during cleanup.
2096 SCEVCheckCond = nullptr;
2097 if (auto *C = dyn_cast<ConstantInt>(Cond))
2098 if (C->isZero())
2099 return nullptr;
2100
2101 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2102
2103 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2104 // Create new preheader for vector loop.
2105 if (OuterLoop)
2106 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2107
2108 SCEVCheckBlock->getTerminator()->eraseFromParent();
2109 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2110 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2111 SCEVCheckBlock);
2112
2113 DT->addNewBlock(SCEVCheckBlock, Pred);
2114 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2115
2116 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2117 if (AddBranchWeights)
2119 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2120 return SCEVCheckBlock;
2121 }
2122
2123 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2124 /// the branches to branch to the vector preheader or \p Bypass, depending on
2125 /// the generated condition.
2126 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2127 BasicBlock *LoopVectorPreHeader) {
2128 // Check if we generated code that checks at runtime whether arrays overlap.
2129 if (!MemRuntimeCheckCond)
2130 return nullptr;
2131
2132 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2133 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2134 MemCheckBlock);
2135
2136 DT->addNewBlock(MemCheckBlock, Pred);
2137 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2138 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2139
2140 if (OuterLoop)
2141 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2142
2143 BranchInst &BI =
2144 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2145 if (AddBranchWeights) {
2147 }
2148 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2149 MemCheckBlock->getTerminator()->setDebugLoc(
2150 Pred->getTerminator()->getDebugLoc());
2151
2152 // Mark the check as used, to prevent it from being removed during cleanup.
2153 MemRuntimeCheckCond = nullptr;
2154 return MemCheckBlock;
2155 }
2156};
2157} // namespace
2158
2160 return Style == TailFoldingStyle::Data ||
2161 Style == TailFoldingStyle::DataAndControlFlow ||
2162 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2163}
2164
2166 return Style == TailFoldingStyle::DataAndControlFlow ||
2167 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2168}
2169
2170// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2171 // vectorization. The loop needs to be annotated with #pragma omp simd
2172 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2173// vector length information is not provided, vectorization is not considered
2174// explicit. Interleave hints are not allowed either. These limitations will be
2175// relaxed in the future.
2176 // Please note that we are currently forced to abuse the pragma 'clang
2177// vectorize' semantics. This pragma provides *auto-vectorization hints*
2178// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2179// provides *explicit vectorization hints* (LV can bypass legal checks and
2180// assume that vectorization is legal). However, both hints are implemented
2181// using the same metadata (llvm.loop.vectorize, processed by
2182// LoopVectorizeHints). This will be fixed in the future when the native IR
2183// representation for pragma 'omp simd' is introduced.
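// Illustrative example (assumed source, not from this file) of an outer loop
// satisfying these requirements:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];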
2184static bool isExplicitVecOuterLoop(Loop *OuterLp,
2186 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2187 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2188
2189 // Only outer loops with an explicit vectorization hint are supported.
2190 // Unannotated outer loops are ignored.
2192 return false;
2193
2194 Function *Fn = OuterLp->getHeader()->getParent();
2195 if (!Hints.allowVectorization(Fn, OuterLp,
2196 true /*VectorizeOnlyWhenForced*/)) {
2197 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2198 return false;
2199 }
2200
2201 if (Hints.getInterleave() > 1) {
2202 // TODO: Interleave support is future work.
2203 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2204 "outer loops.\n");
2205 Hints.emitRemarkWithHints();
2206 return false;
2207 }
2208
2209 return true;
2210}
2211
2215 // Collect inner loops and outer loops without irreducible control flow. For
2216 // now, only collect outer loops that have explicit vectorization hints. If we
2217 // are stress testing the VPlan H-CFG construction, we collect the outermost
2218 // loop of every loop nest.
2219 if (L.isInnermost() || VPlanBuildStressTest ||
2221 LoopBlocksRPO RPOT(&L);
2222 RPOT.perform(LI);
2223 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2224 V.push_back(&L);
2225 // TODO: Collect inner loops inside marked outer loops in case
2226 // vectorization fails for the outer loop. Do not invoke
2227 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2228 // already known to be reducible. We can use an inherited attribute for
2229 // that.
2230 return;
2231 }
2232 }
2233 for (Loop *InnerL : L)
2234 collectSupportedLoops(*InnerL, LI, ORE, V);
2235}
2236
2237//===----------------------------------------------------------------------===//
2238// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2239// LoopVectorizationCostModel and LoopVectorizationPlanner.
2240//===----------------------------------------------------------------------===//
2241
2242/// Compute the transformed value of Index at offset StartValue using step
2243/// StepValue.
2244/// For integer induction, returns StartValue + Index * StepValue.
2245/// For pointer induction, returns StartValue[Index * StepValue].
2246/// FIXME: The newly created binary instructions should contain nsw/nuw
2247/// flags, which can be found from the original scalar operations.
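// Illustrative sketch (assumed operands, not from this file): with StartValue
// %start, Step 4 and Index %i, an integer induction yields
//   %offset = mul i64 %i, 4
//   %transformed = add i64 %start, %offset
// while a pointer induction instead yields
//   %transformed = getelementptr i8, ptr %start, i64 %offset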
2248static Value *
2250 Value *Step,
2252 const BinaryOperator *InductionBinOp) {
2253 Type *StepTy = Step->getType();
2254 Value *CastedIndex = StepTy->isIntegerTy()
2255 ? B.CreateSExtOrTrunc(Index, StepTy)
2256 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2257 if (CastedIndex != Index) {
2258 CastedIndex->setName(CastedIndex->getName() + ".cast");
2259 Index = CastedIndex;
2260 }
2261
2262 // Note: the IR at this point is broken. We cannot use SE to create any new
2263 // SCEV and then expand it, hoping that SCEV's simplification will give us
2264 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2265 // lead to various SCEV crashes. So all we can do is to use builder and rely
2266 // on InstCombine for future simplifications. Here we handle some trivial
2267 // cases only.
2268 auto CreateAdd = [&B](Value *X, Value *Y) {
2269 assert(X->getType() == Y->getType() && "Types don't match!");
2270 if (auto *CX = dyn_cast<ConstantInt>(X))
2271 if (CX->isZero())
2272 return Y;
2273 if (auto *CY = dyn_cast<ConstantInt>(Y))
2274 if (CY->isZero())
2275 return X;
2276 return B.CreateAdd(X, Y);
2277 };
2278
2279 // We allow X to be a vector type, in which case Y will potentially be
2280 // splatted into a vector with the same element count.
2281 auto CreateMul = [&B](Value *X, Value *Y) {
2282 assert(X->getType()->getScalarType() == Y->getType() &&
2283 "Types don't match!");
2284 if (auto *CX = dyn_cast<ConstantInt>(X))
2285 if (CX->isOne())
2286 return Y;
2287 if (auto *CY = dyn_cast<ConstantInt>(Y))
2288 if (CY->isOne())
2289 return X;
2290 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2291 if (XVTy && !isa<VectorType>(Y->getType()))
2292 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2293 return B.CreateMul(X, Y);
2294 };
2295
2296 switch (InductionKind) {
2298 assert(!isa<VectorType>(Index->getType()) &&
2299 "Vector indices not supported for integer inductions yet");
2300 assert(Index->getType() == StartValue->getType() &&
2301 "Index type does not match StartValue type");
2302 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2303 return B.CreateSub(StartValue, Index);
2304 auto *Offset = CreateMul(Index, Step);
2305 return CreateAdd(StartValue, Offset);
2306 }
2308 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2310 assert(!isa<VectorType>(Index->getType()) &&
2311 "Vector indices not supported for FP inductions yet");
2312 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2313 assert(InductionBinOp &&
2314 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2315 InductionBinOp->getOpcode() == Instruction::FSub) &&
2316 "Original bin op should be defined for FP induction");
2317
2318 Value *MulExp = B.CreateFMul(Step, Index);
2319 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2320 "induction");
2321 }
2323 return nullptr;
2324 }
2325 llvm_unreachable("invalid enum");
2326}
2327
2328std::optional<unsigned> getMaxVScale(const Function &F,
2329 const TargetTransformInfo &TTI) {
2330 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2331 return MaxVScale;
2332
2333 if (F.hasFnAttribute(Attribute::VScaleRange))
2334 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2335
2336 return std::nullopt;
2337}
2338
2339 /// For the given VF and UF and maximum trip count computed for the loop, return
2340 /// true if the induction variable cannot overflow in the vectorized loop, in
2341 /// which case the runtime overflow check is known to always evaluate to false
2342 /// and can be removed.
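// Illustrative example (assumed numbers): for an i32 induction with a known
// maximum trip count of 1000, VF = 4 and UF = 2, the test is
//   (2^32 - 1) - 1000 > 4 * 2
// which holds, so the vectorized induction cannot wrap and the runtime
// overflow check can be dropped.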
2345 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2346 // Always be conservative if we don't know the exact unroll factor.
2347 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2348
2349 Type *IdxTy = Cost->Legal->getWidestInductionType();
2350 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2351
2352 // The runtime overflow check is known to be false iff the (max) trip-count
2353 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2354 // the vector loop induction variable.
2355 if (unsigned TC =
2356 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2357 uint64_t MaxVF = VF.getKnownMinValue();
2358 if (VF.isScalable()) {
2359 std::optional<unsigned> MaxVScale =
2360 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2361 if (!MaxVScale)
2362 return false;
2363 MaxVF *= *MaxVScale;
2364 }
2365
2366 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2367 }
2368
2369 return false;
2370}
2371
2372// Return whether we allow using masked interleave-groups (for dealing with
2373// strided loads/stores that reside in predicated blocks, or for dealing
2374// with gaps).
2376 // If an override option has been passed in for interleaved accesses, use it.
2377 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2379
2381}
2382
2383// Try to vectorize the interleave group that \p Instr belongs to.
2384//
2385// E.g. Translate following interleaved load group (factor = 3):
2386// for (i = 0; i < N; i+=3) {
2387// R = Pic[i]; // Member of index 0
2388// G = Pic[i+1]; // Member of index 1
2389// B = Pic[i+2]; // Member of index 2
2390// ... // do something to R, G, B
2391// }
2392// To:
2393// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2394// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2395// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2396// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2397//
2398// Or translate following interleaved store group (factor = 3):
2399// for (i = 0; i < N; i+=3) {
2400// ... do something to R, G, B
2401// Pic[i] = R; // Member of index 0
2402// Pic[i+1] = G; // Member of index 1
2403// Pic[i+2] = B; // Member of index 2
2404// }
2405// To:
2406// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2407// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2408// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2409// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2410// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2413 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2414 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2415 Instruction *Instr = Group->getInsertPos();
2416 const DataLayout &DL = Instr->getModule()->getDataLayout();
2417
2418 // Prepare for the vector type of the interleaved load/store.
2419 Type *ScalarTy = getLoadStoreType(Instr);
2420 unsigned InterleaveFactor = Group->getFactor();
2421 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2422
2423 // Prepare for the new pointers.
2424 SmallVector<Value *, 2> AddrParts;
2425 unsigned Index = Group->getIndex(Instr);
2426
2427 // TODO: extend the masked interleaved-group support to reversed access.
2428 assert((!BlockInMask || !Group->isReverse()) &&
2429 "Reversed masked interleave-group not supported.");
2430
2431 Value *Idx;
2432 // If the group is reverse, adjust the index to refer to the last vector lane
2433 // instead of the first. We adjust the index from the first vector lane,
2434 // rather than directly getting the pointer for lane VF - 1, because the
2435 // pointer operand of the interleaved access is supposed to be uniform. For
2436 // uniform instructions, we're only required to generate a value for the
2437 // first vector lane in each unroll iteration.
2438 if (Group->isReverse()) {
2439 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2440 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2444 } else
2446
2447 for (unsigned Part = 0; Part < UF; Part++) {
2448 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2449 if (auto *I = dyn_cast<Instruction>(AddrPart))
2450 State.setDebugLocFrom(I->getDebugLoc());
2451
2452 // Note that the current instruction may be a member at any index. We need to
2453 // adjust the address to that of the member at index 0.
2454 //
2455 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2456 // b = A[i]; // Member of index 0
2457 // The current pointer points to A[i+1]; adjust it to A[i].
2458 //
2459 // E.g. A[i+1] = a; // Member of index 1
2460 // A[i] = b; // Member of index 0
2461 // A[i+2] = c; // Member of index 2 (Current instruction)
2462 // The current pointer points to A[i+2]; adjust it to A[i].
2463
2464 bool InBounds = false;
2465 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2466 InBounds = gep->isInBounds();
2467 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2468 AddrParts.push_back(AddrPart);
2469 }
2470
2471 State.setDebugLocFrom(Instr->getDebugLoc());
2472 Value *PoisonVec = PoisonValue::get(VecTy);
2473
2474 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2475 unsigned Part, Value *MaskForGaps) -> Value * {
2476 if (VF.isScalable()) {
2477 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2478 assert(InterleaveFactor == 2 &&
2479 "Unsupported deinterleave factor for scalable vectors");
2480 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2481 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2482 auto *MaskTy =
2484 return Builder.CreateIntrinsic(
2485 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2486 /*FMFSource=*/nullptr, "interleaved.mask");
2487 }
2488
2489 if (!BlockInMask)
2490 return MaskForGaps;
2491
2492 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2493 Value *ShuffledMask = Builder.CreateShuffleVector(
2494 BlockInMaskPart,
2495 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2496 "interleaved.mask");
2497 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2498 MaskForGaps)
2499 : ShuffledMask;
2500 };
2501
2502 // Vectorize the interleaved load group.
2503 if (isa<LoadInst>(Instr)) {
2504 Value *MaskForGaps = nullptr;
2505 if (NeedsMaskForGaps) {
2506 MaskForGaps =
2508 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2509 }
2510
2511 // For each unroll part, create a wide load for the group.
2512 SmallVector<Value *, 2> NewLoads;
2513 for (unsigned Part = 0; Part < UF; Part++) {
2514 Instruction *NewLoad;
2515 if (BlockInMask || MaskForGaps) {
2517 "masked interleaved groups are not allowed.");
2518 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2519 NewLoad =
2520 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2521 GroupMask, PoisonVec, "wide.masked.vec");
2522 }
2523 else
2524 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2525 Group->getAlign(), "wide.vec");
2526 Group->addMetadata(NewLoad);
2527 NewLoads.push_back(NewLoad);
2528 }
2529
2530 if (VecTy->isScalableTy()) {
2531 assert(InterleaveFactor == 2 &&
2532 "Unsupported deinterleave factor for scalable vectors");
2533
2534 for (unsigned Part = 0; Part < UF; ++Part) {
2535 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2536 // so we must use intrinsics to deinterleave.
2538 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2539 /*FMFSource=*/nullptr, "strided.vec");
2540 unsigned J = 0;
2541 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2542 Instruction *Member = Group->getMember(I);
2543
2544 if (!Member)
2545 continue;
2546
2547 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2548 // If this member has a different type, cast the result to it.
2549 if (Member->getType() != ScalarTy) {
2550 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2551 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2552 }
2553
2554 if (Group->isReverse())
2555 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2556
2557 State.set(VPDefs[J], StridedVec, Part);
2558 ++J;
2559 }
2560 }
2561
2562 return;
2563 }
2564
2565 // For each member in the group, shuffle out the appropriate data from the
2566 // wide loads.
2567 unsigned J = 0;
2568 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2569 Instruction *Member = Group->getMember(I);
2570
2571 // Skip the gaps in the group.
2572 if (!Member)
2573 continue;
2574
2575 auto StrideMask =
2576 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2577 for (unsigned Part = 0; Part < UF; Part++) {
2578 Value *StridedVec = Builder.CreateShuffleVector(
2579 NewLoads[Part], StrideMask, "strided.vec");
2580
2581 // If this member has a different type, cast the result to it.
2582 if (Member->getType() != ScalarTy) {
2583 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2584 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2585 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2586 }
2587
2588 if (Group->isReverse())
2589 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2590
2591 State.set(VPDefs[J], StridedVec, Part);
2592 }
2593 ++J;
2594 }
2595 return;
2596 }
2597
2598 // The subvector type for the current instruction.
2599 auto *SubVT = VectorType::get(ScalarTy, VF);
2600
2601 // Vectorize the interleaved store group.
2602 Value *MaskForGaps =
2604 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2605 "masked interleaved groups are not allowed.");
2606 assert((!MaskForGaps || !VF.isScalable()) &&
2607 "masking gaps for scalable vectors is not yet supported.");
2608 for (unsigned Part = 0; Part < UF; Part++) {
2609 // Collect the stored vector from each member.
2610 SmallVector<Value *, 4> StoredVecs;
2611 unsigned StoredIdx = 0;
2612 for (unsigned i = 0; i < InterleaveFactor; i++) {
2613 assert((Group->getMember(i) || MaskForGaps) &&
2614 "Fail to get a member from an interleaved store group");
2615 Instruction *Member = Group->getMember(i);
2616
2617 // Skip the gaps in the group.
2618 if (!Member) {
2619 Value *Undef = PoisonValue::get(SubVT);
2620 StoredVecs.push_back(Undef);
2621 continue;
2622 }
2623
2624 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2625 ++StoredIdx;
2626
2627 if (Group->isReverse())
2628 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2629
2630 // If this member has a different type, cast it to a unified type.
2631
2632 if (StoredVec->getType() != SubVT)
2633 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2634
2635 StoredVecs.push_back(StoredVec);
2636 }
2637
2638 // Interleave all the smaller vectors into one wider vector.
2639 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2640 Instruction *NewStoreInstr;
2641 if (BlockInMask || MaskForGaps) {
2642 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2643 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2644 Group->getAlign(), GroupMask);
2645 } else
2646 NewStoreInstr =
2647 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2648
2649 Group->addMetadata(NewStoreInstr);
2650 }
2651}
2652
2654 VPReplicateRecipe *RepRecipe,
2655 const VPIteration &Instance,
2656 VPTransformState &State) {
2657 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2658
2659 // llvm.experimental.noalias.scope.decl intrinsics must not be duplicated; they
2660 // are only emitted for the first lane and part.
2661 if (isa<NoAliasScopeDeclInst>(Instr))
2662 if (!Instance.isFirstIteration())
2663 return;
2664
2665 // Does this instruction return a value?
2666 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2667
2668 Instruction *Cloned = Instr->clone();
2669 if (!IsVoidRetTy) {
2670 Cloned->setName(Instr->getName() + ".cloned");
2671#if !defined(NDEBUG)
2672 // Verify that VPlan type inference results agree with the type of the
2673 // generated values.
2674 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2675 "inferred type and type from generated instructions do not match");
2676#endif
2677 }
2678
2679 RepRecipe->setFlags(Cloned);
2680
2681 if (auto DL = Instr->getDebugLoc())
2682 State.setDebugLocFrom(DL);
2683
2684 // Replace the operands of the cloned instructions with their scalar
2685 // equivalents in the new loop.
2686 for (const auto &I : enumerate(RepRecipe->operands())) {
2687 auto InputInstance = Instance;
2688 VPValue *Operand = I.value();
2690 InputInstance.Lane = VPLane::getFirstLane();
2691 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2692 }
2693 State.addNewMetadata(Cloned, Instr);
2694
2695 // Place the cloned scalar in the new loop.
2696 State.Builder.Insert(Cloned);
2697
2698 State.set(RepRecipe, Cloned, Instance);
2699
2700 // If we just cloned a new assumption, add it to the assumption cache.
2701 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2703
2704 // End if-block.
2705 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2706 if (IfPredicateInstr)
2707 PredicatedInstructions.push_back(Cloned);
2708}
2709
2710Value *
2712 if (VectorTripCount)
2713 return VectorTripCount;
2714
2715 Value *TC = getTripCount();
2716 IRBuilder<> Builder(InsertBlock->getTerminator());
2717
2718 Type *Ty = TC->getType();
2719 // This is where we can make the step a runtime constant.
2720 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2721
2722 // If the tail is to be folded by masking, round the number of iterations N
2723 // up to a multiple of Step instead of rounding down. This is done by first
2724 // adding Step-1 and then rounding down. Note that it's ok if this addition
2725 // overflows: the vector induction variable will eventually wrap to zero given
2726 // that it starts at zero and its Step is a power of two; the loop will then
2727 // exit, with the last early-exit vector comparison also producing all-true.
2728 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2729 // is accounted for in emitIterationCountCheck that adds an overflow check.
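// Illustrative example (assumed numbers): with VF * UF = 8 and a trip count
// of 13, the count is rounded up to 16 (13 + 7, rounded down to a multiple
// of 8), so the masked vector loop runs exactly ceil(13 / 8) = 2 iterations.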
2730 if (Cost->foldTailByMasking()) {
2732 "VF*UF must be a power of 2 when folding tail by masking");
2733 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2734 TC = Builder.CreateAdd(
2735 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2736 }
2737
2738 // Now we need to generate the expression for the part of the loop that the
2739 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2740 // iterations are not required for correctness, or N - Step, otherwise. Step
2741 // is equal to the vectorization factor (number of SIMD elements) times the
2742 // unroll factor (number of SIMD instructions).
2743 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2744
2745 // There are cases where we *must* run at least one iteration in the remainder
2746 // loop. See the cost model for when this can happen. If the step evenly
2747 // divides the trip count, we set the remainder to be equal to the step. If
2748 // the step does not evenly divide the trip count, no adjustment is necessary
2749 // since there will already be scalar iterations. Note that the minimum
2750 // iterations check ensures that N >= Step.
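// Illustrative example (assumed numbers): with Step = 8 and a trip count of
// 16, the remainder would be 0; if a scalar epilogue is required it is
// bumped to a full Step, leaving 8 iterations for the vector loop and 8 for
// the scalar epilogue.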
2751 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2752 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2753 R = Builder.CreateSelect(IsZero, Step, R);
2754 }
2755
2756 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2757
2758 return VectorTripCount;
2759}
2760
2762 const DataLayout &DL) {
2763 // Verify that V is a vector type with same number of elements as DstVTy.
2764 auto *DstFVTy = cast<VectorType>(DstVTy);
2765 auto VF = DstFVTy->getElementCount();
2766 auto *SrcVecTy = cast<VectorType>(V->getType());
2767 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2768 Type *SrcElemTy = SrcVecTy->getElementType();
2769 Type *DstElemTy = DstFVTy->getElementType();
2770 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2771 "Vector elements must have same size");
2772
2773 // Do a direct cast if element types are castable.
2774 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2775 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2776 }
2777 // V cannot be directly cast to the desired vector type.
2778 // May happen when V is a floating point vector but DstVTy is a vector of
2779 // pointers or vice-versa. Handle this using a two-step bitcast using an
2780 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
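// Illustrative sketch (assuming 64-bit pointers): casting <4 x double> to
// <4 x ptr> goes through a same-width integer vector:
//   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x ptr>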
2781 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2782 "Only one type should be a pointer type");
2783 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2784 "Only one type should be a floating point type");
2785 Type *IntTy =
2786 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2787 auto *VecIntTy = VectorType::get(IntTy, VF);
2788 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2789 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2790}
2791
2793 Value *Count = getTripCount();
2794 // Reuse existing vector loop preheader for TC checks.
2795 // Note that a new preheader block is generated for the vector loop.
2796 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2797 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2798
2799 // Generate code to check if the loop's trip count is less than VF * UF, or
2800 // equal to it in case a scalar epilogue is required; this implies that the
2801 // vector trip count is zero. This check also covers the case where adding one
2802 // to the backedge-taken count overflowed leading to an incorrect trip count
2803 // of zero. In this case we will also jump to the scalar loop.
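// Illustrative example (assumed numbers): with VF * UF = 8 and no required
// scalar epilogue, a trip count of 7 branches to the scalar loop; if a
// scalar epilogue is required the comparison is ULE, so a trip count of
// exactly 8 also bypasses the vector loop, keeping at least one scalar
// iteration.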
2804 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2806
2807 // If the tail is to be folded, the vector loop takes care of all iterations.
2808 Type *CountTy = Count->getType();
2809 Value *CheckMinIters = Builder.getFalse();
2810 auto CreateStep = [&]() -> Value * {
2811 // Create step with max(MinProfTC, UF * VF).
2813 return createStepForVF(Builder, CountTy, VF, UF);
2814
2815 Value *MinProfTC =
2817 if (!VF.isScalable())
2818 return MinProfTC;
2820 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2821 };
2822
2823 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2824 if (Style == TailFoldingStyle::None)
2825 CheckMinIters =
2826 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2827 else if (VF.isScalable() &&
2830 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2831 // an overflow to zero when updating induction variables and so an
2832 // additional overflow check is required before entering the vector loop.
2833
2834 // Get the maximum unsigned value for the type.
2835 Value *MaxUIntTripCount =
2836 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2837 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2838
2839 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2840 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2841 }
2842
2843 // Create new preheader for vector loop.
2845 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2846 "vector.ph");
2847
2848 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2849 DT->getNode(Bypass)->getIDom()) &&
2850 "TC check is expected to dominate Bypass");
2851
2852 // Update dominator for Bypass & LoopExit (if needed).
2853 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2854 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2855 // If there is an epilogue which must run, there's no edge from the
2856 // middle block to exit blocks and thus no need to update the immediate
2857 // dominator of the exit blocks.
2859
2860 BranchInst &BI =
2861 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2864 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2865 LoopBypassBlocks.push_back(TCCheckBlock);
2866}
2867
2869 BasicBlock *const SCEVCheckBlock =
2870 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2871 if (!SCEVCheckBlock)
2872 return nullptr;
2873
2874 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2876 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2877 "Cannot SCEV check stride or overflow when optimizing for size");
2878
2879
2880 // Update dominator only if this is first RT check.
2881 if (LoopBypassBlocks.empty()) {
2882 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2883 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2884 // If there is an epilogue which must run, there's no edge from the
2885 // middle block to exit blocks and thus no need to update the immediate
2886 // dominator of the exit blocks.
2887 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2888 }
2889
2890 LoopBypassBlocks.push_back(SCEVCheckBlock);
2891 AddedSafetyChecks = true;
2892 return SCEVCheckBlock;
2893}
2894
2896 // VPlan-native path does not do any analysis for runtime checks currently.
2898 return nullptr;
2899
2900 BasicBlock *const MemCheckBlock =
2901 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2902
2903 // Check if we generated code that checks at runtime whether arrays overlap. We put
2904 // the checks into a separate block to make the more common case of few
2905 // elements faster.
2906 if (!MemCheckBlock)
2907 return nullptr;
2908
2909 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2910 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2911 "Cannot emit memory checks when optimizing for size, unless forced "
2912 "to vectorize.");
2913 ORE->emit([&]() {
2914 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2917 << "Code-size may be reduced by not forcing "
2918 "vectorization, or by source-code modifications "
2919 "eliminating the need for runtime checks "
2920 "(e.g., adding 'restrict').";
2921 });
2922 }
2923
2924 LoopBypassBlocks.push_back(MemCheckBlock);
2925
2926 AddedSafetyChecks = true;
2927
2928 return MemCheckBlock;
2929}
2930
2934 assert(LoopVectorPreHeader && "Invalid loop structure");
2935 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2936 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2937 "multiple exit loop without required epilogue?");
2938
2941 LI, nullptr, Twine(Prefix) + "middle.block");
2944 nullptr, Twine(Prefix) + "scalar.ph");
2945
2946 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2947
2948 // Set up the middle block terminator. Two cases:
2949 // 1) If we know that we must execute the scalar epilogue, emit an
2950 // unconditional branch.
2951 // 2) Otherwise, we must have a single unique exit block (due to how we
2952 // implement the multiple exit case). In this case, set up a conditional
2953 // branch from the middle block to the loop scalar preheader, and the
2954 // exit block. completeLoopSkeleton will update the condition to use an
2955 // iteration check, if required to decide whether to execute the remainder.
2956 BranchInst *BrInst =
2957 Cost->requiresScalarEpilogue(VF.isVector())
2960 Builder.getTrue());
2961 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2963
2964 // Update dominator for loop exit. During skeleton creation, only the vector
2965 // pre-header and the middle block are created. The vector loop is entirely
2966 // created during VPlan execution.
2967 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2968 // If there is an epilogue which must run, there's no edge from the
2969 // middle block to exit blocks and thus no need to update the immediate
2970 // dominator of the exit blocks.
2972}
2973
2975 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2976 ArrayRef<BasicBlock *> BypassBlocks,
2977 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2979 assert(VectorTripCount && "Expected valid arguments");
2980
2981 Instruction *OldInduction = Legal->getPrimaryInduction();
2982 Value *&EndValue = IVEndValues[OrigPhi];
2983 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2984 if (OrigPhi == OldInduction) {
2985 // We know what the end value is.
2986 EndValue = VectorTripCount;
2987 } else {
2989
2990 // Fast-math-flags propagate from the original induction instruction.
2991 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2992 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2993
2995 Step, II.getKind(), II.getInductionBinOp());
2996 EndValue->setName("ind.end");
2997
2998 // Compute the end value for the additional bypass (if applicable).
2999 if (AdditionalBypass.first) {
3000 B.SetInsertPoint(AdditionalBypass.first,
3001 AdditionalBypass.first->getFirstInsertionPt());
3002 EndValueFromAdditionalBypass =
3003 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3004 Step, II.getKind(), II.getInductionBinOp());
3005 EndValueFromAdditionalBypass->setName("ind.end");
3006 }
3007 }
3008
3009 // Create phi nodes to merge from the backedge-taken check block.
3010 PHINode *BCResumeVal =
3011 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3013 // Copy original phi DL over to the new one.
3014 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3015
3016 // The new PHI merges the original incoming value, in case of a bypass,
3017 // or the value at the end of the vectorized loop.
3018 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3019
3020 // Fix the scalar body counter (PHI node).
3021 // The old induction's phi node in the scalar body needs the truncated
3022 // value.
3023 for (BasicBlock *BB : BypassBlocks)
3024 BCResumeVal->addIncoming(II.getStartValue(), BB);
3025
3026 if (AdditionalBypass.first)
3027 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3028 EndValueFromAdditionalBypass);
3029 return BCResumeVal;
3030}
3031
3032/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3033/// expansion results.
3035 const SCEV2ValueTy &ExpandedSCEVs) {
3036 const SCEV *Step = ID.getStep();
3037 if (auto *C = dyn_cast<SCEVConstant>(Step))
3038 return C->getValue();
3039 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3040 return U->getValue();
3041 auto I = ExpandedSCEVs.find(Step);
3042 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3043 return I->second;
3044}
3045
3047 const SCEV2ValueTy &ExpandedSCEVs,
3048 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3049 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3050 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3051 "Inconsistent information about additional bypass.");
3052 // We are going to resume the execution of the scalar loop.
3053 // Go over all of the induction variables that we found and fix the
3054 // PHIs that are left in the scalar version of the loop.
3055 // The starting values of PHI nodes depend on the counter of the last
3056 // iteration in the vectorized loop.
3057 // If we come from a bypass edge then we need to start from the original
3058 // start value.
3059 for (const auto &InductionEntry : Legal->getInductionVars()) {
3060 PHINode *OrigPhi = InductionEntry.first;
3061 const InductionDescriptor &II = InductionEntry.second;
3062 PHINode *BCResumeVal = createInductionResumeValue(
3063 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3064 AdditionalBypass);
3065 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3066 }
3067}
3068
3070 // The trip counts should be cached by now.
3071 Value *Count = getTripCount();
3073
3074 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3075
3076 // Add a check in the middle block to see if we have completed
3077 // all of the iterations in the first vector loop. Three cases:
3078 // 1) If we require a scalar epilogue, there is no conditional branch as
3079 // we unconditionally branch to the scalar preheader. Do nothing.
3080 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3081 // Thus if tail is to be folded, we know we don't need to run the
3082 // remainder and we can use the previous value for the condition (true).
3083 // 3) Otherwise, construct a runtime check.
3084 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3085 !Cost->foldTailByMasking()) {
3086 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3087 // of the corresponding compare because they may have ended up with
3088 // different line numbers and we want to avoid awkward line stepping while
3089 // debugging, e.g. if the compare has a line number inside the loop.
3090 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3091 // operands. Perform simplification directly on VPlan once the branch is
3092 // modeled there.
3094 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3095 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3096 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3097 BI.setCondition(CmpN);
3098 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3099 // Assume that `Count % VectorTripCount` is equally distributed.
3100 unsigned TripCount = UF * VF.getKnownMinValue();
3101 assert(TripCount > 0 && "trip count should not be zero");
3102 const uint32_t Weights[] = {1, TripCount - 1};
3103 setBranchWeights(BI, Weights);
3104 }
3105 }
3106
3107#ifdef EXPENSIVE_CHECKS
3108 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3109#endif
3110
3111 return LoopVectorPreHeader;
3112}
3113
3114std::pair<BasicBlock *, Value *>
3116 const SCEV2ValueTy &ExpandedSCEVs) {
3117 /*
3118 In this function we generate a new loop. The new loop will contain
3119 the vectorized instructions while the old loop will continue to run the
3120 scalar remainder.
3121
3122 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3123 / | preheader are expanded here. Eventually all required SCEV
3124 / | expansion should happen here.
3125 / v
3126 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3127 | / |
3128 | / v
3129 || [ ] <-- vector pre header.
3130 |/ |
3131 | v
3132 | [ ] \
3133 | [ ]_| <-- vector loop (created during VPlan execution).
3134 | |
3135 | v
3136 \ -[ ] <--- middle-block.
3137 \/ |
3138 /\ v
3139 | ->[ ] <--- new preheader.
3140 | |
3141 (opt) v <-- edge from middle to exit iff epilogue is not required.
3142 | [ ] \
3143 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3144 \ |
3145 \ v
3146 >[ ] <-- exit block(s).
3147 ...
3148 */
3149
3150 // Create an empty vector loop, and prepare basic blocks for the runtime
3151 // checks.
3153
3154 // Now, compare the new count to zero. If it is zero skip the vector loop and
3155 // jump to the scalar loop. This check also covers the case where the
3156 // backedge-taken count is uint##_max: adding one to it will overflow leading
3157 // to an incorrect trip count of zero. In this (rare) case we will also jump
3158 // to the scalar loop.
3160
3161 // Generate the code to check any assumptions that we've made for SCEV
3162 // expressions.
3164
3165 // Generate the code that checks in runtime if arrays overlap. We put the
3166 // checks into a separate block to make the more common case of few elements
3167 // faster.
3169
3170 // Emit phis for the new starting index of the scalar loop.
3171 createInductionResumeValues(ExpandedSCEVs);
3172
3173 return {completeLoopSkeleton(), nullptr};
3174}
3175
3176// Fix up external users of the induction variable. At this point, we are
3177// in LCSSA form, with all external PHIs that use the IV having one input value,
3178// coming from the remainder loop. We need those PHIs to also have a correct
3179// value for the IV when arriving directly from the middle block.
3181 const InductionDescriptor &II,
3182 Value *VectorTripCount, Value *EndValue,
3183 BasicBlock *MiddleBlock,
3184 BasicBlock *VectorHeader, VPlan &Plan,
3185 VPTransformState &State) {
3186 // There are two kinds of external IV usages - those that use the value
3187 // computed in the last iteration (the PHI) and those that use the penultimate
3188 // value (the value that feeds into the phi from the loop latch).
3189 // We allow both, but they, obviously, have different values.
3190
3191 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3192
3193 DenseMap<Value *, Value *> MissingVals;
3194
3195 // An external user of the last iteration's value should see the value that
3196 // the remainder loop uses to initialize its own IV.
3198 for (User *U : PostInc->users()) {
3199 Instruction *UI = cast<Instruction>(U);
3200 if (!OrigLoop->contains(UI)) {
3201 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3202 MissingVals[UI] = EndValue;
3203 }
3204 }
3205
3206 // An external user of the penultimate value needs to see EndValue - Step.
3207 // The simplest way to get this is to recompute it from the constituent SCEVs,
3208 // that is Start + (Step * (CRD - 1)).
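// For example (illustrative numbers only): an induction with Start = 0 and
// Step = 2 and a vector trip count of 8 has EndValue = 0 + 2 * 8 = 16 for
// users of the post-increment value, while a user of the phi itself must see
// 0 + 2 * (8 - 1) = 14 when arriving from the middle block.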
3209 for (User *U : OrigPhi->users()) {
3210 auto *UI = cast<Instruction>(U);
3211 if (!OrigLoop->contains(UI)) {
3212 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3213 IRBuilder<> B(MiddleBlock->getTerminator());
3214
3215 // Fast-math-flags propagate from the original induction instruction.
3216 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3217 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3218
3219 Value *CountMinusOne = B.CreateSub(
3220 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3221 CountMinusOne->setName("cmo");
3222
3223 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3224 assert(StepVPV && "step must have been expanded during VPlan execution");
3225 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3226 : State.get(StepVPV, {0, 0});
3227 Value *Escape =
3228 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3229 II.getKind(), II.getInductionBinOp());
3230 Escape->setName("ind.escape");
3231 MissingVals[UI] = Escape;
3232 }
3233 }
3234
3235 for (auto &I : MissingVals) {
3236 PHINode *PHI = cast<PHINode>(I.first);
3237 // One corner case we have to handle is two IVs "chasing" each other,
3238 // that is %IV2 = phi [...], [ %IV1, %latch ]
3239 // In this case, if IV1 has an external use, we need to avoid adding both
3240 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3241 // don't already have an incoming value for the middle block.
3242 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3243 PHI->addIncoming(I.second, MiddleBlock);
3244 Plan.removeLiveOut(PHI);
3245 }
3246 }
3247}
3248
3249namespace {
3250
3251struct CSEDenseMapInfo {
3252 static bool canHandle(const Instruction *I) {
3253 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3254 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3255 }
3256
3257 static inline Instruction *getEmptyKey() {
3259 }
3260
3261 static inline Instruction *getTombstoneKey() {
3263 }
3264
3265 static unsigned getHashValue(const Instruction *I) {
3266 assert(canHandle(I) && "Unknown instruction!");
3267 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3268 I->value_op_end()));
3269 }
3270
3271 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3272 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3273 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3274 return LHS == RHS;
3275 return LHS->isIdenticalTo(RHS);
3276 }
3277};
3278
3279} // end anonymous namespace
3280
3281 /// Perform CSE of induction variable instructions.
3282 static void cse(BasicBlock *BB) {
3283 // Perform simple CSE.
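// For example (illustrative IR, not taken from a specific test), two
// identical address computations in the vector loop header,
//   %gep.a = getelementptr inbounds i32, ptr %base, i64 %idx
//   %gep.b = getelementptr inbounds i32, ptr %base, i64 %idx
// hash to the same key (same opcode and operands), so all uses of %gep.b are
// rewritten to %gep.a and %gep.b is erased.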
3285 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3286 if (!CSEDenseMapInfo::canHandle(&In))
3287 continue;
3288
3289 // Check if we can replace this instruction with any of the
3290 // visited instructions.
3291 if (Instruction *V = CSEMap.lookup(&In)) {
3292 In.replaceAllUsesWith(V);
3293 In.eraseFromParent();
3294 continue;
3295 }
3296
3297 CSEMap[&In] = &In;
3298 }
3299}
3300
3303 ElementCount VF) const {
3304 // We only need to calculate a cost if the VF is scalar; for actual vectors
3305 // we should already have a pre-calculated cost at each VF.
3306 if (!VF.isScalar())
3307 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3308
3310 Type *RetTy = CI->getType();
3312 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3313 return *RedCost;
3314
3316 for (auto &ArgOp : CI->args())
3317 Tys.push_back(ArgOp->getType());
3318
3319 InstructionCost ScalarCallCost =
3321
3322 // If this is an intrinsic we may have a lower cost for it.
3324 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3325 return std::min(ScalarCallCost, IntrinsicCost);
3326 }
3327 return ScalarCallCost;
3328}
3329
3331 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3332 return Elt;
3333 return VectorType::get(Elt, VF);
3334}
3335
3338 ElementCount VF) const {
3340 assert(ID && "Expected intrinsic call!");
3341 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3342 FastMathFlags FMF;
3343 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3344 FMF = FPMO->getFastMathFlags();
3345
3348 SmallVector<Type *> ParamTys;
3349 std::transform(FTy->param_begin(), FTy->param_end(),
3350 std::back_inserter(ParamTys),
3351 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3352
3353 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3354 dyn_cast<IntrinsicInst>(CI));
3355 return TTI.getIntrinsicInstrCost(CostAttrs,
3357}
3358
3360 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3361 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3362 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3363}
3364
3366 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3367 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3368 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3369}
3370
3372 VPlan &Plan) {
3373 // Fix widened non-induction PHIs by setting up the PHI operands.
3375 fixNonInductionPHIs(Plan, State);
3376
3377 // At this point every instruction in the original loop is widened to a
3378 // vector form. Now we need to fix the recurrences in the loop. These PHI
3379 // nodes are currently empty because we did not want to introduce cycles.
3380 // This is the second stage of vectorizing recurrences. Note that fixing
3381 // reduction phis are already modeled in VPlan.
3382 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3383 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3384 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3385 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3386 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3387 fixFixedOrderRecurrence(FOR, State);
3388 }
3389
3390 // Forget the original basic block.
3393
3394 // After vectorization, the exit blocks of the original loop will have
3395 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3396 // looked through single-entry phis.
3397 SmallVector<BasicBlock *> ExitBlocks;
3398 OrigLoop->getExitBlocks(ExitBlocks);
3399 for (BasicBlock *Exit : ExitBlocks)
3400 for (PHINode &PN : Exit->phis())
3402
3403 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3404 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3405 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3406 // No edge from the middle block to the unique exit block has been inserted
3407 // and there is nothing to fix from vector loop; phis should have incoming
3408 // from scalar loop only.
3409 } else {
3410 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3411 // the cost model.
3412
3413 // If we inserted an edge from the middle block to the unique exit block,
3414 // update uses outside the loop (phis) to account for the newly inserted
3415 // edge.
3416
3417 // Fix-up external users of the induction variables.
3418 for (const auto &Entry : Legal->getInductionVars())
3419 fixupIVUsers(Entry.first, Entry.second,
3421 IVEndValues[Entry.first], LoopMiddleBlock,
3422 VectorLoop->getHeader(), Plan, State);
3423 }
3424
3425 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3426 // in the exit block, so update the builder.
3427 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3428 State.CFG.ExitBB->getFirstNonPHIIt());
3429 for (const auto &KV : Plan.getLiveOuts())
3430 KV.second->fixPhi(Plan, State);
3431
3433 sinkScalarOperands(&*PI);
3434
3435 // Remove redundant induction instructions.
3436 cse(VectorLoop->getHeader());
3437
3438 // Set/update profile weights for the vector and remainder loops as original
3439 // loop iterations are now distributed among them. Note that original loop
3440 // represented by LoopScalarBody becomes remainder loop after vectorization.
3441 //
3442 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3443 // end up with a slightly roughened result, but that should be OK since the
3444 // profile is not inherently precise anyway. Note also that a possible bypass of
3445 // vector code caused by legality checks is ignored, assigning all the weight
3446 // to the vector loop, optimistically.
3447 //
3448 // For scalable vectorization we can't know at compile time how many iterations
3449 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3450 // vscale of '1'.
3453 VF.getKnownMinValue() * UF);
3454}
3455
3458 // This is the second phase of vectorizing first-order recurrences. An
3459 // overview of the transformation is described below. Suppose we have the
3460 // following loop.
3461 //
3462 // for (int i = 0; i < n; ++i)
3463 // b[i] = a[i] - a[i - 1];
3464 //
3465 // There is a first-order recurrence on "a". For this loop, the shorthand
3466 // scalar IR looks like:
3467 //
3468 // scalar.ph:
3469 // s_init = a[-1]
3470 // br scalar.body
3471 //
3472 // scalar.body:
3473 // i = phi [0, scalar.ph], [i+1, scalar.body]
3474 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3475 // s2 = a[i]
3476 // b[i] = s2 - s1
3477 // br cond, scalar.body, ...
3478 //
3479 // In this example, s1 is a recurrence because its value depends on the
3480 // previous iteration. In the first phase of vectorization, we created a
3481 // vector phi v1 for s1. We now complete the vectorization and produce the
3482 // shorthand vector IR shown below (for VF = 4, UF = 1).
3483 //
3484 // vector.ph:
3485 // v_init = vector(..., ..., ..., a[-1])
3486 // br vector.body
3487 //
3488 // vector.body
3489 // i = phi [0, vector.ph], [i+4, vector.body]
3490 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3491 // v2 = a[i, i+1, i+2, i+3];
3492 // v3 = vector(v1(3), v2(0, 1, 2))
3493 // b[i, i+1, i+2, i+3] = v2 - v3
3494 // br cond, vector.body, middle.block
3495 //
3496 // middle.block:
3497 // x = v2(3)
3498 // br scalar.ph
3499 //
3500 // scalar.ph:
3501 // s_init = phi [x, middle.block], [a[-1], otherwise]
3502 // br scalar.body
3503 //
3504 // After the vector loop completes execution, we extract the next value of
3505 // the recurrence (x) to use as the initial value in the scalar loop.
3506
3507 // Extract the last vector element in the middle block. This will be the
3508 // initial value for the recurrence when jumping to the scalar loop.
3509 VPValue *PreviousDef = PhiR->getBackedgeValue();
3510 Value *Incoming = State.get(PreviousDef, UF - 1);
3511 auto *ExtractForScalar = Incoming;
3512 auto *IdxTy = Builder.getInt32Ty();
3513 Value *RuntimeVF = nullptr;
3514 if (VF.isVector()) {
3515 auto *One = ConstantInt::get(IdxTy, 1);
3517 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3518 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3519 ExtractForScalar =
3520 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3521 }
3522
3523 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3524 assert(PhiR->getNumUsers() == 1 &&
3525 RecurSplice->getOpcode() ==
3527 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3528 SmallVector<VPLiveOut *> LiveOuts;
3529 for (VPUser *U : RecurSplice->users())
3530 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3531 LiveOuts.push_back(LiveOut);
3532
3533 if (!LiveOuts.empty()) {
3534 // Extract the second last element in the middle block if the
3535 // Phi is used outside the loop. We need to extract the phi itself
3536 // and not the last element (the phi update in the current iteration). This
3537 // will be the value when jumping to the exit block from the
3538 // LoopMiddleBlock, when the scalar loop is not run at all.
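// For example, with VF = 4 and Incoming = <s1, s2, s3, s4>, the scalar loop
// resumes with s4 (the last element), whereas a use of the phi outside the
// loop must see s3 (the second last element), because the phi holds the value
// from the previous iteration.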
3539 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3540 if (VF.isVector()) {
3541 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3542 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3543 Incoming, Idx, "vector.recur.extract.for.phi");
3544 } else {
3545 assert(UF > 1 && "VF and UF cannot both be 1");
3546 // When the loop is unrolled without vectorizing, initialize
3547 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3548 // value of `Incoming`. This is analogous to the vectorized case above:
3549 // extracting the second last element when VF > 1.
3550 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3551 }
3552
3553 for (VPLiveOut *LiveOut : LiveOuts) {
3554 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3555 PHINode *LCSSAPhi = LiveOut->getPhi();
3556 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3557 State.Plan->removeLiveOut(LCSSAPhi);
3558 }
3559 }
3560
3561 // Fix the initial value of the original recurrence in the scalar loop.
3563 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3564 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3565 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3566 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3567 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3568 Start->addIncoming(Incoming, BB);
3569 }
3570
3571 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3572 Phi->setName("scalar.recur");
3573}
3574
3576 // The basic block and loop containing the predicated instruction.
3577 auto *PredBB = PredInst->getParent();
3578 auto *VectorLoop = LI->getLoopFor(PredBB);
3579
3580 // Initialize a worklist with the operands of the predicated instruction.
3581 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3582
3583 // Holds instructions that we need to analyze again. An instruction may be
3584 // reanalyzed if we don't yet know if we can sink it or not.
3585 SmallVector<Instruction *, 8> InstsToReanalyze;
3586
3587 // Returns true if a given use occurs in the predicated block. Phi nodes use
3588 // their operands in their corresponding predecessor blocks.
3589 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3590 auto *I = cast<Instruction>(U.getUser());
3591 BasicBlock *BB = I->getParent();
3592 if (auto *Phi = dyn_cast<PHINode>(I))
3593 BB = Phi->getIncomingBlock(
3594 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3595 return BB == PredBB;
3596 };
3597
3598 // Iteratively sink the scalarized operands of the predicated instruction
3599 // into the block we created for it. When an instruction is sunk, its
3600 // operands are then added to the worklist. The algorithm ends after one pass
3601 // through the worklist doesn't sink a single instruction.
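// For example, if the predicated block contains a scalarized load whose
// address is computed by a getelementptr used only by that load, the
// getelementptr is sunk on the first pass, and its own operands are then
// reconsidered on the next pass.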
3602 bool Changed;
3603 do {
3604 // Add the instructions that need to be reanalyzed to the worklist, and
3605 // reset the changed indicator.
3606 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3607 InstsToReanalyze.clear();
3608 Changed = false;
3609
3610 while (!Worklist.empty()) {
3611 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3612
3613 // We can't sink an instruction if it is a phi node, is not in the loop,
3614 // may have side effects or may read from memory.
3615 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3616 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3617 I->mayHaveSideEffects() || I->mayReadFromMemory())
3618 continue;
3619
3620 // If the instruction is already in PredBB, check if we can sink its
3621 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3622 // sinking the scalar instruction I, hence it appears in PredBB; but it
3623 // may have failed to sink I's operands (recursively), which we try
3624 // (again) here.
3625 if (I->getParent() == PredBB) {
3626 Worklist.insert(I->op_begin(), I->op_end());
3627 continue;
3628 }
3629
3630 // It's legal to sink the instruction if all its uses occur in the
3631 // predicated block. Otherwise, there's nothing to do yet, and we may
3632 // need to reanalyze the instruction.
3633 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3634 InstsToReanalyze.push_back(I);
3635 continue;
3636 }
3637
3638 // Move the instruction to the beginning of the predicated block, and add
3639 // its operands to the worklist.
3640 I->moveBefore(&*PredBB->getFirstInsertionPt());
3641 Worklist.insert(I->op_begin(), I->op_end());
3642
3643 // The sinking may have enabled other instructions to be sunk, so we will
3644 // need to iterate.
3645 Changed = true;
3646 }
3647 } while (Changed);
3648}
3649
3651 VPTransformState &State) {
3652 auto Iter = vp_depth_first_deep(Plan.getEntry());
3653 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3654 for (VPRecipeBase &P : VPBB->phis()) {
3655 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3656 if (!VPPhi)
3657 continue;
3658 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3659 // Make sure the builder has a valid insert point.
3660 Builder.SetInsertPoint(NewPhi);
3661 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3662 VPValue *Inc = VPPhi->getIncomingValue(i);
3663 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3664 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3665 }
3666 }
3667 }
3668}
3669
3671 const RecurrenceDescriptor &RdxDesc) {
3672 return Cost->useOrderedReductions(RdxDesc);
3673}
3674
3675void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3676 // We should not collect Scalars more than once per VF. Right now, this
3677 // function is called from collectUniformsAndScalars(), which already does
3678 // this check. Collecting Scalars for VF=1 does not make any sense.
3679 assert(VF.isVector() && !Scalars.contains(VF) &&
3680 "This function should not be visited twice for the same VF");
3681
3682 // This avoids any chances of creating a REPLICATE recipe during planning
3683 // since that would result in generation of scalarized code during execution,
3684 // which is not supported for scalable vectors.
3685 if (VF.isScalable()) {
3686 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3687 return;
3688 }
3689
3691
3692 // These sets are used to seed the analysis with pointers used by memory
3693 // accesses that will remain scalar.
3695 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3696 auto *Latch = TheLoop->getLoopLatch();
3697
3698 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3699 // The pointer operands of loads and stores will be scalar as long as the
3700 // memory access is not a gather or scatter operation. The value operand of a
3701 // store will remain scalar if the store is scalarized.
3702 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3703 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3704 assert(WideningDecision != CM_Unknown &&
3705 "Widening decision should be ready at this moment");
3706 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3707 if (Ptr == Store->getValueOperand())
3708 return WideningDecision == CM_Scalarize;
3709 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3710 "Ptr is neither a value or pointer operand");
3711 return WideningDecision != CM_GatherScatter;
3712 };
3713
3714 // A helper that returns true if the given value is a bitcast or
3715 // getelementptr instruction contained in the loop.
3716 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3717 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3718 isa<GetElementPtrInst>(V)) &&
3720 };
3721
3722 // A helper that evaluates a memory access's use of a pointer. If the use will
3723 // be a scalar use and the pointer is only used by memory accesses, we place
3724 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3725 // PossibleNonScalarPtrs.
3726 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3727 // We only care about bitcast and getelementptr instructions contained in
3728 // the loop.
3729 if (!isLoopVaryingBitCastOrGEP(Ptr))
3730 return;
3731
3732 // If the pointer has already been identified as scalar (e.g., if it was
3733 // also identified as uniform), there's nothing to do.
3734 auto *I = cast<Instruction>(Ptr);
3735 if (Worklist.count(I))
3736 return;
3737
3738 // If the use of the pointer will be a scalar use, and all users of the
3739 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3740 // place the pointer in PossibleNonScalarPtrs.
3741 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3742 return isa<LoadInst>(U) || isa<StoreInst>(U);
3743 }))
3744 ScalarPtrs.insert(I);
3745 else
3746 PossibleNonScalarPtrs.insert(I);
3747 };
3748
3749 // We seed the scalars analysis with two classes of instructions: (1)
3750 // instructions marked uniform-after-vectorization and (2) bitcast,
3751 // getelementptr and (pointer) phi instructions used by memory accesses
3752 // requiring a scalar use.
3753 //
3754 // (1) Add to the worklist all instructions that have been identified as
3755 // uniform-after-vectorization.
3756 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3757
3758 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3759 // memory accesses requiring a scalar use. The pointer operands of loads and
3760 // stores will be scalar as long as the memory accesses is not a gather or
3761 // scatter operation. The value operand of a store will remain scalar if the
3762 // store is scalarized.
3763 for (auto *BB : TheLoop->blocks())
3764 for (auto &I : *BB) {
3765 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3766 evaluatePtrUse(Load, Load->getPointerOperand());
3767 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3768 evaluatePtrUse(Store, Store->getPointerOperand());
3769 evaluatePtrUse(Store, Store->getValueOperand());
3770 }
3771 }
3772 for (auto *I : ScalarPtrs)
3773 if (!PossibleNonScalarPtrs.count(I)) {
3774 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3775 Worklist.insert(I);
3776 }
3777
3778 // Insert the forced scalars.
3779 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3780 // induction variable when the PHI user is scalarized.
3781 auto ForcedScalar = ForcedScalars.find(VF);
3782 if (ForcedScalar != ForcedScalars.end())
3783 for (auto *I : ForcedScalar->second) {
3784 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3785 Worklist.insert(I);
3786 }
3787
3788 // Expand the worklist by looking through any bitcasts and getelementptr
3789 // instructions we've already identified as scalar. This is similar to the
3790 // expansion step in collectLoopUniforms(); however, here we're only
3791 // expanding to include additional bitcasts and getelementptr instructions.
3792 unsigned Idx = 0;
3793 while (Idx != Worklist.size()) {
3794 Instruction *Dst = Worklist[Idx++];
3795 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3796 continue;
3797 auto *Src = cast<Instruction>(Dst->getOperand(0));
3798 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3799 auto *J = cast<Instruction>(U);
3800 return !TheLoop->contains(J) || Worklist.count(J) ||
3801 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3802 isScalarUse(J, Src));
3803 })) {
3804 Worklist.insert(Src);
3805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3806 }
3807 }
3808
3809 // An induction variable will remain scalar if all users of the induction
3810 // variable and induction variable update remain scalar.
3811 for (const auto &Induction : Legal->getInductionVars()) {
3812 auto *Ind = Induction.first;
3813 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3814
3815 // If tail-folding is applied, the primary induction variable will be used
3816 // to feed a vector compare.
3817 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3818 continue;
3819
3820 // Returns true if \p Indvar is a pointer induction that is used directly by
3821 // load/store instruction \p I.
3822 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3823 Instruction *I) {
3824 return Induction.second.getKind() ==
3826 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3827 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3828 };
3829
3830 // Determine if all users of the induction variable are scalar after
3831 // vectorization.
3832 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3833 auto *I = cast<Instruction>(U);
3834 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3835 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3836 });
3837 if (!ScalarInd)
3838 continue;
3839
3840 // Determine if all users of the induction variable update instruction are
3841 // scalar after vectorization.
3842 auto ScalarIndUpdate =
3843 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3844 auto *I = cast<Instruction>(U);
3845 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3846 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3847 });
3848 if (!ScalarIndUpdate)
3849 continue;
3850
3851 // The induction variable and its update instruction will remain scalar.
3852 Worklist.insert(Ind);
3853 Worklist.insert(IndUpdate);
3854 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3855 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3856 << "\n");
3857 }
3858
3859 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3860}
3861
3863 Instruction *I, ElementCount VF) const {
3864 if (!isPredicatedInst(I))
3865 return false;
3866
3867 // Do we have a non-scalar lowering for this predicated
3868 // instruction? No - it is scalar with predication.
3869 switch(I->getOpcode()) {
3870 default:
3871 return true;
3872 case Instruction::Call:
3873 if (VF.isScalar())
3874 return true;
3875 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3876 .Kind == CM_Scalarize;
3877 case Instruction::Load:
3878 case Instruction::Store: {
3880 auto *Ty = getLoadStoreType(I);
3881 Type *VTy = Ty;
3882 if (VF.isVector())
3883 VTy = VectorType::get(Ty, VF);
3884 const Align Alignment = getLoadStoreAlignment(I);
3885 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3886 TTI.isLegalMaskedGather(VTy, Alignment))
3887 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3888 TTI.isLegalMaskedScatter(VTy, Alignment));
3889 }
3890 case Instruction::UDiv:
3891 case Instruction::SDiv:
3892 case Instruction::SRem:
3893 case Instruction::URem: {
3894 // We have the option to use the safe-divisor idiom to avoid predication.
3895 // The cost based decision here will always select safe-divisor for
3896 // scalable vectors as scalarization isn't legal.
3897 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3898 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3899 }
3900 }
3901}
3902
3904 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3905 return false;
3906
3907 // Can we prove this instruction is safe to unconditionally execute?
3908 // If not, we must use some form of predication.
3909 switch(I->getOpcode()) {
3910 default:
3911 return false;
3912 case Instruction::Load:
3913 case Instruction::Store: {
3914 if (!Legal->isMaskRequired(I))
3915 return false;
3916 // When we know the load's address is loop invariant and the instruction
3917 // in the original scalar loop was unconditionally executed then we
3918 // don't need to mark it as a predicated instruction. Tail folding may
3919 // introduce additional predication, but we're guaranteed to always have
3920 // at least one active lane. We call Legal->blockNeedsPredication here
3921 // because it doesn't query tail-folding. For stores, we need to prove
3922 // both speculation safety (which follows from the same argument as loads)
3923 // and that the value being stored is correct. The easiest form
3924 // of the latter is to require that all values stored are the same.
3926 (isa<LoadInst>(I) ||
3927 (isa<StoreInst>(I) &&
3928 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3929 !Legal->blockNeedsPredication(I->getParent()))
3930 return false;
3931 return true;
3932 }
3933 case Instruction::UDiv:
3934 case Instruction::SDiv:
3935 case Instruction::SRem:
3936 case Instruction::URem:
3937 // TODO: We can use the loop-preheader as context point here and get
3938 // context sensitive reasoning
3940 case Instruction::Call:
3941 return Legal->isMaskRequired(I);
3942 }
3943}
3944
3945std::pair<InstructionCost, InstructionCost>
3947 ElementCount VF) const {
3948 assert(I->getOpcode() == Instruction::UDiv ||
3949 I->getOpcode() == Instruction::SDiv ||
3950 I->getOpcode() == Instruction::SRem ||
3951 I->getOpcode() == Instruction::URem);
3953
3955
3956 // Scalarization isn't legal for scalable vector types
3957 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3958 if (!VF.isScalable()) {
3959 // Get the scalarization cost and scale this amount by the probability of
3960 // executing the predicated block. If the instruction is not predicated,
3961 // we fall through to the next case.
3962 ScalarizationCost = 0;
3963
3964 // These instructions have a non-void type, so account for the phi nodes
3965 // that we will create. This cost is likely to be zero. The phi node
3966 // cost, if any, should be scaled by the block probability because it
3967 // models a copy at the end of each predicated block.
3968 ScalarizationCost += VF.getKnownMinValue() *
3969 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3970
3971 // The cost of the non-predicated instruction.
3972 ScalarizationCost += VF.getKnownMinValue() *
3973 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3974
3975 // The cost of insertelement and extractelement instructions needed for
3976 // scalarization.
3977 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3978
3979 // Scale the cost by the probability of executing the predicated blocks.
3980 // This assumes the predicated block for each vector lane is equally
3981 // likely.
3982 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
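// As an illustration (actual costs are target dependent): with VF = 4, a
// scalar divide cost of 20, and negligible phi and insert/extract overhead,
// the scalarization cost is roughly 4 * 20 = 80, scaled down by the assumed
// block probability (getReciprocalPredBlockProb()) to about 40.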
3983 }
3984 InstructionCost SafeDivisorCost = 0;
3985
3986 auto *VecTy = ToVectorTy(I->getType(), VF);
3987
3988 // The cost of the select guard to ensure all lanes are well defined
3989 // after we speculate above any internal control flow.
3990 SafeDivisorCost += TTI.getCmpSelInstrCost(
3991 Instruction::Select, VecTy,
3992 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3994
3995 // Certain instructions can be cheaper to vectorize if they have a constant
3996 // second vector operand. One example of this are shifts on x86.
3997 Value *Op2 = I->getOperand(1);
3998 auto Op2Info = TTI.getOperandInfo(Op2);
3999 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4000 Legal->isInvariant(Op2))
4002
4003 SmallVector<const Value *, 4> Operands(I->operand_values());
4004 SafeDivisorCost += TTI.getArithmeticInstrCost(
4005 I->getOpcode(), VecTy, CostKind,
4006 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4007 Op2Info, Operands, I);
4008 return {ScalarizationCost, SafeDivisorCost};
4009}
4010
4012 Instruction *I, ElementCount VF) {
4013 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4015 "Decision should not be set yet.");
4016 auto *Group = getInterleavedAccessGroup(I);
4017 assert(Group && "Must have a group.");
4018
4019 // If the instruction's allocated size doesn't equal its type size, it
4020 // requires padding and will be scalarized.
4021 auto &DL = I->getModule()->getDataLayout();
4022 auto *ScalarTy = getLoadStoreType(I);
4023 if (hasIrregularType(ScalarTy, DL))
4024 return false;
4025
4026 // If the group involves a non-integral pointer, we may not be able to
4027 // losslessly cast all values to a common type.
4028 unsigned InterleaveFactor = Group->getFactor();
4029 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4030 for (unsigned i = 0; i < InterleaveFactor; i++) {
4031 Instruction *Member = Group->getMember(i);
4032 if (!Member)
4033 continue;
4034 auto *MemberTy = getLoadStoreType(Member);
4035 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4036 // Don't coerce non-integral pointers to integers or vice versa.
4037 if (MemberNI != ScalarNI) {
4038 // TODO: Consider adding special nullptr value case here
4039 return false;
4040 } else if (MemberNI && ScalarNI &&
4041 ScalarTy->getPointerAddressSpace() !=
4042 MemberTy->getPointerAddressSpace()) {
4043 return false;
4044 }
4045 }
4046
4047 // Check if masking is required.
4048 // A Group may need masking for one of two reasons: it resides in a block that
4049 // needs predication, or it was decided to use masking to deal with gaps
4050 // (either a gap at the end of a load-access that may result in a speculative
4051 // load, or any gaps in a store-access).
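// For example (illustrative), a load group accessing A[3*i] and A[3*i+1]
// with factor 3 has a gap in its last member; widening it loads whole
// 3-element chunks, so the final vector iteration may speculatively read past
// the last accessed element unless the gap is masked or a scalar epilogue is
// used.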
4052 bool PredicatedAccessRequiresMasking =
4053 blockNeedsPredicationForAnyReason(I->getParent()) &&
4055 bool LoadAccessWithGapsRequiresEpilogMasking =
4056 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4058 bool StoreAccessWithGapsRequiresMasking =
4059 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4060 if (!PredicatedAccessRequiresMasking &&
4061 !LoadAccessWithGapsRequiresEpilogMasking &&
4062 !StoreAccessWithGapsRequiresMasking)
4063 return true;
4064
4065 // If masked interleaving is required, we expect that the user/target had
4066 // enabled it, because otherwise it either wouldn't have been created or
4067 // it should have been invalidated by the CostModel.
4069 "Masked interleave-groups for predicated accesses are not enabled.");
4070
4071 if (Group->isReverse())
4072 return false;
4073
4074 auto *Ty = getLoadStoreType(I);
4075 const Align Alignment = getLoadStoreAlignment(I);
4076 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4077 : TTI.isLegalMaskedStore(Ty, Alignment);
4078}
4079
4081 Instruction *I, ElementCount VF) {
4082 // Get and ensure we have a valid memory instruction.
4083 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4084
4086 auto *ScalarTy = getLoadStoreType(I);
4087
4088 // In order to be widened, the pointer should be consecutive, first of all.
4089 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4090 return false;
4091
4092 // If the instruction is a store located in a predicated block, it will be
4093 // scalarized.
4094 if (isScalarWithPredication(I, VF))
4095 return false;
4096
4097 // If the instruction's allocated size doesn't equal its type size, it
4098 // requires padding and will be scalarized.
4099 auto &DL = I->getModule()->getDataLayout();
4100 if (hasIrregularType(ScalarTy, DL))
4101 return false;
4102
4103 return true;
4104}
4105
4106void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4107 // We should not collect Uniforms more than once per VF. Right now,
4108 // this function is called from collectUniformsAndScalars(), which
4109 // already does this check. Collecting Uniforms for VF=1 does not make any
4110 // sense.
4111
4112 assert(VF.isVector() && !Uniforms.contains(VF) &&
4113 "This function should not be visited twice for the same VF");
4114
4115 // Visit the list of Uniforms. If we do not find any uniform value, we will
4116 // not analyze again; Uniforms.count(VF) will still return 1.
4117 Uniforms[VF].clear();
4118
4119 // We now know that the loop is vectorizable!
4120 // Collect instructions inside the loop that will remain uniform after
4121 // vectorization.
4122
4123 // Global values, params and instructions outside of current loop are out of
4124 // scope.
4125 auto isOutOfScope = [&](Value *V) -> bool {
4126 Instruction *I = dyn_cast<Instruction>(V);
4127 return (!I || !TheLoop->contains(I));
4128 };
4129
4130 // Worklist containing uniform instructions demanding lane 0.
4131 SetVector<Instruction *> Worklist;
4132 BasicBlock *Latch = TheLoop->getLoopLatch();
4133
4134 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4135 // that are scalar with predication must not be considered uniform after
4136 // vectorization, because that would create an erroneous replicating region
4137 // where only a single instance out of VF should be formed.
4138 // TODO: optimize such seldom cases if found important, see PR40816.
4139 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4140 if (isOutOfScope(I)) {
4141 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4142 << *I << "\n");
4143 return;
4144 }
4145 if (isScalarWithPredication(I, VF)) {
4146 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4147 << *I << "\n");
4148 return;
4149 }
4150 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4151 Worklist.insert(I);
4152 };
4153
4154 // Start with the conditional branch. If the branch condition is an
4155 // instruction contained in the loop that is only used by the branch, it is
4156 // uniform.
4157 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4158 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4159 addToWorklistIfAllowed(Cmp);
4160
4161 auto PrevVF = VF.divideCoefficientBy(2);
4162 // Return true if all lanes perform the same memory operation, and we can
4163 // thus choose to execute only one.
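// For example, a load from a loop-invariant address, or a store of a
// loop-invariant value to a loop-invariant address, performs the same memory
// operation on every lane, so executing a single scalar instance suffices.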
4164 auto isUniformMemOpUse = [&](Instruction *I) {
4165 // If the value was already known to not be uniform for the previous
4166 // (smaller VF), it cannot be uniform for the larger VF.
4167 if (PrevVF.isVector()) {
4168 auto Iter = Uniforms.find(PrevVF);
4169 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4170 return false;
4171 }
4172 if (!Legal->isUniformMemOp(*I, VF))
4173 return false;
4174 if (isa<LoadInst>(I))
4175 // Loading the same address always produces the same result - at least
4176 // assuming aliasing and ordering which have already been checked.
4177 return true;
4178 // Storing the same value on every iteration.
4179 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4180 };
4181
4182 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4183 InstWidening WideningDecision = getWideningDecision(I, VF);
4184 assert(WideningDecision != CM_Unknown &&
4185 "Widening decision should be ready at this moment");
4186
4187 if (isUniformMemOpUse(I))
4188 return true;
4189
4190 return (WideningDecision == CM_Widen ||
4191 WideningDecision == CM_Widen_Reverse ||
4192 WideningDecision == CM_Interleave);
4193 };
4194
4195 // Returns true if Ptr is the pointer operand of a memory access instruction
4196 // I, I is known to not require scalarization, and the pointer is not also
4197 // stored.
4198 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4199 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4200 return false;
4201 return getLoadStorePointerOperand(I) == Ptr &&
4202 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4203 };
4204
4205 // Holds a list of values which are known to have at least one uniform use.
4206 // Note that there may be other uses which aren't uniform. A "uniform use"
4207 // here is something which only demands lane 0 of the unrolled iterations;
4208 // it does not imply that all lanes produce the same value (e.g. this is not
4209 // the usual meaning of uniform)
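// For example, the pointer operand of a consecutive (widened) load has a
// uniform use: only lane 0's address is needed to form the wide load, even
// though the per-lane addresses differ.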
4210 SetVector<Value *> HasUniformUse;
4211
4212 // Scan the loop for instructions which are either a) known to have only
4213 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4214 for (auto *BB : TheLoop->blocks())
4215 for (auto &I : *BB) {
4216 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4217 switch (II->getIntrinsicID()) {
4218 case Intrinsic::sideeffect:
4219 case Intrinsic::experimental_noalias_scope_decl:
4220 case Intrinsic::assume:
4221 case Intrinsic::lifetime_start:
4222 case Intrinsic::lifetime_end:
4224 addToWorklistIfAllowed(&I);
4225 break;
4226 default:
4227 break;
4228 }
4229 }
4230
4231 // ExtractValue instructions must be uniform, because the operands are
4232 // known to be loop-invariant.
4233 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4234 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4235 "Expected aggregate value to be loop invariant");
4236 addToWorklistIfAllowed(EVI);
4237 continue;
4238 }
4239
4240 // If there's no pointer operand, there's nothing to do.
4242 if (!Ptr)
4243 continue;
4244
4245 if (isUniformMemOpUse(&I))
4246 addToWorklistIfAllowed(&I);
4247
4248 if (isVectorizedMemAccessUse(&I, Ptr))
4249 HasUniformUse.insert(Ptr);
4250 }
4251
4252 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4253 // demanding) users. Since loops are assumed to be in LCSSA form, this
4254 // disallows uses outside the loop as well.
4255 for (auto *V : HasUniformUse) {
4256 if (isOutOfScope(V))
4257 continue;
4258 auto *I = cast<Instruction>(V);
4259 auto UsersAreMemAccesses =
4260 llvm::all_of(I->users(), [&](User *U) -> bool {
4261 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4262 });
4263 if (UsersAreMemAccesses)
4264 addToWorklistIfAllowed(I);
4265 }
4266
4267 // Expand Worklist in topological order: whenever a new instruction
4268 // is added, its users should already be inside Worklist. This ensures
4269 // a uniform instruction will only be used by uniform instructions.
4270 unsigned idx = 0;
4271 while (idx != Worklist.size()) {
4272 Instruction *I = Worklist[idx++];
4273
4274 for (auto *OV : I->operand_values()) {
4275 // isOutOfScope operands cannot be uniform instructions.
4276 if (isOutOfScope(OV))
4277 continue;
4278 // First order recurrence Phi's should typically be considered
4279 // non-uniform.
4280 auto *OP = dyn_cast<PHINode>(OV);
4282 continue;
4283 // If all the users of the operand are uniform, then add the
4284 // operand into the uniform worklist.
4285 auto *OI = cast<Instruction>(OV);
4286 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4287 auto *J = cast<Instruction>(U);
4288 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4289 }))
4290 addToWorklistIfAllowed(OI);
4291 }
4292 }
4293
4294 // For an instruction to be added into Worklist above, all its users inside
4295 // the loop should also be in Worklist. However, this condition cannot be
4296 // true for phi nodes that form a cyclic dependence. We must process phi
4297 // nodes separately. An induction variable will remain uniform if all users
4298 // of the induction variable and induction variable update remain uniform.
4299 // The code below handles both pointer and non-pointer induction variables.
4300 for (const auto &Induction : Legal->getInductionVars()) {
4301 auto *Ind = Induction.first;
4302 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4303
4304 // Determine if all users of the induction variable are uniform after
4305 // vectorization.
4306 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4307 auto *I = cast<Instruction>(U);
4308 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4309 isVectorizedMemAccessUse(I, Ind);
4310 });
4311 if (!UniformInd)
4312 continue;
4313
4314 // Determine if all users of the induction variable update instruction are
4315 // uniform after vectorization.
4316 auto UniformIndUpdate =
4317 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4318 auto *I = cast<Instruction>(U);
4319 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4320 isVectorizedMemAccessUse(I, IndUpdate);
4321 });
4322 if (!UniformIndUpdate)
4323 continue;
4324
4325 // The induction variable and its update instruction will remain uniform.
4326 addToWorklistIfAllowed(Ind);
4327 addToWorklistIfAllowed(IndUpdate);
4328 }
4329
4330 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4331}
4332
4334 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4335
4337 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4338 "runtime pointer checks needed. Enable vectorization of this "
4339 "loop with '#pragma clang loop vectorize(enable)' when "
4340 "compiling with -Os/-Oz",
4341 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4342 return true;
4343 }
4344
4345 if (!PSE.getPredicate().isAlwaysTrue()) {
4346 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4347 "runtime SCEV checks needed. Enable vectorization of this "
4348 "loop with '#pragma clang loop vectorize(enable)' when "
4349 "compiling with -Os/-Oz",
4350 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4351 return true;
4352 }
4353
4354 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4355 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4356 reportVectorizationFailure("Runtime stride check for small trip count",
4357 "runtime stride == 1 checks needed. Enable vectorization of "
4358 "this loop without such check by compiling with -Os/-Oz",
4359 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4360 return true;
4361 }
4362
4363 return false;
4364}
4365
4367LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4369 return ElementCount::getScalable(0);
4370
4372 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4373 "ScalableVectorizationDisabled", ORE, TheLoop);
4374 return ElementCount::getScalable(0);
4375 }
4376
4377 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4378
4379 auto MaxScalableVF = ElementCount::getScalable(
4380 std::numeric_limits<ElementCount::ScalarTy>::max());
4381
4382 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4383 // FIXME: While for scalable vectors this is currently sufficient, this should
4384 // be replaced by a more detailed mechanism that filters out specific VFs,
4385 // instead of invalidating vectorization for a whole set of VFs based on the
4386 // MaxVF.
4387
4388 // Disable scalable vectorization if the loop contains unsupported reductions.
4389 if (!canVectorizeReductions(MaxScalableVF)) {
4391 "Scalable vectorization not supported for the reduction "
4392 "operations found in this loop.",
4393 "ScalableVFUnfeasible", ORE, TheLoop);
4394 return ElementCount::getScalable(0);
4395 }
4396
4397 // Disable scalable vectorization if the loop contains any instructions
4398 // with element types not supported for scalable vectors.
4399 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4400 return !Ty->isVoidTy() &&
4402 })) {
4403 reportVectorizationInfo("Scalable vectorization is not supported "
4404 "for all element types found in this loop.",
4405 "ScalableVFUnfeasible", ORE, TheLoop);
4406 return ElementCount::getScalable(0);
4407 }
4408
4410 return MaxScalableVF;
4411
4412 // Limit MaxScalableVF by the maximum safe dependence distance.
4413 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4414 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4415 else
4416 MaxScalableVF = ElementCount::getScalable(0);
4417
4418 if (!MaxScalableVF)
4420 "Max legal vector width too small, scalable vectorization "
4421 "unfeasible.",
4422 "ScalableVFUnfeasible", ORE, TheLoop);
4423
4424 return MaxScalableVF;
4425}
4426
4427FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4428 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4430 unsigned SmallestType, WidestType;
4431 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4432
4433 // Get the maximum safe dependence distance in bits computed by LAA.
4434 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4435 // the memory access that is most restrictive (involved in the smallest
4436 // dependence distance).
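// For example, if LAA reports a maximum safe vector width of 256 bits and the
// widest type accessed is i32, then at most 256 / 32 = 8 elements can safely
// be processed per vector iteration, so MaxSafeElements is 8.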
4437 unsigned MaxSafeElements =
4439
4440 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4441 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4442
4443 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4444 << ".\n");
4445 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4446 << ".\n");
4447
4448 // First analyze the UserVF, fall back if the UserVF should be ignored.
4449 if (UserVF) {
4450 auto MaxSafeUserVF =
4451 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4452
4453 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4454 // If `VF=vscale x N` is safe, then so is `VF=N`
4455 if (UserVF.isScalable())
4456 return FixedScalableVFPair(
4457 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4458 else
4459 return UserVF;
4460 }
4461
4462 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4463
4464 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4465 // is better to ignore the hint and let the compiler choose a suitable VF.
4466 if (!UserVF.isScalable()) {
4467 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4468 << " is unsafe, clamping to max safe VF="
4469 << MaxSafeFixedVF << ".\n");
4470 ORE->emit([&]() {
4471 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4473 TheLoop->getHeader())
4474 << "User-specified vectorization factor "
4475 << ore::NV("UserVectorizationFactor", UserVF)
4476 << " is unsafe, clamping to maximum safe vectorization factor "
4477 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4478 });
4479 return MaxSafeFixedVF;
4480 }
4481
4483 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4484 << " is ignored because scalable vectors are not "
4485 "available.\n");
4486 ORE->emit([&]() {
4487 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4489 TheLoop->getHeader())
4490 << "User-specified vectorization factor "
4491 << ore::NV("UserVectorizationFactor", UserVF)
4492 << " is ignored because the target does not support scalable "
4493 "vectors. The compiler will pick a more suitable value.";
4494 });
4495 } else {
4496 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4497 << " is unsafe. Ignoring scalable UserVF.\n");
4498 ORE->emit([&]() {
4499 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4501 TheLoop->getHeader())
4502 << "User-specified vectorization factor "
4503 << ore::NV("UserVectorizationFactor", UserVF)
4504 << " is unsafe. Ignoring the hint to let the compiler pick a "
4505 "more suitable value.";
4506 });
4507 }
4508 }
4509
4510 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4511 << " / " << WidestType << " bits.\n");
4512
4515 if (auto MaxVF =
4516 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4517 MaxSafeFixedVF, FoldTailByMasking))
4518 Result.FixedVF = MaxVF;
4519
4520 if (auto MaxVF =
4521 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4522 MaxSafeScalableVF, FoldTailByMasking))
4523 if (MaxVF.isScalable()) {
4524 Result.ScalableVF = MaxVF;
4525 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4526 << "\n");
4527 }
4528
4529 return Result;
4530}
4531
4535 // TODO: It may be useful to do so, since it's still likely to be dynamically
4536 // uniform if the target can skip.
4538 "Not inserting runtime ptr check for divergent target",
4539 "runtime pointer checks needed. Not enabled for divergent target",
4540 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4542 }
4543
4544 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4545 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4546 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4547 if (TC == 1) {
4548 reportVectorizationFailure("Single iteration (non) loop",
4549 "loop trip count is one, irrelevant for vectorization",
4550 "SingleIterationLoop", ORE, TheLoop);
4552 }
4553
4554 switch (ScalarEpilogueStatus) {
4556 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4558 [[fallthrough]];
4560 LLVM_DEBUG(
4561 dbgs() << "LV: vector predicate hint/switch found.\n"
4562 << "LV: Not allowing scalar epilogue, creating predicated "
4563 << "vector loop.\n");
4564 break;
4566 // fallthrough as a special case of OptForSize
4568 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4569 LLVM_DEBUG(
4570 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4571 else
4572 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4573 << "count.\n");
4574
4575 // Bail if runtime checks are required, which are not good when optimising
4576 // for size.
4579
4580 break;
4581 }
4582
4583 // The only loops we can vectorize without a scalar epilogue, are loops with
4584 // a bottom-test and a single exiting block. We'd have to handle the fact
4585 // that not every instruction executes on the last iteration. This will
4586 // require a lane mask which varies through the vector loop body. (TODO)
4588 // If there was a tail-folding hint/switch, but we can't fold the tail by
4589 // masking, fallback to a vectorization with a scalar epilogue.
4590 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4591 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4592 "scalar epilogue instead.\n");
4593 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4594 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4595 }
4597 }
4598
4599 // Now try the tail folding
4600
4601 // Invalidate interleave groups that require an epilogue if we can't mask
4602 // the interleave-group.
4604 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4605 "No decisions should have been taken at this point");
4606 // Note: There is no need to invalidate any cost modeling decisions here, as
4607 // none were taken so far.
4609 }
4610
4611 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4612
4613 // Avoid tail folding if the trip count is known to be a multiple of any VF
4614 // we choose.
4615 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4616 MaxFactors.FixedVF.getFixedValue();
4617 if (MaxFactors.ScalableVF) {
4618 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4619 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4620 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4621 *MaxPowerOf2RuntimeVF,
4622 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4623 } else
4624 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4625 }
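  // For illustration (hypothetical values, not taken from this code): if the
  // best fixed VF is 8 and the best scalable VF is <vscale x 4> with vscale
  // known to be a power of two and at most 4, MaxPowerOf2RuntimeVF becomes
  // max(8, 4 * 4) = 16; if vscale is not known to be a power of two, it is
  // reset to nullopt and tail folding remains on the table below.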
4626
4627 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4628 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4629 "MaxFixedVF must be a power of 2");
4630 unsigned MaxVFtimesIC =
4631 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4632 ScalarEvolution *SE = PSE.getSE();
4633 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4634 const SCEV *ExitCount = SE->getAddExpr(
4635 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4636 const SCEV *Rem = SE->getURemExpr(
4637 SE->applyLoopGuards(ExitCount, TheLoop),
4638 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
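      // As a concrete (made-up) example: with a trip count known to be 128 and
      // MaxVFtimesIC == 16, the remainder 128 urem 16 is 0, so the loop divides
      // evenly, no tail remains, and tail folding can be skipped entirely.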
4639 if (Rem->isZero()) {
4640 // Accept MaxFixedVF if we do not have a tail.
4641 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4642 return MaxFactors;
4643 }
4644 }
4645
4646 // If we don't know the precise trip count, or if the trip count that we
4647 // found modulo the vectorization factor is not zero, try to fold the tail
4648 // by masking.
4649 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4651 if (foldTailByMasking())
4652 return MaxFactors;
4653
4654 // If there was a tail-folding hint/switch, but we can't fold the tail by
4655 // masking, fall back to a vectorization with a scalar epilogue.
4656 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4657 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4658 "scalar epilogue instead.\n");
4659 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4660 return MaxFactors;
4661 }
4662
4663 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4664 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4666 }
4667
4668 if (TC == 0) {
4670 "Unable to calculate the loop count due to complex control flow",
4671 "unable to calculate the loop count due to complex control flow",
4672 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4674 }
4675
4677 "Cannot optimize for size and vectorize at the same time.",
4678 "cannot optimize for size and vectorize at the same time. "
4679 "Enable vectorization of this loop with '#pragma clang loop "
4680 "vectorize(enable)' when compiling with -Os/-Oz",
4681 "NoTailLoopWithOptForSize", ORE, TheLoop);
4683}
4684
4685ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4686 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4687 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4688 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4689 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4690 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4692
4693 // Convenience function to return the minimum of two ElementCounts.
4694 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4695 assert((LHS.isScalable() == RHS.isScalable()) &&
4696 "Scalable flags must match");
4697 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4698 };
4699
4700 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4701 // Note that both WidestRegister and WidestType may not be powers of 2.
4702 auto MaxVectorElementCount = ElementCount::get(
4703 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4704 ComputeScalableMaxVF);
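  // For example (illustrative numbers only): with a 128-bit widest register
  // and a widest in-loop element type of 32 bits, bit_floor(128 / 32) == 4,
  // giving a starting point of 4 lanes (interpreted as <vscale x 4> when
  // computing the scalable maximum) before clamping against MaxSafeVF below.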
4705 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4706 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4707 << (MaxVectorElementCount * WidestType) << " bits.\n");
4708
4709 if (!MaxVectorElementCount) {
4710 LLVM_DEBUG(dbgs() << "LV: The target has no "
4711 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4712 << " vector registers.\n");
4713 return ElementCount::getFixed(1);
4714 }
4715
4716 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4717 if (MaxVectorElementCount.isScalable() &&
4718 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4719 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4720 auto Min = Attr.getVScaleRangeMin();
4721 WidestRegisterMinEC *= Min;
4722 }
4723
4724 // When a scalar epilogue is required, at least one iteration of the scalar
4725 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4726 // max VF that results in a dead vector loop.
4727 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4728 MaxTripCount -= 1;
4729
4730 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4731 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4732 // If upper bound loop trip count (TC) is known at compile time there is no
4733 // point in choosing VF greater than TC (as done in the loop below). Select
4734 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4735 // scalable, we only fall back on a fixed VF when the TC is less than or
4736 // equal to the known number of lanes.
4737 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4738 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4739 "exceeding the constant trip count: "
4740 << ClampedUpperTripCount << "\n");
4741 return ElementCount::get(
4742 ClampedUpperTripCount,
4743 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4744 }
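  // For example (illustrative, assuming the tail is not folded by masking):
  // with a constant upper-bound trip count of 6 and an 8-lane register,
  // bit_floor(6) == 4, so the VF is clamped to 4 rather than 8, avoiding a
  // vector loop that could never execute.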
4745
4747 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4749 ElementCount MaxVF = MaxVectorElementCount;
4750 if (MaximizeBandwidth ||
4751 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4754 auto MaxVectorElementCountMaxBW = ElementCount::get(
4755 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4756 ComputeScalableMaxVF);
4757 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4758
4759 // Collect all viable vectorization factors larger than the default MaxVF
4760 // (i.e. MaxVectorElementCount).
4762 for (ElementCount VS = MaxVectorElementCount * 2;
4763 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4764 VFs.push_back(VS);
4765
4766 // For each VF calculate its register usage.
4767 auto RUs = calculateRegisterUsage(VFs);
4768
4769 // Select the largest VF which doesn't require more registers than existing
4770 // ones.
4771 for (int i = RUs.size() - 1; i >= 0; --i) {
4772 bool Selected = true;
4773 for (auto &pair : RUs[i].MaxLocalUsers) {
4774 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4775 if (pair.second > TargetNumRegisters)
4776 Selected = false;
4777 }
4778 if (Selected) {
4779 MaxVF = VFs[i];
4780 break;
4781 }
4782 }
4783 if (ElementCount MinVF =
4784 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4785 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4786 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4787 << ") with target's minimum: " << MinVF << '\n');
4788 MaxVF = MinVF;
4789 }
4790 }
4791
4792 // Invalidate any widening decisions we might have made, in case the loop
4793 // requires predication (decided later), but we have already made some
4794 // load/store widening decisions.
4796 }
4797 return MaxVF;
4798}
4799
4800/// Convenience function that returns the value of vscale_range if
4801/// vscale_range.min == vscale_range.max, and otherwise returns the value
4802/// returned by the corresponding TTI method.
4803static std::optional<unsigned>
4805 const Function *Fn = L->getHeader()->getParent();
4806 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4807 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4808 auto Min = Attr.getVScaleRangeMin();
4809 auto Max = Attr.getVScaleRangeMax();
4810 if (Max && Min == Max)
4811 return Max;
4812 }
4813
4814 return TTI.getVScaleForTuning();
4815}
4816
4817bool LoopVectorizationPlanner::isMoreProfitable(
4818 const VectorizationFactor &A, const VectorizationFactor &B) const {
4819 InstructionCost CostA = A.Cost;
4820 InstructionCost CostB = B.Cost;
4821
4822 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4823
4824 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4825 // If the trip count is a known (possibly small) constant, the trip count
4826 // will be rounded up to an integer number of iterations under
4827 // FoldTailByMasking. The total cost in that case will be
4828 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4829 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4830 // some extra overheads, but for the purpose of comparing the costs of
4831 // different VFs we can use this to compare the total loop-body cost
4832 // expected after vectorization.
4833 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4834 InstructionCost VectorCost,
4835 InstructionCost ScalarCost) {
4836 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4837 : VectorCost * (MaxTripCount / VF) +
4838 ScalarCost * (MaxTripCount % VF);
4839 };
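    // Rough illustration with made-up numbers: for MaxTripCount = 10 and
    // VF = 4, tail folding gives VectorCost * ceil(10 / 4) = VectorCost * 3,
    // while a scalar epilogue gives VectorCost * 2 + ScalarCost * 2, matching
    // the two branches of GetCostForTC above.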
4840 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4841 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4842
4843 return RTCostA < RTCostB;
4844 }
4845
4846 // Improve estimate for the vector width if it is scalable.
4847 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4848 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4849 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4850 if (A.Width.isScalable())
4851 EstimatedWidthA *= *VScale;
4852 if (B.Width.isScalable())
4853 EstimatedWidthB *= *VScale;
4854 }
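  // E.g., for a candidate of <vscale x 4> with a tuning vscale of 2, the
  // estimated width used in the comparisons below is 4 * 2 = 8 lanes.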
4855
4856 // Assume vscale may be larger than 1 (or the value being tuned for),
4857 // so that scalable vectorization is slightly favorable over fixed-width
4858 // vectorization.
4859 if (A.Width.isScalable() && !B.Width.isScalable())
4860 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4861
4862 // To avoid the need for FP division:
4863 // (CostA / A.Width) < (CostB / B.Width)
4864 // <=> (CostA * B.Width) < (CostB * A.Width)
4865 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4866}
4867
4870 Loop *TheLoop) {
4871 if (InvalidCosts.empty())
4872 return;
4873
4874 // Emit a report of VFs with invalid costs in the loop.
4875
4876 // Group the remarks per instruction, keeping the instruction order from
4877 // InvalidCosts.
4878 std::map<Instruction *, unsigned> Numbering;
4879 unsigned I = 0;
4880 for (auto &Pair : InvalidCosts)
4881 if (!Numbering.count(Pair.first))
4882 Numbering[Pair.first] = I++;
4883
4884 // Sort the list, first on instruction(number) then on VF.
4885 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4886 if (Numbering[A.first] != Numbering[B.first])
4887 return Numbering[A.first] < Numbering[B.first];
4889 return ECC(A.second, B.second);
4890 });
4891
4892 // For a list of ordered instruction-vf pairs:
4893 // [(load, vf1), (load, vf2), (store, vf1)]
4894 // Group the instructions together to emit separate remarks for:
4895 // load (vf1, vf2)
4896 // store (vf1)
4897 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4898 auto Subset = ArrayRef<InstructionVFPair>();
4899 do {
4900 if (Subset.empty())
4901 Subset = Tail.take_front(1);
4902
4903 Instruction *I = Subset.front().first;
4904
4905 // If the next instruction is different, or if there are no other pairs,
4906 // emit a remark for the collated subset. e.g.
4907 // [(load, vf1), (load, vf2)]
4908 // to emit:
4909 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4910 if (Subset == Tail || Tail[Subset.size()].first != I) {
4911 std::string OutString;
4912 raw_string_ostream OS(OutString);
4913 assert(!Subset.empty() && "Unexpected empty range");
4914 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4915 for (const auto &Pair : Subset)
4916 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4917 OS << "):";
4918 if (auto *CI = dyn_cast<CallInst>(I))
4919 OS << " call to " << CI->getCalledFunction()->getName();
4920 else
4921 OS << " " << I->getOpcodeName();
4922 OS.flush();
4923 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4924 Tail = Tail.drop_front(Subset.size());
4925 Subset = {};
4926 } else
4927 // Grow the subset by one element
4928 Subset = Tail.take_front(Subset.size() + 1);
4929 } while (!Tail.empty());
4930}
4931
4932VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4933 const ElementCountSet &VFCandidates) {
4934 InstructionCost ExpectedCost =
4936 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4937 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4938 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4939 "Expected Scalar VF to be a candidate");
4940
4941 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4942 ExpectedCost);
4943 VectorizationFactor ChosenFactor = ScalarCost;
4944
4945 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4946 if (ForceVectorization && VFCandidates.size() > 1) {
4947 // Ignore scalar width, because the user explicitly wants vectorization.
4948 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4949 // evaluation.
4950 ChosenFactor.Cost = InstructionCost::getMax();
4951 }
4952
4953 SmallVector<InstructionVFPair> InvalidCosts;
4954 for (const auto &i : VFCandidates) {
4955 // The cost for scalar VF=1 is already calculated, so ignore it.
4956 if (i.isScalar())
4957 continue;
4958
4960 CM.expectedCost(i, &InvalidCosts);
4961 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
4962
4963#ifndef NDEBUG
4964 unsigned AssumedMinimumVscale =
4965 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4966 unsigned Width =
4967 Candidate.Width.isScalable()
4968 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4969 : Candidate.Width.getFixedValue();
4970 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4971 << " costs: " << (Candidate.Cost / Width));
4972 if (i.isScalable())
4973 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4974 << AssumedMinimumVscale << ")");
4975 LLVM_DEBUG(dbgs() << ".\n");
4976#endif
4977
4978 if (!C.second && !ForceVectorization) {
4979 LLVM_DEBUG(
4980 dbgs() << "LV: Not considering vector loop of width " << i
4981 << " because it will not generate any vector instructions.\n");
4982 continue;
4983 }
4984
4985 // If profitable, add it to the ProfitableVFs list.
4986 if (isMoreProfitable(Candidate, ScalarCost))
4987 ProfitableVFs.push_back(Candidate);
4988
4989 if (isMoreProfitable(Candidate, ChosenFactor))
4990 ChosenFactor = Candidate;
4991 }
4992
4993 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4994
4997 "There are conditional stores.",
4998 "store that is conditionally executed prevents vectorization",
4999 "ConditionalStore", ORE, OrigLoop);
5000 ChosenFactor = ScalarCost;
5001 }
5002
5003 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5004 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5005 << "LV: Vectorization seems to be not beneficial, "
5006 << "but was forced by a user.\n");
5007 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5008 return ChosenFactor;
5009}
5010
5011bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5012 ElementCount VF) const {
5013 // Cross iteration phis such as reductions need special handling and are
5014 // currently unsupported.
5015 if (any_of(OrigLoop->getHeader()->phis(),
5016 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5017 return false;
5018
5019 // Phis with uses outside of the loop require special handling and are
5020 // currently unsupported.
5021 for (const auto &Entry : Legal->getInductionVars()) {
5022 // Look for uses of the value of the induction at the last iteration.
5023 Value *PostInc =
5024 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5025 for (User *U : PostInc->users())
5026 if (!OrigLoop->contains(cast<Instruction>(U)))
5027 return false;
5028 // Look for uses of penultimate value of the induction.
5029 for (User *U : Entry.first->users())
5030 if (!OrigLoop->contains(cast<Instruction>(U)))
5031 return false;
5032 }
5033
5034 // Epilogue vectorization code has not been audited to ensure it handles
5035 // non-latch exits properly. It may be fine, but it needs to be audited and
5036 // tested.
5037 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5038 return false;
5039
5040 return true;
5041}
5042
5044 const ElementCount VF) const {
5045 // FIXME: We need a much better cost-model to take different parameters such
5046 // as register pressure, code size increase and cost of extra branches into
5047 // account. For now we apply a very crude heuristic and only consider loops
5048 // with vectorization factors larger than a certain value.
5049
5050 // Allow the target to opt out entirely.
5052 return false;
5053
5054 // We also consider epilogue vectorization unprofitable for targets that don't
5055 // consider interleaving beneficial (e.g., MVE).
5056 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5057 return false;
5058
5059 unsigned Multiplier = 1;
5060 if (VF.isScalable())
5061 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5062 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5063 return true;
5064 return false;
5065}
5066
5068 const ElementCount MainLoopVF, unsigned IC) {
5071 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5072 return Result;
5073 }
5074
5075 if (!CM.isScalarEpilogueAllowed()) {
5076 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5077 "epilogue is allowed.\n");
5078 return Result;
5079 }
5080
5081 // Not really a cost consideration, but check for unsupported cases here to
5082 // simplify the logic.
5083 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5084 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5085 "is not a supported candidate.\n");
5086 return Result;
5087 }
5088
5090 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5092 if (hasPlanWithVF(ForcedEC))
5093 return {ForcedEC, 0, 0};
5094 else {
5095 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5096 "viable.\n");
5097 return Result;
5098 }
5099 }
5100
5101 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5102 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5103 LLVM_DEBUG(
5104 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5105 return Result;
5106 }
5107
5108 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5109 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5110 "this loop\n");
5111 return Result;
5112 }
5113
5114 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5115 // the main loop handles 8 lanes per iteration. We could still benefit from
5116 // vectorizing the epilogue loop with VF=4.
5117 ElementCount EstimatedRuntimeVF = MainLoopVF;
5118 if (MainLoopVF.isScalable()) {
5119 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5120 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5121 EstimatedRuntimeVF *= *VScale;
5122 }
5123
5124 ScalarEvolution &SE = *PSE.getSE();
5125 Type *TCType = Legal->getWidestInductionType();
5126 const SCEV *RemainingIterations = nullptr;
5127 for (auto &NextVF : ProfitableVFs) {
5128 // Skip candidate VFs without a corresponding VPlan.
5129 if (!hasPlanWithVF(NextVF.Width))
5130 continue;
5131
5132 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5133 // vectors) or the VF of the main loop (fixed vectors).
5134 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5135 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5136 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5137 continue;
5138
5139 // If NextVF is greater than the number of remaining iterations, the
5140 // epilogue loop would be dead. Skip such factors.
5141 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5142 // TODO: extend to support scalable VFs.
5143 if (!RemainingIterations) {
5144 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5145 RemainingIterations = SE.getURemExpr(
5146 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5147 }
5148 if (SE.isKnownPredicate(
5150 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5151 RemainingIterations))
5152 continue;
5153 }
5154
5155 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5156 Result = NextVF;
5157 }
5158
5159 if (Result != VectorizationFactor::Disabled())
5160 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5161 << Result.Width << "\n");
5162 return Result;
5163}
5164
5165std::pair<unsigned, unsigned>
5167 unsigned MinWidth = -1U;
5168 unsigned MaxWidth = 8;
5170 // For in-loop reductions, no element types are added to ElementTypesInLoop
5171 // if there are no loads/stores in the loop. In this case, check through the
5172 // reduction variables to determine the maximum width.
5173 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5174 // Reset MaxWidth so that we can find the smallest type used by recurrences
5175 // in the loop.
5176 MaxWidth = -1U;
5177 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5178 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5179 // When finding the min width used by the recurrence we need to account
5180 // for casts on the input operands of the recurrence.
5181 MaxWidth = std::min<unsigned>(
5182 MaxWidth, std::min<unsigned>(
5185 }
5186 } else {
5187 for (Type *T : ElementTypesInLoop) {
5188 MinWidth = std::min<unsigned>(
5189 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5190 MaxWidth = std::max<unsigned>(
5191 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5192 }
5193 }
5194 return {MinWidth, MaxWidth};
5195}
5196
5198 ElementTypesInLoop.clear();
5199 // For each block.
5200 for (BasicBlock *BB : TheLoop->blocks()) {
5201 // For each instruction in the loop.
5202 for (Instruction &I : BB->instructionsWithoutDebug()) {
5203 Type *T = I.getType();
5204
5205 // Skip ignored values.
5206 if (ValuesToIgnore.count(&I))
5207 continue;
5208
5209 // Only examine Loads, Stores and PHINodes.
5210 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5211 continue;
5212
5213 // Examine PHI nodes that are reduction variables. Update the type to
5214 // account for the recurrence type.
5215 if (auto *PN = dyn_cast<PHINode>(&I)) {
5216 if (!Legal->isReductionVariable(PN))
5217 continue;
5218 const RecurrenceDescriptor &RdxDesc =
5219 Legal->getReductionVars().find(PN)->second;
5222 RdxDesc.getRecurrenceType(),
5224 continue;
5225 T = RdxDesc.getRecurrenceType();
5226 }
5227
5228 // Examine the stored values.
5229 if (auto *ST = dyn_cast<StoreInst>(&I))
5230 T = ST->getValueOperand()->getType();
5231
5232 assert(T->isSized() &&
5233 "Expected the load/store/recurrence type to be sized");
5234
5235 ElementTypesInLoop.insert(T);
5236 }
5237 }
5238}
5239
5240unsigned
5242 InstructionCost LoopCost) {
5243 // -- The interleave heuristics --
5244 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5245 // There are many micro-architectural considerations that we can't predict
5246 // at this level. For example, frontend pressure (on decode or fetch) due to
5247 // code size, or the number and capabilities of the execution ports.
5248 //
5249 // We use the following heuristics to select the interleave count:
5250 // 1. If the code has reductions, then we interleave to break the cross
5251 // iteration dependency.
5252 // 2. If the loop is really small, then we interleave to reduce the loop
5253 // overhead.
5254 // 3. We don't interleave if we think that we will spill registers to memory
5255 // due to the increased register pressure.
5256
5258 return 1;
5259
5260 // We used the distance for the interleave count.
5262 return 1;
5263
5264 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5265 const bool HasReductions = !Legal->getReductionVars().empty();
5266
5267 // If we did not calculate the cost for VF (because the user selected the VF)
5268 // then we calculate the cost of VF here.
5269 if (LoopCost == 0) {
5270 LoopCost = expectedCost(VF).first;
5271 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5272
5273 // Loop body is free and there is no need for interleaving.
5274 if (LoopCost == 0)
5275 return 1;
5276 }
5277
5279 // We divide by these constants so assume that we have at least one
5280 // instruction that uses at least one register.
5281 for (auto& pair : R.MaxLocalUsers) {
5282 pair.second = std::max(pair.second, 1U);
5283 }
5284
5285 // We calculate the interleave count using the following formula.
5286 // Subtract the number of loop invariants from the number of available
5287 // registers. These registers are used by all of the interleaved instances.
5288 // Next, divide the remaining registers by the number of registers that is
5289 // required by the loop, in order to estimate how many parallel instances
5290 // fit without causing spills. All of this is rounded down if necessary to be
5291 // a power of two. We want power of two interleave count to simplify any
5292 // addressing operations or alignment considerations.
5293 // We also want power of two interleave counts to ensure that the induction
5294 // variable of the vector loop wraps to zero, when tail is folded by masking;
5295 // this currently happens when OptForSize, in which case IC is set to 1 above.
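  // A hypothetical example of the computation below: with 32 vector registers,
  // 2 of them tied up by loop-invariant values and at most 10 values live at
  // the widest point of the loop, bit_floor((32 - 2) / 10) == 2, so register
  // pressure alone would cap the interleave count at 2.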
5296 unsigned IC = UINT_MAX;
5297
5298 for (auto& pair : R.MaxLocalUsers) {
5299 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5300 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5301 << " registers of "
5302 << TTI.getRegisterClassName(pair.first) << " register class\n");
5303 if (VF.isScalar()) {
5304 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5305 TargetNumRegisters = ForceTargetNumScalarRegs;
5306 } else {
5307 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5308 TargetNumRegisters = ForceTargetNumVectorRegs;
5309 }
5310 unsigned MaxLocalUsers = pair.second;
5311 unsigned LoopInvariantRegs = 0;
5312 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5313 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5314
5315 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5316 MaxLocalUsers);
5317 // Don't count the induction variable as interleaved.
5319 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5320 std::max(1U, (MaxLocalUsers - 1)));
5321 }
5322
5323 IC = std::min(IC, TmpIC);
5324 }
5325
5326 // Clamp the interleave ranges to reasonable counts.
5327 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5328
5329 // Check if the user has overridden the max.
5330 if (VF.isScalar()) {
5331 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5332 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5333 } else {
5334 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5335 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5336 }
5337
5338 unsigned EstimatedVF = VF.getKnownMinValue();
5339 if (VF.isScalable()) {
5340 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5341 EstimatedVF *= *VScale;
5342 }
5343 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5344
5345 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5346 if (KnownTC > 0) {
5347 // At least one iteration must be scalar when this constraint holds. So the
5348 // maximum available iterations for interleaving is one less.
5349 unsigned AvailableTC =
5350 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5351
5352 // If trip count is known we select between two prospective ICs, where
5353 // 1) the aggressive IC is capped by the trip count divided by VF
5354 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5355 // The final IC is selected in a way that the epilogue loop trip count is
5356 // minimized while maximizing the IC itself, so that we either run the
5357 // vector loop at least once if it generates a small epilogue loop, or else
5358 // we run the vector loop at least twice.
5359
5360 unsigned InterleaveCountUB = bit_floor(
5361 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5362 unsigned InterleaveCountLB = bit_floor(std::max(
5363 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
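    // Illustration with assumed values: for AvailableTC = 32, EstimatedVF = 4
    // and a target maximum of 8, the aggressive bound is
    // bit_floor(min(32 / 4, 8)) == 8 and the conservative bound is
    // bit_floor(min(32 / 8, 8)) == 4; both leave a zero-iteration scalar tail,
    // so the check below keeps the larger count.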
5364 MaxInterleaveCount = InterleaveCountLB;
5365
5366 if (InterleaveCountUB != InterleaveCountLB) {
5367 unsigned TailTripCountUB =
5368 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5369 unsigned TailTripCountLB =
5370 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5371 // If both produce same scalar tail, maximize the IC to do the same work
5372 // in fewer vector loop iterations
5373 if (TailTripCountUB == TailTripCountLB)
5374 MaxInterleaveCount = InterleaveCountUB;
5375 }
5376 } else if (BestKnownTC && *BestKnownTC > 0) {
5377 // At least one iteration must be scalar when this constraint holds. So the
5378 // maximum available iterations for interleaving is one less.
5379 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5380 ? (*BestKnownTC) - 1
5381 : *BestKnownTC;
5382
5383 // If trip count is an estimated compile time constant, limit the
5384 // IC to be capped by the trip count divided by VF * 2, such that the vector
5385 // loop runs at least twice to make interleaving seem profitable when there
5386 // is an epilogue loop present. Since the exact trip count is not known, we
5387 // choose to be conservative in our IC estimate.
5388 MaxInterleaveCount = bit_floor(std::max(
5389 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5390 }
5391
5392 assert(MaxInterleaveCount > 0 &&
5393 "Maximum interleave count must be greater than 0");
5394
5395 // Clamp the calculated IC to be between the 1 and the max interleave count
5396 // that the target and trip count allows.
5397 if (IC > MaxInterleaveCount)
5398 IC = MaxInterleaveCount;
5399 else
5400 // Make sure IC is greater than 0.
5401 IC = std::max(1u, IC);
5402
5403 assert(IC > 0 && "Interleave count must be greater than 0.");
5404
5405 // Interleave if we vectorized this loop and there is a reduction that could
5406 // benefit from interleaving.
5407 if (VF.isVector() && HasReductions) {
5408 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5409 return IC;
5410 }
5411
5412 // For any scalar loop that either requires runtime checks or predication we
5413 // are better off leaving this to the unroller. Note that if we've already
5414 // vectorized the loop we will have done the runtime check and so interleaving
5415 // won't require further checks.
5416 bool ScalarInterleavingRequiresPredication =
5417 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5418 return Legal->blockNeedsPredication(BB);
5419 }));
5420 bool ScalarInterleavingRequiresRuntimePointerCheck =
5422
5423 // We want to interleave small loops in order to reduce the loop overhead and
5424 // potentially expose ILP opportunities.
5425 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5426 << "LV: IC is " << IC << '\n'
5427 << "LV: VF is " << VF << '\n');
5428 const bool AggressivelyInterleaveReductions =
5429 TTI.enableAggressiveInterleaving(HasReductions);
5430 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5431 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5432 // We assume that the cost overhead is 1 and we use the cost model
5433 // to estimate the cost of the loop and interleave until the cost of the
5434 // loop overhead is about 5% of the cost of the loop.
5435 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5436 SmallLoopCost / *LoopCost.getValue()));
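    // For instance, if SmallLoopCost is 20 and the estimated loop body cost is
    // 6, bit_floor(20 / 6) == 2, so a cheap loop body is interleaved twice
    // (unless the register-pressure-based IC above is even smaller).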
5437
5438 // Interleave until store/load ports (estimated by max interleave count) are
5439 // saturated.
5440 unsigned NumStores = Legal->getNumStores();
5441 unsigned NumLoads = Legal->getNumLoads();
5442 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5443 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5444
5445 // There is little point in interleaving for reductions containing selects
5446 // and compares when VF=1 since it may just create more overhead than it's
5447 // worth for loops with small trip counts. This is because we still have to
5448 // do the final reduction after the loop.
5449 bool HasSelectCmpReductions =
5450 HasReductions &&
5451 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5452 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5453 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5454 RdxDesc.getRecurrenceKind());
5455 });
5456 if (HasSelectCmpReductions) {
5457 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5458 return 1;
5459 }
5460
5461 // If we have a scalar reduction (vector reductions are already dealt with
5462 // by this point), we can increase the critical path length if the loop
5463 // we're interleaving is inside another loop. For tree-wise reductions
5464 // set the limit to 2, and for ordered reductions it's best to disable
5465 // interleaving entirely.
5466 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5467 bool HasOrderedReductions =
5468 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5469 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5470 return RdxDesc.isOrdered();
5471 });
5472 if (HasOrderedReductions) {
5473 LLVM_DEBUG(
5474 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5475 return 1;
5476 }
5477
5478 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5479 SmallIC = std::min(SmallIC, F);
5480 StoresIC = std::min(StoresIC, F);
5481 LoadsIC = std::min(LoadsIC, F);
5482 }
5483
5485 std::max(StoresIC, LoadsIC) > SmallIC) {
5486 LLVM_DEBUG(
5487 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5488 return std::max(StoresIC, LoadsIC);
5489 }
5490
5491 // If there are scalar reductions and TTI has enabled aggressive
5492 // interleaving for reductions, we will interleave to expose ILP.
5493 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5494 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5495 // Interleave no less than SmallIC but not as aggressively as the normal IC,
5496 // to handle the rare situation when resources are too limited.
5497 return std::max(IC / 2, SmallIC);
5498 } else {
5499 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5500 return SmallIC;
5501 }
5502 }
5503
5504 // Interleave if this is a large loop (small loops are already dealt with by
5505 // this point) that could benefit from interleaving.
5506 if (AggressivelyInterleaveReductions) {
5507 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5508 return IC;
5509 }
5510
5511 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5512 return 1;
5513}
5514
5517 // This function calculates the register usage by measuring the highest number
5518 // of values that are alive at a single location. Obviously, this is a very
5519 // rough estimation. We scan the loop in topological order and
5520 // assign a number to each instruction. We use RPO to ensure that defs are
5521 // met before their users. We assume that each instruction that has in-loop
5522 // users starts an interval. We record every time that an in-loop value is
5523 // used, so we have a list of the first and last occurrences of each
5524 // instruction. Next, we transpose this data structure into a multi map that
5525 // holds the list of intervals that *end* at a specific location. This multi
5526 // map allows us to perform a linear search. We scan the instructions linearly
5527 // and record each time that a new interval starts, by placing it in a set.
5528 // If we find this value in the multi-map then we remove it from the set.
5529 // The max register usage is the maximum size of the set.
5530 // We also search for instructions that are defined outside the loop, but are
5531 // used inside the loop. We need this number separately from the max-interval
5532 // usage number because when we unroll, loop-invariant values do not take
5533 // more registers.
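  // Illustrative sketch (hypothetical IR): in a body such as
  //   %a = load ...        ; %a starts an interval
  //   %b = add %a, %inv    ; last use of %a; %inv is defined outside the loop
  //   store %b, ...        ; last use of %b
  // %a and %b each occupy a register over their interval, while %inv is
  // recorded in LoopInvariants and counted as a loop-invariant register rather
  // than in the per-location interval count.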
5535 DFS.perform(LI);
5536
5537 RegisterUsage RU;
5538
5539 // Each 'key' in the map opens a new interval. The values
5540 // of the map are the index of the 'last seen' usage of the
5541 // instruction that is the key.
5543
5544 // Maps instruction to its index.
5546 // Marks the end of each interval.
5547 IntervalMap EndPoint;
5548 // Saves the list of instruction indices that are used in the loop.
5550 // Saves the list of values that are used in the loop but are defined outside
5551 // the loop (not including non-instruction values such as arguments and
5552 // constants).
5553 SmallSetVector<Instruction *, 8> LoopInvariants;
5554
5555 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5556 for (Instruction &I : BB->instructionsWithoutDebug()) {
5557 IdxToInstr.push_back(&I);
5558
5559 // Save the end location of each USE.
5560 for (Value *U : I.operands()) {
5561 auto *Instr = dyn_cast<Instruction>(U);
5562
5563 // Ignore non-instruction values such as arguments, constants, etc.
5564 // FIXME: Might need some motivation why these values are ignored. If, for
5565 // example, an argument is used inside the loop, it will increase the
5566 // register pressure (so shouldn't we add it to LoopInvariants?).
5567 if (!Instr)
5568 continue;
5569
5570 // If this instruction is outside the loop then record it and continue.
5571 if (!TheLoop->contains(Instr)) {
5572 LoopInvariants.insert(Instr);
5573 continue;
5574 }
5575
5576 // Overwrite previous end points.
5577 EndPoint[Instr] = IdxToInstr.size();
5578 Ends.insert(Instr);
5579 }
5580 }
5581 }
5582
5583 // Saves the list of intervals that end with the index in 'key'.
5584 using InstrList = SmallVector<Instruction *, 2>;
5585 DenseMap<unsigned, InstrList> TransposeEnds;
5586
5587 // Transpose the EndPoints to a list of values that end at each index.
5588 for (auto &Interval : EndPoint)
5589 TransposeEnds[Interval.second].push_back(Interval.first);
5590
5591 SmallPtrSet<Instruction *, 8> OpenIntervals;
5594
5595 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5596
5597 const auto &TTICapture = TTI;
5598 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5599 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5600 return 0;
5601 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5602 };
5603
5604 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5605 Instruction *I = IdxToInstr[i];
5606
5607 // Remove all of the instructions that end at this location.
5608 InstrList &List = TransposeEnds[i];
5609 for (Instruction *ToRemove : List)
5610 OpenIntervals.erase(ToRemove);
5611
5612 // Ignore instructions that are never used within the loop.
5613 if (!Ends.count(I))
5614 continue;
5615
5616 // Skip ignored values.
5617 if (ValuesToIgnore.count(I))
5618 continue;
5619
5621
5622 // For each VF find the maximum usage of registers.
5623 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5624 // Count the number of registers used, per register class, given all open
5625 // intervals.
5626 // Note that elements in this SmallMapVector will be default constructed
5627 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5628 // there is no previous entry for ClassID.
5630
5631 if (VFs[j].isScalar()) {
5632 for (auto *Inst : OpenIntervals) {
5633 unsigned ClassID =
5634 TTI.getRegisterClassForType(false, Inst->getType());
5635 // FIXME: The target might use more than one register for the type
5636 // even in the scalar case.
5637 RegUsage[ClassID] += 1;
5638 }
5639 } else {
5641 for (auto *Inst : OpenIntervals) {
5642 // Skip ignored values for VF > 1.
5643 if (VecValuesToIgnore.count(Inst))
5644 continue;
5645 if (isScalarAfterVectorization(Inst, VFs[j])) {
5646 unsigned ClassID =
5647 TTI.getRegisterClassForType(false, Inst->getType());
5648 // FIXME: The target might use more than one register for the type
5649 // even in the scalar case.
5650 RegUsage[ClassID] += 1;
5651 } else {
5652 unsigned ClassID =
5653 TTI.getRegisterClassForType(true, Inst->getType());
5654 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5655 }
5656 }
5657 }
5658
5659 for (auto& pair : RegUsage) {
5660 auto &Entry = MaxUsages[j][pair.first];
5661 Entry = std::max(Entry, pair.second);
5662 }
5663 }
5664
5665 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5666 << OpenIntervals.size() << '\n');
5667
5668 // Add the current instruction to the list of open intervals.
5669 OpenIntervals.insert(I);
5670 }
5671
5672 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5673 // Note that elements in this SmallMapVector will be default constructed
5674 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5675 // there is no previous entry for ClassID.
5677
5678 for (auto *Inst : LoopInvariants) {
5679 // FIXME: The target might use more than one register for the type
5680 // even in the scalar case.
5681 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5682 auto *I = cast<Instruction>(U);
5683 return TheLoop != LI->getLoopFor(I->getParent()) ||
5684 isScalarAfterVectorization(I, VFs[i]);
5685 });
5686
5687 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5688 unsigned ClassID =
5689 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5690 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5691 }
5692
5693 LLVM_DEBUG({
5694 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5695 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5696 << " item\n";
5697 for (const auto &pair : MaxUsages[i]) {
5698 dbgs() << "LV(REG): RegisterClass: "
5699 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5700 << " registers\n";
5701 }
5702 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5703 << " item\n";
5704 for (const auto &pair : Invariant) {
5705 dbgs() << "LV(REG): RegisterClass: "
5706 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5707 << " registers\n";
5708 }
5709 });
5710
5711 RU.LoopInvariantRegs = Invariant;
5712 RU.MaxLocalUsers = MaxUsages[i];
5713 RUs[i] = RU;
5714 }
5715
5716 return RUs;
5717}
5718
5719bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5720 ElementCount VF) {
5721 // TODO: Cost model for emulated masked load/store is completely
5722 // broken. This hack guides the cost model to use an artificially
5723 // high enough value to practically disable vectorization with such
5724 // operations, except where previously deployed legality hack allowed
5725 // using very low cost values. This is to avoid regressions coming simply
5726 // from moving "masked load/store" check from legality to cost model.
5727 // Masked Load/Gather emulation was previously never allowed.
5728 // A limited amount of Masked Store/Scatter emulation was allowed.
5730 "Expecting a scalar emulated instruction");
5731 return isa<LoadInst>(I) ||
5732 (isa<StoreInst>(I) &&
5733 NumPredStores > NumberOfStoresToPredicate);
5734}
5735
5737 // If we aren't vectorizing the loop, or if we've already collected the
5738 // instructions to scalarize, there's nothing to do. Collection may already
5739 // have occurred if we have a user-selected VF and are now computing the
5740 // expected cost for interleaving.
5741 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5742 return;
5743
5744 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5745 // not profitable to scalarize any instructions, the presence of VF in the
5746 // map will indicate that we've analyzed it already.
5747 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5748
5749 PredicatedBBsAfterVectorization[VF].clear();
5750
5751 // Find all the instructions that are scalar with predication in the loop and
5752 // determine if it would be better to not if-convert the blocks they are in.
5753 // If so, we also record the instructions to scalarize.
5754 for (BasicBlock *BB : TheLoop->blocks()) {
5756 continue;
5757 for (Instruction &I : *BB)
5758 if (isScalarWithPredication(&I, VF)) {
5759 ScalarCostsTy ScalarCosts;
5760 // Do not apply discount if scalable, because that would lead to
5761 // invalid scalarization costs.
5762 // Do not apply discount logic if hacked cost is needed
5763 // for emulated masked memrefs.
5764 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5765 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5766 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5767 // Remember that BB will remain after vectorization.
5768 PredicatedBBsAfterVectorization[VF].insert(BB);
5769 }
5770 }
5771}
5772
5773InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5774 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5775 assert(!isUniformAfterVectorization(PredInst, VF) &&
5776 "Instruction marked uniform-after-vectorization will be predicated");
5777
5778 // Initialize the discount to zero, meaning that the scalar version and the
5779 // vector version cost the same.
5780 InstructionCost Discount = 0;
5781
5782 // Holds instructions to analyze. The instructions we visit are mapped in
5783 // ScalarCosts. Those instructions are the ones that would be scalarized if
5784 // we find that the scalar version costs less.
5786
5787 // Returns true if the given instruction can be scalarized.
5788 auto canBeScalarized = [&](Instruction *I) -> bool {
5789 // We only attempt to scalarize instructions forming a single-use chain
5790 // from the original predicated block that would otherwise be vectorized.
5791 // Although not strictly necessary, we give up on instructions we know will
5792 // already be scalar to avoid traversing chains that are unlikely to be
5793 // beneficial.
5794 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5796 return false;
5797
5798 // If the instruction is scalar with predication, it will be analyzed
5799 // separately. We ignore it within the context of PredInst.
5800 if (isScalarWithPredication(I, VF))
5801 return false;
5802
5803 // If any of the instruction's operands are uniform after vectorization,
5804 // the instruction cannot be scalarized. This prevents, for example, a
5805 // masked load from being scalarized.
5806 //
5807 // We assume we will only emit a value for lane zero of an instruction
5808 // marked uniform after vectorization, rather than VF identical values.
5809 // Thus, if we scalarize an instruction that uses a uniform, we would
5810 // create uses of values corresponding to the lanes we aren't emitting code
5811 // for. This behavior can be changed by allowing getScalarValue to clone
5812 // the lane zero values for uniforms rather than asserting.
5813 for (Use &U : I->operands())
5814 if (auto *J = dyn_cast<Instruction>(U.get()))
5815 if (isUniformAfterVectorization(J, VF))
5816 return false;
5817
5818 // Otherwise, we can scalarize the instruction.
5819 return true;
5820 };
5821
5822 // Compute the expected cost discount from scalarizing the entire expression
5823 // feeding the predicated instruction. We currently only consider expressions
5824 // that are single-use instruction chains.
5825 Worklist.push_back(PredInst);
5826 while (!Worklist.empty()) {
5827 Instruction *I = Worklist.pop_back_val();
5828
5829 // If we've already analyzed the instruction, there's nothing to do.
5830 if (ScalarCosts.contains(I))
5831 continue;
5832
5833 // Compute the cost of the vector instruction. Note that this cost already
5834 // includes the scalarization overhead of the predicated instruction.
5835 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5836
5837 // Compute the cost of the scalarized instruction. This cost is the cost of
5838 // the instruction as if it wasn't if-converted and instead remained in the
5839 // predicated block. We will scale this cost by block probability after
5840 // computing the scalarization overhead.
5841 InstructionCost ScalarCost =
5842 VF.getFixedValue() *
5843 getInstructionCost(I, ElementCount::getFixed(1)).first;
5844
5845 // Compute the scalarization overhead of needed insertelement instructions
5846 // and phi nodes.
5848 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5849 ScalarCost += TTI.getScalarizationOverhead(
5850 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5851 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5852 /*Extract*/ false, CostKind);
5853 ScalarCost +=
5854 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5855 }
5856
5857 // Compute the scalarization overhead of needed extractelement
5858 // instructions. For each of the instruction's operands, if the operand can
5859 // be scalarized, add it to the worklist; otherwise, account for the
5860 // overhead.
5861 for (Use &U : I->operands())
5862 if (auto *J = dyn_cast<Instruction>(U.get())) {
5863 assert(VectorType::isValidElementType(J->getType()) &&
5864 "Instruction has non-scalar type");
5865 if (canBeScalarized(J))
5866 Worklist.push_back(J);
5867 else if (needsExtract(J, VF)) {
5868 ScalarCost += TTI.getScalarizationOverhead(
5869 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5870 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5871 /*Extract*/ true, CostKind);
5872 }
5873 }
5874
5875 // Scale the total scalar cost by block probability.
5876 ScalarCost /= getReciprocalPredBlockProb();
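    // For example, assuming a reciprocal predicated-block probability of 2
    // (i.e. the block is expected to execute on roughly half of the
    // iterations), a summed scalar cost of 8 is counted as 4 when compared
    // against the unconditional vector cost below.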
5877
5878 // Compute the discount. A non-negative discount means the vector version
5879 // of the instruction costs more, and scalarizing would be beneficial.
5880 Discount += VectorCost - ScalarCost;
5881 ScalarCosts[I] = ScalarCost;
5882 }
5883
5884 return Discount;
5885}
5886
5891
5892 // For each block.
5893 for (BasicBlock *BB : TheLoop->blocks()) {
5894 VectorizationCostTy BlockCost;
5895
5896 // For each instruction in the old loop.
5897 for (Instruction &I : BB->instructionsWithoutDebug()) {
5898 // Skip ignored values.
5899 if (ValuesToIgnore.count(&I) ||
5900 (VF.isVector() && VecValuesToIgnore.count(&I)))
5901 continue;
5902
5903 VectorizationCostTy C = getInstructionCost(&I, VF);
5904
5905 // Check if we should override the cost.
5906 if (C.first.isValid() &&
5907 ForceTargetInstructionCost.getNumOccurrences() > 0)
5909
5910 // Keep a list of instructions with invalid costs.
5911 if (Invalid && !C.first.isValid())
5912 Invalid->emplace_back(&I, VF);
5913
5914 BlockCost.first += C.first;
5915 BlockCost.second |= C.second;
5916 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5917 << " for VF " << VF << " For instruction: " << I
5918 << '\n');
5919 }
5920
5921 // If we are vectorizing a predicated block, it will have been
5922 // if-converted. This means that the block's instructions (aside from
5923 // stores and instructions that may divide by zero) will now be
5924 // unconditionally executed. For the scalar case, we may not always execute
5925 // the predicated block, if it is an if-else block. Thus, scale the block's
5926 // cost by the probability of executing it. blockNeedsPredication from
5927 // Legal is used so as to not include all blocks in tail folded loops.
5928 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5929 BlockCost.first /= getReciprocalPredBlockProb();
5930
5931 Cost.first += BlockCost.first;
5932 Cost.second |= BlockCost.second;
5933 }
5934
5935 return Cost;
5936}
5937
5938/// Gets Address Access SCEV after verifying that the access pattern
5939/// is loop invariant except for the induction variable dependence.
5940///
5941/// This SCEV can be sent to the Target in order to estimate the address
5942/// calculation cost.
5944 Value *Ptr,
5947 const Loop *TheLoop) {
5948
5949 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5950 if (!Gep)
5951 return nullptr;
5952
5953 // We are looking for a gep with all loop invariant indices except for one
5954 // which should be an induction variable.
5955 auto SE = PSE.getSE();
5956 unsigned NumOperands = Gep->getNumOperands();
5957 for (unsigned i = 1; i < NumOperands; ++i) {
5958 Value *Opd = Gep->getOperand(i);
5959 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5960 !Legal->isInductionVariable(Opd))
5961 return nullptr;
5962 }
5963
5964 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5965 return PSE.getSCEV(Ptr);
5966}
5967
5969LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5970 ElementCount VF) {
5971 assert(VF.isVector() &&
5972 "Scalarization cost of instruction implies vectorization.");
5973 if (VF.isScalable())
5975
5976 Type *ValTy = getLoadStoreType(I);
5977 auto SE = PSE.getSE();
5978
5979 unsigned AS = getLoadStoreAddressSpace(I);
5981 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5982 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5983 // that it is being called from this specific place.
5984
5985 // Figure out whether the access is strided and get the stride value
5986 // if it's known at compile time.
5987 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5988
5989 // Get the cost of the scalar memory instruction and address computation.
5991 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5992
5993 // Don't pass *I here, since it is scalar but will actually be part of a
5994 // vectorized loop where the user of it is a vectorized instruction.
5996 const Align Alignment = getLoadStoreAlignment(I);
5997 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5998 ValTy->getScalarType(),
5999 Alignment, AS, CostKind);
6000
6001 // Get the overhead of the extractelement and insertelement instructions
6002 // we might create due to scalarization.
6003 Cost += getScalarizationOverhead(I, VF, CostKind);
6004
6005 // If we have a predicated load/store, it will need extra i1 extracts and
6006 // conditional branches, but may not be executed for each vector lane. Scale
6007 // the cost by the probability of executing the predicated block.
6008 if (isPredicatedInst(I)) {
6010
6011 // Add the cost of an i1 extract and a branch
6012 auto *Vec_i1Ty =
6015 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6016 /*Insert=*/false, /*Extract=*/true, CostKind);
6017 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6018
6019 if (useEmulatedMaskMemRefHack(I, VF))
6020 // Artificially setting to a high enough value to practically disable
6021 // vectorization with such operations.
6022 Cost = 3000000;
6023 }
6024
6025 return Cost;
6026}
6027
6029LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6030 ElementCount VF) {
6031 Type *ValTy = getLoadStoreType(I);
6032 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6034 unsigned AS = getLoadStoreAddressSpace(I);
6035 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6037
6038 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6039 "Stride should be 1 or -1 for consecutive memory access");
6040 const Align Alignment = getLoadStoreAlignment(I);
6042 if (Legal->isMaskRequired(I)) {
6043 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6044 CostKind);
6045 } else {
6046 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6047 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6048 CostKind, OpInfo, I);
6049 }
6050
6051 bool Reverse = ConsecutiveStride < 0;
6052 if (Reverse)
6054 std::nullopt, CostKind, 0);
6055 return Cost;
6056}
6057
6059LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6060 ElementCount VF) {
6061 assert(Legal->isUniformMemOp(*I, VF));
6062
6063 Type *ValTy = getLoadStoreType(I);
6064 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6065 const Align Alignment = getLoadStoreAlignment(I);
6066 unsigned AS = getLoadStoreAddressSpace(I);
6068 if (isa<LoadInst>(I)) {
6069 return TTI.getAddressComputationCost(ValTy) +
6070 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6071 CostKind) +
6073 }
6074 StoreInst *SI = cast<StoreInst>(I);
6075
6076 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6077 return TTI.getAddressComputationCost(ValTy) +
6078 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6079 CostKind) +
6080 (isLoopInvariantStoreValue
6081 ? 0
6082 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6083 CostKind, VF.getKnownMinValue() - 1));
6084}
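// For illustration: a uniform load like 'x = *p' becomes one scalar load
// plus a broadcast into a vector, while a uniform store '*p = y' only needs
// an extract of the last vector lane (index VF - 1 above) when the stored
// value is not loop invariant.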
6085
6087LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6088 ElementCount VF) {
6089 Type *ValTy = getLoadStoreType(I);
6090 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6091 const Align Alignment = getLoadStoreAlignment(I);
6093
6094 return TTI.getAddressComputationCost(VectorTy) +
6096 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6098}
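// For example, an indexed access such as A[B[i]] has a vector of pointers,
// so its cost is one vector address computation plus a gather (or a scatter
// for stores), masked when the access is predicated.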
6099
6101LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6102 ElementCount VF) {
6103 Type *ValTy = getLoadStoreType(I);
6104 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6105 unsigned AS = getLoadStoreAddressSpace(I);
6107
6108 auto Group = getInterleavedAccessGroup(I);
6109 assert(Group && "Fail to get an interleaved access group.");
6110
6111 unsigned InterleaveFactor = Group->getFactor();
6112 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6113
6114 // Holds the indices of existing members in the interleaved group.
6116 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6117 if (Group->getMember(IF))
6118 Indices.push_back(IF);
6119
6120 // Calculate the cost of the whole interleaved group.
6121 bool UseMaskForGaps =
6122 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6123 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6125 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6126 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6127
6128 if (Group->isReverse()) {
6129 // TODO: Add support for reversed masked interleaved access.
6131 "Reverse masked interleaved access not supported.");
6132 Cost += Group->getNumMembers() *
6134 std::nullopt, CostKind, 0);
6135 }
6136 return Cost;
6137}
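// Illustrative example: for a factor-2 group such as
//   for (i = 0; i < n; ++i) sum += A[2*i] + A[2*i+1];
// at VF = 4, WideVecTy covers 8 elements spanning both members. A store
// group with missing members, or a group that needs a scalar epilogue when
// none is allowed, sets UseMaskForGaps, and a reversed group pays one extra
// reverse shuffle per member.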
6138
6139std::optional<InstructionCost>
6140LoopVectorizationCostModel::getReductionPatternCost(
6141 Instruction *I, ElementCount VF, Type *Ty,
6143 using namespace llvm::PatternMatch;
6144 // Early exit for no inloop reductions
6145 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6146 return std::nullopt;
6147 auto *VectorTy = cast<VectorType>(Ty);
6148
6149 // We are looking for one of the following patterns, and for its minimal acceptable cost:
6150 // reduce(mul(ext(A), ext(B))) or
6151 // reduce(mul(A, B)) or
6152 // reduce(ext(A)) or
6153 // reduce(A).
6154 // The basic idea is that we walk down the tree to do that, finding the root
6155 // reduction instruction in InLoopReductionImmediateChains. From there we find
6156 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6157 // of the components. If the reduction cost is lower, then we return it for
6158 // the reduction instruction and 0 for the other instructions in the pattern.
6159 // If it is not, we return an invalid cost specifying that the original cost
6160 // method should be used.
6161 Instruction *RetI = I;
6162 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6163 if (!RetI->hasOneUser())
6164 return std::nullopt;
6165 RetI = RetI->user_back();
6166 }
6167
6168 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6169 RetI->user_back()->getOpcode() == Instruction::Add) {
6170 RetI = RetI->user_back();
6171 }
6172
6173 // Test if the found instruction is a reduction, and if not, return an invalid
6174 // cost signalling the parent to use the original cost modelling.
6175 if (!InLoopReductionImmediateChains.count(RetI))
6176 return std::nullopt;
6177
6178 // Find the reduction this chain is a part of and calculate the basic cost of
6179 // the reduction on its own.
6180 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6181 Instruction *ReductionPhi = LastChain;
6182 while (!isa<PHINode>(ReductionPhi))
6183 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6184
6185 const RecurrenceDescriptor &RdxDesc =
6186 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6187
6189 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6190
6191 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6192 // normal fmul instruction to the cost of the fadd reduction.
6193 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6194 BaseCost +=
6195 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6196
6197 // If we're using ordered reductions then we can just return the base cost
6198 // here, since getArithmeticReductionCost calculates the full ordered
6199 // reduction cost when FP reassociation is not allowed.
6200 if (useOrderedReductions(RdxDesc))
6201 return BaseCost;
6202
6203 // Get the operand that was not the reduction chain and match it to one of the
6204 // patterns, returning the better cost if it is found.
6205 Instruction *RedOp = RetI->getOperand(1) == LastChain
6206 ? dyn_cast<Instruction>(RetI->getOperand(0))
6207 : dyn_cast<Instruction>(RetI->getOperand(1));
6208
6209 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6210
6211 Instruction *Op0, *Op1;
6212 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6213 match(RedOp,
6215 match(Op0, m_ZExtOrSExt(m_Value())) &&
6216 Op0->getOpcode() == Op1->getOpcode() &&
6217 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6219 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6220
6221 // Matched reduce.add(ext(mul(ext(A), ext(B))))
6222 // Note that the extend opcodes need to all match, or if A==B they will have
6223 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6224 // which is equally fine.
6225 bool IsUnsigned = isa<ZExtInst>(Op0);
6226 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6227 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6228
6229 InstructionCost ExtCost =
6230 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6232 InstructionCost MulCost =
6233 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6234 InstructionCost Ext2Cost =
6235 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6237
6239 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6240
6241 if (RedCost.isValid() &&
6242 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6243 return I == RetI ? RedCost : 0;
6244 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6245 !TheLoop->isLoopInvariant(RedOp)) {
6246 // Matched reduce(ext(A))
6247 bool IsUnsigned = isa<ZExtInst>(RedOp);
6248 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6250 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6251 RdxDesc.getFastMathFlags(), CostKind);
6252
6253 InstructionCost ExtCost =
6254 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6256 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6257 return I == RetI ? RedCost : 0;
6258 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6259 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6260 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6261 Op0->getOpcode() == Op1->getOpcode() &&
6263 bool IsUnsigned = isa<ZExtInst>(Op0);
6264 Type *Op0Ty = Op0->getOperand(0)->getType();
6265 Type *Op1Ty = Op1->getOperand(0)->getType();
6266 Type *LargestOpTy =
6267 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6268 : Op0Ty;
6269 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6270
6271 // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
6272 // different sizes. We take the largest type as the ext to reduce, and add
6273 // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6275 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6278 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6280 InstructionCost MulCost =
6281 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6282
6284 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6285 InstructionCost ExtraExtCost = 0;
6286 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6287 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6288 ExtraExtCost = TTI.getCastInstrCost(
6289 ExtraExtOp->getOpcode(), ExtType,
6290 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6292 }
6293
6294 if (RedCost.isValid() &&
6295 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6296 return I == RetI ? RedCost : 0;
6297 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6298 // Matched reduce.add(mul())
6299 InstructionCost MulCost =
6300 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6301
6303 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6304
6305 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6306 return I == RetI ? RedCost : 0;
6307 }
6308 }
6309
6310 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6311}
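// Illustrative example: with i8 arrays a and b, the loop
//   for (i = 0; i < n; ++i) sum += (int)a[i] * (int)b[i];
// matches reduce.add(mul(ext(A), ext(B))). If the target reports a cheaper
// extending multiply-add reduction (e.g. a dot-product style instruction)
// than the separate ext + mul + reduce costs, the reduction root is charged
// RedCost and the ext/mul instructions in the pattern are charged 0;
// otherwise the ordinary per-instruction costing applies.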
6312
6314LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6315 ElementCount VF) {
6316 // Calculate scalar cost only. Vectorization cost should be ready at this
6317 // moment.
6318 if (VF.isScalar()) {
6319 Type *ValTy = getLoadStoreType(I);
6320 const Align Alignment = getLoadStoreAlignment(I);
6321 unsigned AS = getLoadStoreAddressSpace(I);
6322
6323 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6324 return TTI.getAddressComputationCost(ValTy) +
6325 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6326 TTI::TCK_RecipThroughput, OpInfo, I);
6327 }
6328 return getWideningCost(I, VF);
6329}
6330
6332LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6333 ElementCount VF) {
6334 // If we know that this instruction will remain uniform, check the cost of
6335 // the scalar version.
6337 VF = ElementCount::getFixed(1);
6338
6339 if (VF.isVector() && isProfitableToScalarize(I, VF))
6340 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6341
6342 // Forced scalars do not have any scalarization overhead.
6343 auto ForcedScalar = ForcedScalars.find(VF);
6344 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6345 auto InstSet = ForcedScalar->second;
6346 if (InstSet.count(I))
6347 return VectorizationCostTy(
6348 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6349 VF.getKnownMinValue()),
6350 false);
6351 }
6352
6353 Type *VectorTy;
6354 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6355
6356 bool TypeNotScalarized = false;
6357 if (VF.isVector() && VectorTy->isVectorTy()) {
6358 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6359 if (VF.isScalable())
6360 // <vscale x 1 x iN> is assumed to be profitable over iN because
6361 // scalable registers are a distinct register class from scalar ones.
6362 // If we ever find a target which wants to lower scalable vectors
6363 // back to scalars, we'll need to update this code to explicitly
6364 // ask TTI about the register class uses for each part.
6365 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6366 else
6367 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6368 } else
6370 }
6371 return VectorizationCostTy(C, TypeNotScalarized);
6372}
6373
6374InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6376
6377 // There is no mechanism yet to create a scalable scalarization loop,
6378 // so this is currently Invalid.
6379 if (VF.isScalable())
6380 return InstructionCost::getInvalid();
6381
6382 if (VF.isScalar())
6383 return 0;
6384
6386 Type *RetTy = ToVectorTy(I->getType(), VF);
6387 if (!RetTy->isVoidTy() &&
6388 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6390 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6391 /*Insert*/ true,
6392 /*Extract*/ false, CostKind);
6393
6394 // Some targets keep addresses scalar.
6395 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6396 return Cost;
6397
6398 // Some targets support efficient element stores.
6399 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6400 return Cost;
6401
6402 // Collect operands to consider.
6403 CallInst *CI = dyn_cast<CallInst>(I);
6404 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6405
6406 // Skip operands that do not require extraction/scalarization and do not incur
6407 // any overhead.
6409 for (auto *V : filterExtractingOperands(Ops, VF))
6410 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6412 filterExtractingOperands(Ops, VF), Tys, CostKind);
6413}
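// Rough sketch of the overhead being modelled: scalarizing an instruction at
// VF = 4 may need up to 4 inserts to rebuild its vector result and up to 4
// extracts per vector operand; the early returns above drop one side of that
// when the target keeps addresses scalar or supports efficient vector
// element loads/stores.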
6414
6416 if (VF.isScalar())
6417 return;
6418 NumPredStores = 0;
6419 for (BasicBlock *BB : TheLoop->blocks()) {
6420 // For each instruction in the old loop.
6421 for (Instruction &I : *BB) {
6422 Value *Ptr = getLoadStorePointerOperand(&I);
6423 if (!Ptr)
6424 continue;
6425
6426 // TODO: We should generate better code and update the cost model for
6427 // predicated uniform stores. Today they are treated as any other
6428 // predicated store (see added test cases in
6429 // invariant-store-vectorization.ll).
6430 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6431 NumPredStores++;
6432
6433 if (Legal->isUniformMemOp(I, VF)) {
6434 auto isLegalToScalarize = [&]() {
6435 if (!VF.isScalable())
6436 // Scalarization of fixed length vectors "just works".
6437 return true;
6438
6439 // We have dedicated lowering for unpredicated uniform loads and
6440 // stores. Note that even with tail folding we know that at least
6441 // one lane is active (i.e. generalized predication is not possible
6442 // here), and the logic below depends on this fact.
6443 if (!foldTailByMasking())
6444 return true;
6445
6446 // For scalable vectors, a uniform memop load is always
6447 // uniform-by-parts and we know how to scalarize that.
6448 if (isa<LoadInst>(I))
6449 return true;
6450
6451 // A uniform store isn't necessarily uniform-by-parts
6452 // and we can't assume scalarization.
6453 auto &SI = cast<StoreInst>(I);
6454 return TheLoop->isLoopInvariant(SI.getValueOperand());
6455 };
6456
6457 const InstructionCost GatherScatterCost =
6459 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6460
6461 // Load: Scalar load + broadcast
6462 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6463 // FIXME: This cost is a significant under-estimate for tail folded
6464 // memory ops.
6465 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6466 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6467
6468 // Choose the better solution for the current VF. Note that Invalid
6469 // costs compare as maximally large. If both are invalid, we get a
6470 // scalable invalid cost, which signals a failure and a vectorization abort.
6471 if (GatherScatterCost < ScalarizationCost)
6472 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6473 else
6474 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6475 continue;
6476 }
6477
6478 // We assume that widening is the best solution when possible.
6479 if (memoryInstructionCanBeWidened(&I, VF)) {
6480 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6481 int ConsecutiveStride = Legal->isConsecutivePtr(
6483 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6484 "Expected consecutive stride.");
6485 InstWidening Decision =
6486 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6487 setWideningDecision(&I, VF, Decision, Cost);
6488 continue;
6489 }
6490
6491 // Choose between Interleaving, Gather/Scatter or Scalarization.
6493 unsigned NumAccesses = 1;
6494 if (isAccessInterleaved(&I)) {
6495 auto Group = getInterleavedAccessGroup(&I);
6496 assert(Group && "Fail to get an interleaved access group.");
6497
6498 // Make one decision for the whole group.
6499 if (getWideningDecision(&I, VF) != CM_Unknown)
6500 continue;
6501
6502 NumAccesses = Group->getNumMembers();
6504 InterleaveCost = getInterleaveGroupCost(&I, VF);
6505 }
6506
6507 InstructionCost GatherScatterCost =
6509 ? getGatherScatterCost(&I, VF) * NumAccesses
6511
6512 InstructionCost ScalarizationCost =
6513 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6514
6515 // Choose the better solution for the current VF,
6516 // record this decision, and use it during vectorization.
6518 InstWidening Decision;
6519 if (InterleaveCost <= GatherScatterCost &&
6520 InterleaveCost < ScalarizationCost) {
6521 Decision = CM_Interleave;
6522 Cost = InterleaveCost;
6523 } else if (GatherScatterCost < ScalarizationCost) {
6524 Decision = CM_GatherScatter;
6525 Cost = GatherScatterCost;
6526 } else {
6527 Decision = CM_Scalarize;
6528 Cost = ScalarizationCost;
6529 }
6530 // If the instruction belongs to an interleave group, the whole group
6531 // receives the same decision. The whole group receives the cost, but
6532 // the cost will actually be assigned to one instruction.
6533 if (auto Group = getInterleavedAccessGroup(&I))
6534 setWideningDecision(Group, VF, Decision, Cost);
6535 else
6536 setWideningDecision(&I, VF, Decision, Cost);
6537 }
6538 }
6539
6540 // Make sure that any load of an address and any other address computation
6541 // remains scalar unless there is gather/scatter support. This avoids
6542 // inevitable extracts into address registers, and also has the benefit of
6543 // activating LSR more, since that pass can't optimize vectorized
6544 // addresses.
6546 return;
6547
6548 // Start with all scalar pointer uses.
6550 for (BasicBlock *BB : TheLoop->blocks())
6551 for (Instruction &I : *BB) {
6552 Instruction *PtrDef =
6553 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6554 if (PtrDef && TheLoop->contains(PtrDef) &&
6556 AddrDefs.insert(PtrDef);
6557 }
6558
6559 // Add all instructions used to generate the addresses.
6561 append_range(Worklist, AddrDefs);
6562 while (!Worklist.empty()) {
6563 Instruction *I = Worklist.pop_back_val();
6564 for (auto &Op : I->operands())
6565 if (auto *InstOp = dyn_cast<Instruction>(Op))
6566 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6567 AddrDefs.insert(InstOp).second)
6568 Worklist.push_back(InstOp);
6569 }
6570
6571 for (auto *I : AddrDefs) {
6572 if (isa<LoadInst>(I)) {
6573 // Setting the desired widening decision should ideally be handled
6574 // by cost functions, but since this involves the task of finding out
6575 // if the loaded register is involved in an address computation, it is
6576 // instead changed here when we know this is the case.
6577 InstWidening Decision = getWideningDecision(I, VF);
6578 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6579 // Scalarize a widened load of address.
6581 I, VF, CM_Scalarize,
6582 (VF.getKnownMinValue() *
6583 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6584 else if (auto Group = getInterleavedAccessGroup(I)) {
6585 // Scalarize an interleave group of address loads.
6586 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6587 if (Instruction *Member = Group->getMember(I))
6589 Member, VF, CM_Scalarize,
6590 (VF.getKnownMinValue() *
6591 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6592 }
6593 }
6594 } else
6595 // Make sure I gets scalarized and gets a cost estimate without
6596 // scalarization overhead.
6597 ForcedScalars[VF].insert(I);
6598 }
6599}
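// Summary of the decision order implemented above: uniform memory ops pick
// the cheaper of scalarization and gather/scatter; accesses that can be
// widened always widen (possibly reversed); the rest pick the cheapest of
// interleaving, gather/scatter and scalarization, with an interleave vs.
// gather-scatter tie going to interleaving; finally, as the comment above
// notes, address loads and computations are kept scalar unless there is
// gather/scatter support.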
6600
6602 assert(!VF.isScalar() &&
6603 "Trying to set a vectorization decision for a scalar VF");
6604
6605 for (BasicBlock *BB : TheLoop->blocks()) {
6606 // For each instruction in the old loop.
6607 for (Instruction &I : *BB) {
6608 CallInst *CI = dyn_cast<CallInst>(&I);
6609
6610 if (!CI)
6611 continue;
6612
6617
6618 Function *ScalarFunc = CI->getCalledFunction();
6619 Type *ScalarRetTy = CI->getType();
6620 SmallVector<Type *, 4> Tys, ScalarTys;
6621 bool MaskRequired = Legal->isMaskRequired(CI);
6622 for (auto &ArgOp : CI->args())
6623 ScalarTys.push_back(ArgOp->getType());
6624
6625 // Compute corresponding vector type for return value and arguments.
6626 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6627 for (Type *ScalarTy : ScalarTys)
6628 Tys.push_back(ToVectorTy(ScalarTy, VF));
6629
6630 // An in-loop reduction using an fmuladd intrinsic is a special case;
6631 // we don't want the normal cost for that intrinsic.
6633 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6636 std::nullopt, *RedCost);
6637 continue;
6638 }
6639
6640 // Estimate cost of scalarized vector call. The source operands are
6641 // assumed to be vectors, so we need to extract individual elements from
6642 // there, execute VF scalar calls, and then gather the result into the
6643 // vector return value.
6644 InstructionCost ScalarCallCost =
6645 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6646
6647 // Compute costs of unpacking argument values for the scalar calls and
6648 // packing the return values to a vector.
6649 InstructionCost ScalarizationCost =
6650 getScalarizationOverhead(CI, VF, CostKind);
6651
6652 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6653
6654 // Find the cost of vectorizing the call, if we can find a suitable
6655 // vector variant of the function.
6656 bool UsesMask = false;
6657 VFInfo FuncInfo;
6658 Function *VecFunc = nullptr;
6659 // Search through any available variants for one we can use at this VF.
6660 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6661 // Must match requested VF.
6662 if (Info.Shape.VF != VF)
6663 continue;
6664
6665 // Must take a mask argument if one is required
6666 if (MaskRequired && !Info.isMasked())
6667 continue;
6668
6669 // Check that all parameter kinds are supported
6670 bool ParamsOk = true;
6671 for (VFParameter Param : Info.Shape.Parameters) {
6672 switch (Param.ParamKind) {
6674 break;
6676 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6677 // Make sure the scalar parameter in the loop is invariant.
6678 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6679 TheLoop))
6680 ParamsOk = false;
6681 break;
6682 }
6684 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6685 // Find the stride for the scalar parameter in this loop and see if
6686 // it matches the stride for the variant.
6687 // TODO: do we need to figure out the cost of an extract to get the
6688 // first lane? Or do we hope that it will be folded away?
6689 ScalarEvolution *SE = PSE.getSE();
6690 const auto *SAR =
6691 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6692
6693 if (!SAR || SAR->getLoop() != TheLoop) {
6694 ParamsOk = false;
6695 break;
6696 }
6697
6698 const SCEVConstant *Step =
6699 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6700
6701 if (!Step ||
6702 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6703 ParamsOk = false;
6704
6705 break;
6706 }
6708 UsesMask = true;
6709 break;
6710 default:
6711 ParamsOk = false;
6712 break;
6713 }
6714 }
6715
6716 if (!ParamsOk)
6717 continue;
6718
6719 // Found a suitable candidate, stop here.
6720 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6721 FuncInfo = Info;
6722 break;
6723 }
6724
6725 // Add in the cost of synthesizing a mask if one wasn't required.
6726 InstructionCost MaskCost = 0;
6727 if (VecFunc && UsesMask && !MaskRequired)
6728 MaskCost = TTI.getShuffleCost(
6731 VecFunc->getFunctionType()->getContext()),
6732 VF));
6733
6734 if (TLI && VecFunc && !CI->isNoBuiltin())
6735 VectorCost =
6736 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6737
6738 // Find the cost of an intrinsic; some targets may have instructions that
6739 // perform the operation without needing an actual call.
6741 if (IID != Intrinsic::not_intrinsic)
6742 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6743
6744 InstructionCost Cost = ScalarCost;
6745 InstWidening Decision = CM_Scalarize;
6746
6747 if (VectorCost <= Cost) {
6748 Cost = VectorCost;
6749 Decision = CM_VectorCall;
6750 }
6751
6752 if (IntrinsicCost <= Cost) {
6753 Cost = IntrinsicCost;
6754 Decision = CM_IntrinsicCall;
6755 }
6756
6757 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6759 }
6760 }
6761}
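// Illustrative example: for a call such as 'y[i] = foo(x[i])' the loop above
// compares three candidates: VF scalar calls plus packing/unpacking of the
// operands and result (CM_Scalarize), a suitable vector variant found via
// VFDatabase, plus a synthesized all-true mask if only a masked variant
// exists (CM_VectorCall), and an intrinsic lowering when the call maps to
// one (CM_IntrinsicCall); the cheapest wins, with ties resolved in favor of
// the vector call and then the intrinsic.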
6762
6764LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6765 Type *&VectorTy) {
6766 Type *RetTy = I->getType();
6768 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6769 auto SE = PSE.getSE();
6771
6772 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6773 ElementCount VF) -> bool {
6774 if (VF.isScalar())
6775 return true;
6776
6777 auto Scalarized = InstsToScalarize.find(VF);
6778 assert(Scalarized != InstsToScalarize.end() &&
6779 "VF not yet analyzed for scalarization profitability");
6780 return !Scalarized->second.count(I) &&
6781 llvm::all_of(I->users(), [&](User *U) {
6782 auto *UI = cast<Instruction>(U);
6783 return !Scalarized->second.count(UI);
6784 });
6785 };
6786 (void) hasSingleCopyAfterVectorization;
6787
6788 if (isScalarAfterVectorization(I, VF)) {
6789 // With the exception of GEPs and PHIs, after scalarization there should
6790 // only be one copy of the instruction generated in the loop. This is
6791 // because the VF is either 1, or any instructions that need scalarizing
6792 // have already been dealt with by the time we get here. As a result,
6793 // we don't have to multiply the instruction cost by VF.
6794 assert(I->getOpcode() == Instruction::GetElementPtr ||
6795 I->getOpcode() == Instruction::PHI ||
6796 (I->getOpcode() == Instruction::BitCast &&
6797 I->getType()->isPointerTy()) ||
6798 hasSingleCopyAfterVectorization(I, VF));
6799 VectorTy = RetTy;
6800 } else
6801 VectorTy = ToVectorTy(RetTy, VF);
6802
6803 // TODO: We need to estimate the cost of intrinsic calls.
6804 switch (I->getOpcode()) {
6805 case Instruction::GetElementPtr:
6806 // We mark this instruction as zero-cost because the cost of GEPs in
6807 // vectorized code depends on whether the corresponding memory instruction
6808 // is scalarized or not. Therefore, we handle GEPs with the memory
6809 // instruction cost.
6810 return 0;
6811 case Instruction::Br: {
6812 // In cases of scalarized and predicated instructions, there will be VF
6813 // predicated blocks in the vectorized loop. Each branch around these
6814 // blocks also requires an extract of its vector compare i1 element.
6815 bool ScalarPredicatedBB = false;
6816 BranchInst *BI = cast<BranchInst>(I);
6817 if (VF.isVector() && BI->isConditional() &&
6818 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6819 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6820 ScalarPredicatedBB = true;
6821
6822 if (ScalarPredicatedBB) {
6823 // It is not possible to scalarize a scalable vector with predicated instructions.
6824 if (VF.isScalable())
6825 return InstructionCost::getInvalid();
6826 // Return cost for branches around scalarized and predicated blocks.
6827 auto *Vec_i1Ty =
6828 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6829 return (
6831 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6832 /*Insert*/ false, /*Extract*/ true, CostKind) +
6833 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6834 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6835 // The back-edge branch will remain, as will all scalar branches.
6836 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6837 else
6838 // This branch will be eliminated by if-conversion.
6839 return 0;
6840 // Note: We currently assume zero cost for an unconditional branch inside
6841 // a predicated block since it will become a fall-through, although we
6842 // may decide in the future to call TTI for all branches.
6843 }
6844 case Instruction::PHI: {
6845 auto *Phi = cast<PHINode>(I);
6846
6847 // First-order recurrences are replaced by vector shuffles inside the loop.
6848 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6850 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6852 cast<VectorType>(VectorTy), Mask, CostKind,
6853 VF.getKnownMinValue() - 1);
6854 }
6855
6856 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6857 // converted into select instructions. We require N - 1 selects per phi
6858 // node, where N is the number of incoming values.
6859 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6860 return (Phi->getNumIncomingValues() - 1) *
6862 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6863 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6865
6866 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6867 }
6868 case Instruction::UDiv:
6869 case Instruction::SDiv:
6870 case Instruction::URem:
6871 case Instruction::SRem:
6872 if (VF.isVector() && isPredicatedInst(I)) {
6873 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6874 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6875 ScalarCost : SafeDivisorCost;
6876 }
6877 // We've proven all lanes safe to speculate, fall through.
6878 [[fallthrough]];
6879 case Instruction::Add:
6880 case Instruction::FAdd:
6881 case Instruction::Sub:
6882 case Instruction::FSub:
6883 case Instruction::Mul:
6884 case Instruction::FMul:
6885 case Instruction::FDiv:
6886 case Instruction::FRem:
6887 case Instruction::Shl:
6888 case Instruction::LShr:
6889 case Instruction::AShr:
6890 case Instruction::And:
6891 case Instruction::Or:
6892 case Instruction::Xor: {
6893 // If we're speculating on the stride being 1, the multiplication may
6894 // fold away. We can generalize this for all operations using the notion
6895 // of neutral elements. (TODO)
6896 if (I->getOpcode() == Instruction::Mul &&
6897 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6898 PSE.getSCEV(I->getOperand(1))->isOne()))
6899 return 0;
6900
6901 // Detect reduction patterns
6902 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6903 return *RedCost;
6904
6905 // Certain instructions can be cheaper to vectorize if they have a constant
6906 // second vector operand. One example of this is shifts on x86.
6907 Value *Op2 = I->getOperand(1);
6908 auto Op2Info = TTI.getOperandInfo(Op2);
6909 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6910 Legal->isInvariant(Op2))
6912
6913 SmallVector<const Value *, 4> Operands(I->operand_values());
6915 I->getOpcode(), VectorTy, CostKind,
6916 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6917 Op2Info, Operands, I, TLI);
6918 }
6919 case Instruction::FNeg: {
6921 I->getOpcode(), VectorTy, CostKind,
6922 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6923 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6924 I->getOperand(0), I);
6925 }
6926 case Instruction::Select: {
6927 SelectInst *SI = cast<SelectInst>(I);
6928 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6929 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6930
6931 const Value *Op0, *Op1;
6932 using namespace llvm::PatternMatch;
6933 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6934 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6935 // select x, y, false --> x & y
6936 // select x, true, y --> x | y
6937 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6938 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6939 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6940 Op1->getType()->getScalarSizeInBits() == 1);
6941
6944 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6945 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6946 }
6947
6948 Type *CondTy = SI->getCondition()->getType();
6949 if (!ScalarCond)
6950 CondTy = VectorType::get(CondTy, VF);
6951
6953 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6954 Pred = Cmp->getPredicate();
6955 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6956 CostKind, I);
6957 }
6958 case Instruction::ICmp:
6959 case Instruction::FCmp: {
6960 Type *ValTy = I->getOperand(0)->getType();
6961 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6962 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6963 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6964 VectorTy = ToVectorTy(ValTy, VF);
6965 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6966 cast<CmpInst>(I)->getPredicate(), CostKind,
6967 I);
6968 }
6969 case Instruction::Store:
6970 case Instruction::Load: {
6971 ElementCount Width = VF;
6972 if (Width.isVector()) {
6973 InstWidening Decision = getWideningDecision(I, Width);
6974 assert(Decision != CM_Unknown &&
6975 "CM decision should be taken at this point");
6978 if (Decision == CM_Scalarize)
6979 Width = ElementCount::getFixed(1);
6980 }
6981 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6982 return getMemoryInstructionCost(I, VF);
6983 }
6984 case Instruction::BitCast:
6985 if (I->getType()->isPointerTy())
6986 return 0;
6987 [[fallthrough]];
6988 case Instruction::ZExt:
6989 case Instruction::SExt:
6990 case Instruction::FPToUI:
6991 case Instruction::FPToSI:
6992 case Instruction::FPExt:
6993 case Instruction::PtrToInt:
6994 case Instruction::IntToPtr:
6995 case Instruction::SIToFP:
6996 case Instruction::UIToFP:
6997 case Instruction::Trunc:
6998 case Instruction::FPTrunc: {
6999 // Computes the CastContextHint from a Load/Store instruction.
7000 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7001 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7002 "Expected a load or a store!");
7003
7004 if (VF.isScalar() || !TheLoop->contains(I))
7006
7007 switch (getWideningDecision(I, VF)) {
7019 llvm_unreachable("Instr did not go through cost modelling?");
7022 llvm_unreachable_internal("Instr has invalid widening decision");
7023 }
7024
7025 llvm_unreachable("Unhandled case!");
7026 };
7027
7028 unsigned Opcode = I->getOpcode();
7030 // For Trunc, the context is the only user, which must be a StoreInst.
7031 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7032 if (I->hasOneUse())
7033 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7034 CCH = ComputeCCH(Store);
7035 }
7036 // For Z/Sext, the context is the operand, which must be a LoadInst.
7037 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7038 Opcode == Instruction::FPExt) {
7039 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7040 CCH = ComputeCCH(Load);
7041 }
7042
7043 // We optimize the truncation of induction variables having constant
7044 // integer steps. The cost of these truncations is the same as the scalar
7045 // operation.
7046 if (isOptimizableIVTruncate(I, VF)) {
7047 auto *Trunc = cast<TruncInst>(I);
7048 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7049 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7050 }
7051
7052 // Detect reduction patterns
7053 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7054 return *RedCost;
7055
7056 Type *SrcScalarTy = I->getOperand(0)->getType();
7057 Type *SrcVecTy =
7058 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7060 // This cast is going to be shrunk. This may remove the cast or it might
7061 // turn it into a slightly different cast. For example, if MinBW == 16,
7062 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7063 //
7064 // Calculate the modified src and dest types.
7065 Type *MinVecTy = VectorTy;
7066 if (Opcode == Instruction::Trunc) {
7067 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7068 VectorTy =
7069 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7070 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7071 // Leave SrcVecTy unchanged - we only shrink the destination element
7072 // type.
7073 VectorTy =
7074 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7075 }
7076 }
7077
7078 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7079 }
7080 case Instruction::Call:
7081 return getVectorCallCost(cast<CallInst>(I), VF);
7082 case Instruction::ExtractValue:
7084 case Instruction::Alloca:
7085 // We cannot easily widen alloca to a scalable alloca, as
7086 // the result would need to be a vector of pointers.
7087 if (VF.isScalable())
7088 return InstructionCost::getInvalid();
7089 [[fallthrough]];
7090 default:
7091 // This opcode is unknown. Assume that it is the same as 'mul'.
7092 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7093 } // end of switch.
7094}
7095
7097 // Ignore ephemeral values.
7099
7100 // Find all stores to invariant variables. Since they are going to sink
7101 // outside the loop, we do not need to calculate the cost for them.
7102 for (BasicBlock *BB : TheLoop->blocks())
7103 for (Instruction &I : *BB) {
7104 StoreInst *SI;
7105 if ((SI = dyn_cast<StoreInst>(&I)) &&
7106 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7107 ValuesToIgnore.insert(&I);
7108 }
7109
7110 // Ignore type-promoting instructions we identified during reduction
7111 // detection.
7112 for (const auto &Reduction : Legal->getReductionVars()) {
7113 const RecurrenceDescriptor &RedDes = Reduction.second;
7114 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7115 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7116 }
7117 // Ignore type-casting instructions we identified during induction
7118 // detection.
7119 for (const auto &Induction : Legal->getInductionVars()) {
7120 const InductionDescriptor &IndDes = Induction.second;
7121 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7122 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7123 }
7124}
7125
7127 for (const auto &Reduction : Legal->getReductionVars()) {
7128 PHINode *Phi = Reduction.first;
7129 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7130
7131 // We don't collect reductions that are type promoted (yet).
7132 if (RdxDesc.getRecurrenceType() != Phi->getType())
7133 continue;
7134
7135 // If the target would prefer this reduction to happen "in-loop", then we
7136 // want to record it as such.
7137 unsigned Opcode = RdxDesc.getOpcode();
7138 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7139 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7141 continue;
7142
7143 // Check that we can correctly put the reductions into the loop, by
7144 // finding the chain of operations that leads from the phi to the loop
7145 // exit value.
7146 SmallVector<Instruction *, 4> ReductionOperations =
7147 RdxDesc.getReductionOpChain(Phi, TheLoop);
7148 bool InLoop = !ReductionOperations.empty();
7149
7150 if (InLoop) {
7151 InLoopReductions.insert(Phi);
7152 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7153 Instruction *LastChain = Phi;
7154 for (auto *I : ReductionOperations) {
7155 InLoopReductionImmediateChains[I] = LastChain;
7156 LastChain = I;
7157 }
7158 }
7159 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7160 << " reduction for phi: " << *Phi << "\n");
7161 }
7162}
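// Illustrative example: for
//   for (i = 0; i < n; ++i) sum += a[i];
// an in-loop reduction keeps a scalar accumulator phi and performs a vector
// reduce-add inside the loop body each iteration, whereas the default
// out-of-loop form keeps a vector accumulator and reduces it once after the
// loop; the chain map filled above lets the cost model walk from each add
// back to its reduction phi.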
7163
7165 DebugLoc DL, const Twine &Name) {
7167 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7168 return tryInsertInstruction(
7169 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7170}
7171
7172// This function will select a scalable VF if the target supports scalable
7173// vectors and a fixed one otherwise.
7174// TODO: we could return a pair of values that specify the max VF and
7175// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7176 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7177// doesn't have a cost model that can choose which plan to execute if
7178// more than one is generated.
7181 unsigned WidestType;
7182 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7183
7188
7190 unsigned N = RegSize.getKnownMinValue() / WidestType;
7191 return ElementCount::get(N, RegSize.isScalable());
7192}
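// For example, with a 128-bit vector register and a widest loop type of 32
// bits this returns an element count of 4, scalable if the register size is
// scalable, matching the RegSize / WidestType computation above.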
7193
7196 ElementCount VF = UserVF;
7197 // Outer loop handling: outer loops may require CFG and instruction level
7198 // transformations before even evaluating whether vectorization is profitable.
7199 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7200 // the vectorization pipeline.
7201 if (!OrigLoop->isInnermost()) {
7202 // If the user doesn't provide a vectorization factor, determine a
7203 // reasonable one.
7204 if (UserVF.isZero()) {
7205 VF = determineVPlanVF(TTI, CM);
7206 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7207
7208 // Make sure we have a VF > 1 for stress testing.
7209 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7210 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7211 << "overriding computed VF.\n");
7212 VF = ElementCount::getFixed(4);
7213 }
7214 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7217 << "not supported by the target.\n");
7219 "Scalable vectorization requested but not supported by the target",
7220 "the scalable user-specified vectorization width for outer-loop "
7221 "vectorization cannot be used because the target does not support "
7222 "scalable vectors.",
7223 "ScalableVFUnfeasible", ORE, OrigLoop);
7225 }
7226 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7228 "VF needs to be a power of two");
7229 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7230 << "VF " << VF << " to build VPlans.\n");
7231 buildVPlans(VF, VF);
7232
7233 // For VPlan build stress testing, we bail out after VPlan construction.
7236
7237 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7238 }
7239
7240 LLVM_DEBUG(
7241 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7242 "VPlan-native path.\n");
7244}
7245
7246std::optional<VectorizationFactor>
7248 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7251
7252 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7253 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7254 return std::nullopt;
7255
7256 // Invalidate interleave groups if all blocks of the loop will be predicated.
7257 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7259 LLVM_DEBUG(
7260 dbgs()
7261 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7262 "which requires masked-interleaved support.\n");
7264 // Invalidating interleave groups also requires invalidating all decisions
7265 // based on them, which includes widening decisions and uniform and scalar
7266 // values.
7268 }
7269
7270 ElementCount MaxUserVF =
7271 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7272 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7273 if (!UserVF.isZero() && UserVFIsLegal) {
7275 "VF needs to be a power of two");
7276 // Collect the instructions (and their associated costs) that will be more
7277 // profitable to scalarize.
7279 if (CM.selectUserVectorizationFactor(UserVF)) {
7280 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7281 buildVPlansWithVPRecipes(UserVF, UserVF);
7282 if (!hasPlanWithVF(UserVF)) {
7283 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7284 << ".\n");
7285 return std::nullopt;
7286 }
7287
7289 return {{UserVF, 0, 0}};
7290 } else
7291 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7292 "InvalidCost", ORE, OrigLoop);
7293 }
7294
7295 // Populate the set of Vectorization Factor Candidates.
7296 ElementCountSet VFCandidates;
7297 for (auto VF = ElementCount::getFixed(1);
7298 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7299 VFCandidates.insert(VF);
7300 for (auto VF = ElementCount::getScalable(1);
7301 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7302 VFCandidates.insert(VF);
7303
7305 for (const auto &VF : VFCandidates) {
7306 // Collect Uniform and Scalar instructions after vectorization with VF.
7308
7309 // Collect the instructions (and their associated costs) that will be more
7310 // profitable to scalarize.
7311 if (VF.isVector())
7313 }
7314
7315 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7316 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7317
7319 if (!MaxFactors.hasVector())
7321
7322 // Select the optimal vectorization factor.
7323 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7324 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7325 if (!hasPlanWithVF(VF.Width)) {
7326 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7327 << ".\n");
7328 return std::nullopt;
7329 }
7330 return VF;
7331}
7332
7334 assert(count_if(VPlans,
7335 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7336 1 &&
7337 "Best VF has not a single VPlan.");
7338
7339 for (const VPlanPtr &Plan : VPlans) {
7340 if (Plan->hasVF(VF))
7341 return *Plan.get();
7342 }
7343 llvm_unreachable("No plan found!");
7344}
7345
7348 // Reserve first location for self reference to the LoopID metadata node.
7349 MDs.push_back(nullptr);
7350 bool IsUnrollMetadata = false;
7351 MDNode *LoopID = L->getLoopID();
7352 if (LoopID) {
7353 // First find existing loop unrolling disable metadata.
7354 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7355 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7356 if (MD) {
7357 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7358 IsUnrollMetadata =
7359 S && S->getString().starts_with("llvm.loop.unroll.disable");
7360 }
7361 MDs.push_back(LoopID->getOperand(i));
7362 }
7363 }
7364
7365 if (!IsUnrollMetadata) {
7366 // Add runtime unroll disable metadata.
7367 LLVMContext &Context = L->getHeader()->getContext();
7368 SmallVector<Metadata *, 1> DisableOperands;
7369 DisableOperands.push_back(
7370 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7371 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7372 MDs.push_back(DisableNode);
7373 MDNode *NewLoopID = MDNode::get(Context, MDs);
7374 // Set operand 0 to refer to the loop id itself.
7375 NewLoopID->replaceOperandWith(0, NewLoopID);
7376 L->setLoopID(NewLoopID);
7377 }
7378}
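// Sketch of the resulting metadata (operand names are illustrative):
//   !llvm.loop !0
//   !0 = distinct !{!0, <existing operands>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// i.e. the loop ID is rebuilt with a self-reference in operand 0 and the
// runtime-unroll-disable string appended, unless an llvm.loop.unroll.disable
// entry was already present.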
7379
7380 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7381// create a merge phi node for it and add it to \p ReductionResumeValues.
7383 VPInstruction *RedResult,
7385 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7386 if (!RedResult ||
7388 return;
7389
7390 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7391 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7392
7393 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7394 Value *FinalValue =
7395 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7396 auto *ResumePhi =
7397 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7398
7399 // TODO: bc.merge.rdx should not be created here; instead it should be
7400 // modeled in VPlan.
7401 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7402 // Create a phi node that merges control-flow from the backedge-taken check
7403 // block and the middle block.
7404 auto *BCBlockPhi =
7405 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7406 LoopScalarPreHeader->getTerminator()->getIterator());
7407
7408 // If we are fixing reductions in the epilogue loop then we should already
7409 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7410 // we carry over the incoming values correctly.
7411 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7412 if (Incoming == LoopMiddleBlock)
7413 BCBlockPhi->addIncoming(FinalValue, Incoming);
7414 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7415 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7416 Incoming);
7417 else
7418 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7419 }
7420
7421 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7422 // TODO: This fixup should instead be modeled in VPlan.
7423 // Fix the scalar loop reduction variable with the incoming reduction sum
7424 // from the vector body and from the backedge value.
7425 int IncomingEdgeBlockIdx =
7426 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7427 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7428 // Pick the other block.
7429 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7430 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7431 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7432 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7433
7434 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7435}
7436
7437std::pair<DenseMap<const SCEV *, Value *>,
7440 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7441 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7442 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7443 assert(BestVPlan.hasVF(BestVF) &&
7444 "Trying to execute plan with unsupported VF");
7445 assert(BestVPlan.hasUF(BestUF) &&
7446 "Trying to execute plan with unsupported UF");
7447 assert(
7448 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7449 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7450
7451 if (!IsEpilogueVectorization)
7452 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7453
7454 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7455 << ", UF=" << BestUF << '\n');
7456 BestVPlan.setName("Final VPlan");
7457 LLVM_DEBUG(BestVPlan.dump());
7458
7459 // Perform the actual loop transformation.
7460 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7461 OrigLoop->getHeader()->getContext());
7462
7463 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7464 // before making any changes to the CFG.
7465 if (!BestVPlan.getPreheader()->empty()) {
7466 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7468 BestVPlan.getPreheader()->execute(&State);
7469 }
7470 if (!ILV.getTripCount())
7471 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7472 else
7473 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7474 "count during epilogue vectorization");
7475
7476 // 1. Set up the skeleton for vectorization, including vector pre-header and
7477 // middle block. The vector loop is created during VPlan execution.
7478 Value *CanonicalIVStartValue;
7479 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7480 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7481 : State.ExpandedSCEVs);
7482
7483 // Only use noalias metadata when using memory checks guaranteeing no overlap
7484 // across all iterations.
7485 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7486 std::unique_ptr<LoopVersioning> LVer = nullptr;
7487 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7489
7490 // We currently don't use LoopVersioning for the actual loop cloning but we
7491 // still use it to add the noalias metadata.
7492 // TODO: Find a better way to re-use LoopVersioning functionality to add
7493 // metadata.
7494 LVer = std::make_unique<LoopVersioning>(
7495 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7496 PSE.getSE());
7497 State.LVer = &*LVer;
7499 }
7500
7502
7503 //===------------------------------------------------===//
7504 //
7505 // Notice: any optimization or new instruction that goes
7506 // into the code below should also be implemented in
7507 // the cost-model.
7508 //
7509 //===------------------------------------------------===//
7510
7511 // 2. Copy and widen instructions from the old loop into the new loop.
7512 BestVPlan.prepareToExecute(ILV.getTripCount(),
7513 ILV.getOrCreateVectorTripCount(nullptr),
7514 CanonicalIVStartValue, State);
7515
7516 BestVPlan.execute(&State);
7517
7518 // 2.5 Collect reduction resume values.
7520 auto *ExitVPBB =
7521 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7522 for (VPRecipeBase &R : *ExitVPBB) {
7523 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7524 ReductionResumeValues, State, OrigLoop,
7525 State.CFG.VPBB2IRBB[ExitVPBB]);
7526 }
7527
7528 // 2.6. Maintain Loop Hints
7529 // Keep all loop hints from the original loop on the vector loop (we'll
7530 // replace the vectorizer-specific hints below).
7531 MDNode *OrigLoopID = OrigLoop->getLoopID();
7532
7533 std::optional<MDNode *> VectorizedLoopID =
7536
7537 VPBasicBlock *HeaderVPBB =
7539 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7540 if (VectorizedLoopID)
7541 L->setLoopID(*VectorizedLoopID);
7542 else {
7543 // Keep all loop hints from the original loop on the vector loop (we'll
7544 // replace the vectorizer-specific hints below).
7545 if (MDNode *LID = OrigLoop->getLoopID())
7546 L->setLoopID(LID);
7547
7548 LoopVectorizeHints Hints(L, true, *ORE);
7549 Hints.setAlreadyVectorized();
7550 }
7552 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7553 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7555
7556 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7557 // predication, updating analyses.
7558 ILV.fixVectorizedLoop(State, BestVPlan);
7559
7561
7562 return {State.ExpandedSCEVs, ReductionResumeValues};
7563}
7564
7565#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7567 for (const auto &Plan : VPlans)
7569 Plan->printDOT(O);
7570 else
7571 Plan->print(O);
7572}
7573#endif
7574
7575//===--------------------------------------------------------------------===//
7576// EpilogueVectorizerMainLoop
7577//===--------------------------------------------------------------------===//
7578
7579/// This function is partially responsible for generating the control flow
7580/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7581std::pair<BasicBlock *, Value *>
7583 const SCEV2ValueTy &ExpandedSCEVs) {
7585
7586 // Generate the code to check the minimum iteration count of the vector
7587 // epilogue (see below).
7591
7592 // Generate the code to check any assumptions that we've made for SCEV
7593 // expressions.
7595
7596 // Generate the code that checks at runtime if arrays overlap. We put the
7597 // checks into a separate block to make the more common case of few elements
7598 // faster.
7600
7601 // Generate the iteration count check for the main loop, *after* the check
7602 // for the epilogue loop, so that the path-length is shorter for the case
7603 // that goes directly through the vector epilogue. The longer path length for
7604 // the main loop is compensated for by the gain from vectorizing the larger
7605 // trip count. Note: the branch will get updated later on when we vectorize
7606 // the epilogue.
7609
7610 // Generate the induction variable.
7612
7613 // Skip induction resume value creation here because they will be created in
7614 // the second pass for the scalar loop. The induction resume values for the
7615 // inductions in the epilogue loop are created before executing the plan for
7616 // the epilogue loop.
7617
7618 return {completeLoopSkeleton(), nullptr};
7619}
7620
7622 LLVM_DEBUG({
7623 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7624 << "Main Loop VF:" << EPI.MainLoopVF
7625 << ", Main Loop UF:" << EPI.MainLoopUF
7626 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7627 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7628 });
7629}
7630
7633 dbgs() << "intermediate fn:\n"
7634 << *OrigLoop->getHeader()->getParent() << "\n";
7635 });
7636}
7637
7638BasicBlock *
7640 bool ForEpilogue) {
7641 assert(Bypass && "Expected valid bypass basic block.");
7642 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7643 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7644 Value *Count = getTripCount();
7645 // Reuse existing vector loop preheader for TC checks.
7646 // Note that a new preheader block is generated for the vector loop.
7647 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7648 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7649
7650 // Generate code to check if the loop's trip count is less than VF * UF of the
7651 // main vector loop.
7652 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7653 : VF.isVector())
7656
7657 Value *CheckMinIters = Builder.CreateICmp(
7658 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7659 "min.iters.check");
7660
7661 if (!ForEpilogue)
7662 TCCheckBlock->setName("vector.main.loop.iter.check");
7663
7664 // Create new preheader for vector loop.
7665 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7666 DT, LI, nullptr, "vector.ph");
7667
7668 if (ForEpilogue) {
7669 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7670 DT->getNode(Bypass)->getIDom()) &&
7671 "TC check is expected to dominate Bypass");
7672
7673 // Update dominator for Bypass & LoopExit.
7674 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7675 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7676 // For loops with multiple exits, there's no edge from the middle block
7677 // to exit blocks (as the epilogue must run) and thus no need to update
7678 // the immediate dominator of the exit blocks.
7680
7681 LoopBypassBlocks.push_back(TCCheckBlock);
7682
7683 // Save the trip count so we don't have to regenerate it in the
7684 // vec.epilog.iter.check. This is safe to do because the trip count
7685 // generated here dominates the vector epilog iter check.
7686 EPI.TripCount = Count;
7687 }
7688
7689 BranchInst &BI =
7690 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7693 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7694
7695 return TCCheckBlock;
7696}
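// A minimal standalone sketch, not LLVM API (the helper name is made up), of
// the comparison built above: branch to the bypass block when the trip count
// is too small for even one VF * UF wide iteration; when a scalar epilogue is
// required, at least one scalar iteration must remain, so "<" becomes "<=".
#if 0 // Illustrative sketch only; assumes plain integer counts.
#include <cstdint>

static bool bypassVectorLoop(uint64_t TripCount, uint64_t VF, uint64_t UF,
                             bool RequiresScalarEpilogue) {
  uint64_t Step = VF * UF; // what createStepForVF materializes for a fixed VF
  return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
}
#endif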
7697
7698//===--------------------------------------------------------------------===//
7699// EpilogueVectorizerEpilogueLoop
7700//===--------------------------------------------------------------------===//
7701
7702/// This function is partially responsible for generating the control flow
7703/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7704std::pair<BasicBlock *, Value *>
7706 const SCEV2ValueTy &ExpandedSCEVs) {
7707 createVectorLoopSkeleton("vec.epilog.");
7708
7709 // Now, compare the remaining count and if there aren't enough iterations to
7710 // execute the vectorized epilogue skip to the scalar part.
7711 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7712 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7715 LI, nullptr, "vec.epilog.ph");
7717 VecEpilogueIterationCountCheck);
7718
7719 // Adjust the control flow taking the state info from the main loop
7720 // vectorization into account.
7722 "expected this to be saved from the previous pass.");
7724 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7725
7728
7730 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7731
7732 if (EPI.SCEVSafetyCheck)
7734 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7735 if (EPI.MemSafetyCheck)
7737 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7738
7740 VecEpilogueIterationCountCheck,
7741 VecEpilogueIterationCountCheck->getSinglePredecessor());
7742
7745 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7746 // If there is an epilogue which must run, there's no edge from the
7747 // middle block to exit blocks and thus no need to update the immediate
7748 // dominator of the exit blocks.
7751
7752 // Keep track of bypass blocks, as they feed start values to the induction and
7753 // reduction phis in the scalar loop preheader.
7754 if (EPI.SCEVSafetyCheck)
7756 if (EPI.MemSafetyCheck)
7759
7760 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7761 // reductions which merge control-flow from the latch block and the middle
7762 // block. Update the incoming values here and move the Phi into the preheader.
7763 SmallVector<PHINode *, 4> PhisInBlock;
7764 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7765 PhisInBlock.push_back(&Phi);
7766
7767 for (PHINode *Phi : PhisInBlock) {
7768 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7769 Phi->replaceIncomingBlockWith(
7770 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7771 VecEpilogueIterationCountCheck);
7772
7773 // If the phi doesn't have an incoming value from the
7774 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7775 // value and also those from other check blocks. This is needed for
7776 // reduction phis only.
7777 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7778 return EPI.EpilogueIterationCountCheck == IncB;
7779 }))
7780 continue;
7781 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7782 if (EPI.SCEVSafetyCheck)
7783 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7784 if (EPI.MemSafetyCheck)
7785 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7786 }
7787
7788 // Generate a resume induction for the vector epilogue and put it in the
7789 // vector epilogue preheader
7790 Type *IdxTy = Legal->getWidestInductionType();
7791 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7793 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7794 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7796
7797 // Generate induction resume values. These variables save the new starting
7798 // indexes for the scalar loop. They are used to test if there are any tail
7799 // iterations left once the vector loop has completed.
7800 // Note that when the vectorized epilogue is skipped due to iteration count
7801 // check, then the resume value for the induction variable comes from
7802 // the trip count of the main vector loop, hence passing the AdditionalBypass
7803 // argument.
7804 createInductionResumeValues(ExpandedSCEVs,
7805 {VecEpilogueIterationCountCheck,
7806 EPI.VectorTripCount} /* AdditionalBypass */);
7807
7808 return {completeLoopSkeleton(), EPResumeVal};
7809}
7810
7811BasicBlock *
7813 BasicBlock *Bypass, BasicBlock *Insert) {
7814
7816 "Expected trip count to have been safed in the first pass.");
7817 assert(
7818 (!isa<Instruction>(EPI.TripCount) ||
7819 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7820 "saved trip count does not dominate insertion point.");
7821 Value *TC = EPI.TripCount;
7822 IRBuilder<> Builder(Insert->getTerminator());
7823 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7824
7825 // Generate code to check if the loop's trip count is less than VF * UF of the
7826 // vector epilogue loop.
7827 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7830
7831 Value *CheckMinIters =
7832 Builder.CreateICmp(P, Count,
7835 "min.epilog.iters.check");
7836
7837 BranchInst &BI =
7838 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7840 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7841 unsigned EpilogueLoopStep =
8843 // We assume the remaining `Count` is uniformly distributed in
8844 // [0, MainLoopStep).
8845 // So the probability for `Count < EpilogueLoopStep` should be
8846 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
7847 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7848 const uint32_t Weights[] = {EstimatedSkipCount,
7849 MainLoopStep - EstimatedSkipCount};
7850 setBranchWeights(BI, Weights);
7851 }
7852 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7853
7854 LoopBypassBlocks.push_back(Insert);
7855 return Insert;
7856}
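// A minimal standalone sketch, not LLVM API (the helper name is made up), of
// the branch-weight arithmetic above: with the remaining count uniformly
// distributed in [0, MainLoopStep), the first weight approximates the chance
// of skipping the vector epilogue.
#if 0 // Illustrative sketch only.
#include <algorithm>
#include <array>
#include <cstdint>

static std::array<uint32_t, 2> epilogueSkipWeights(uint32_t MainLoopStep,
                                                   uint32_t EpilogueLoopStep) {
  uint32_t EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  // {taken: skip the epilogue, not taken: run it}.
  return {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};
}
// For example, MainLoopStep = 16 (VF = 8, UF = 2) and EpilogueLoopStep = 4
// gives weights {4, 12}, i.e. roughly a 25% chance of skipping the epilogue.
#endif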
7857
7859 LLVM_DEBUG({
7860 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7861 << "Epilogue Loop VF:" << EPI.EpilogueVF
7862 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7863 });
7864}
7865
7868 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7869 });
7870}
7871
7873 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7874 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7875 bool PredicateAtRangeStart = Predicate(Range.Start);
7876
7877 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7878 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7879 Range.End = TmpVF;
7880 break;
7881 }
7882
7883 return PredicateAtRangeStart;
7884}
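// A minimal standalone sketch, not LLVM API (the helper name is made up), of
// the clamping above with plain integers standing in for ElementCounts: the
// decision is whatever the predicate says at Start, and End is shrunk to the
// first power-of-two VF where the predicate disagrees with that decision.
#if 0 // Illustrative sketch only.
#include <functional>

static bool decisionAndClampRange(const std::function<bool(unsigned)> &Pred,
                                  unsigned Start, unsigned &End) {
  bool AtStart = Pred(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    if (Pred(VF) != AtStart) {
      End = VF; // all VFs left in [Start, End) now agree with the decision
      break;
    }
  return AtStart;
}
#endif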
7885
7886/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7887/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7888/// of VF's starting at a given VF and extending it as much as possible. Each
7889/// vectorization decision can potentially shorten this sub-range during
7890/// buildVPlan().
7892 ElementCount MaxVF) {
7893 auto MaxVFTimes2 = MaxVF * 2;
7894 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7895 VFRange SubRange = {VF, MaxVFTimes2};
7896 VPlans.push_back(buildVPlan(SubRange));
7897 VF = SubRange.End;
7898 }
7899}
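// A worked example as a standalone sketch, not LLVM API (the lambda stands in
// for buildVPlan()): with MinVF = 2 and MaxVF = 16, a plan whose widening
// decisions change at VF = 8 produces the sub-ranges [2, 8) and [8, 32),
// and therefore two VPlans.
#if 0 // Illustrative sketch only.
#include <cstdio>

static void subRangeExample() {
  unsigned MinVF = 2, MaxVF = 16, MaxVFTimes2 = MaxVF * 2;
  auto buildPlanFor = [](unsigned Start, unsigned &End) {
    if (Start < 8 && End > 8)
      End = 8; // a decision flips at VF = 8, so the sub-range is clamped
  };
  for (unsigned VF = MinVF; VF < MaxVFTimes2;) {
    unsigned End = MaxVFTimes2;
    buildPlanFor(VF, End);
    std::printf("VPlan covering VFs [%u, %u)\n", VF, End); // [2,8), then [8,32)
    VF = End;
  }
}
#endif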
7900
7902 VPlan &Plan) {
7903 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7904
7905 // Look for cached value.
7906 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7907 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7908 if (ECEntryIt != EdgeMaskCache.end())
7909 return ECEntryIt->second;
7910
7911 VPValue *SrcMask = getBlockInMask(Src);
7912
7913 // The terminator has to be a branch inst!
7914 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7915 assert(BI && "Unexpected terminator found");
7916
7917 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7918 return EdgeMaskCache[Edge] = SrcMask;
7919
7920 // If source is an exiting block, we know the exit edge is dynamically dead
7921 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7922 // adding uses of an otherwise potentially dead instruction.
7923 if (OrigLoop->isLoopExiting(Src))
7924 return EdgeMaskCache[Edge] = SrcMask;
7925
7926 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
7927 assert(EdgeMask && "No Edge Mask found for condition");
7928
7929 if (BI->getSuccessor(0) != Dst)
7930 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7931
7932 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7933 // The condition is 'SrcMask && EdgeMask', which is equivalent to
7934 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7935 // The select version does not introduce new UB if SrcMask is false and
7936 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
7937 VPValue *False = Plan.getVPValueOrAddLiveIn(
7939 EdgeMask =
7940 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
7941 }
7942
7943 return EdgeMaskCache[Edge] = EdgeMask;
7944}
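// A lane-wise model as a standalone sketch (not LLVM poison semantics code) of
// why the select form above is used: with std::nullopt standing in for a
// poison lane, 'select SrcMask, EdgeMask, false' stays well-defined (false)
// whenever SrcMask is false, while 'and SrcMask, EdgeMask' would propagate the
// poison.
#if 0 // Illustrative sketch only.
#include <optional>

using LaneBit = std::optional<bool>; // std::nullopt ~ poison

static LaneBit selectMask(bool SrcMask, LaneBit EdgeMask) {
  return SrcMask ? EdgeMask : LaneBit(false); // false arm never reads EdgeMask
}

static LaneBit andMask(bool SrcMask, LaneBit EdgeMask) {
  return EdgeMask ? LaneBit(SrcMask && *EdgeMask) : std::nullopt; // poison out
}
// selectMask(false, std::nullopt) yields false, while
// andMask(false, std::nullopt) yields "poison".
#endif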
7945
7947 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7948
7949 // Look for cached value.
7950 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7951 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
7952 assert(ECEntryIt != EdgeMaskCache.end() &&
7953 "looking up mask for edge which has not been created");
7954 return ECEntryIt->second;
7955}
7956
7958 BasicBlock *Header = OrigLoop->getHeader();
7959
7960 // When not folding the tail, use nullptr to model all-true mask.
7961 if (!CM.foldTailByMasking()) {
7962 BlockMaskCache[Header] = nullptr;
7963 return;
7964 }
7965
7966 // Introduce the early-exit compare IV <= BTC to form header block mask.
7967 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7968 // constructing the desired canonical IV in the header block as its first
7969 // non-phi instructions.
7970
7971 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7972 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7973 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7974 HeaderVPBB->insert(IV, NewInsertionPoint);
7975
7976 VPBuilder::InsertPointGuard Guard(Builder);
7977 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7978 VPValue *BlockMask = nullptr;
7980 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
7981 BlockMaskCache[Header] = BlockMask;
7982}
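// A scalar model as a standalone sketch, not LLVM API (the helper name is made
// up), of the header mask above: lane L is active iff IV + L <= BTC, where
// BTC = TC - 1 is the backedge-taken count. BTC is used because the trip count
// TC = BTC + 1 may wrap to 0 in the IV's type, while BTC itself is always
// representable; overflow of the lane offsets is ruled out separately.
#if 0 // Illustrative sketch only.
#include <cstdint>
#include <vector>

static std::vector<bool> headerMask(uint64_t IV, uint64_t BTC, unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = (IV + L) <= BTC; // ICMP_ULE against the backedge-taken count
  return Mask;
}
#endif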
7983
7985 // Return the cached value.
7986 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
7987 assert(BCEntryIt != BlockMaskCache.end() &&
7988 "Trying to access mask for block without one.");
7989 return BCEntryIt->second;
7990}
7991
7993 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7994 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7995 assert(OrigLoop->getHeader() != BB &&
7996 "Loop header must have cached block mask");
7997
7998 // All-one mask is modelled as no-mask following the convention for masked
7999 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8000 VPValue *BlockMask = nullptr;
8001 // This is the block mask. We OR all incoming edges.
8002 for (auto *Predecessor : predecessors(BB)) {
8003 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8004 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8005 BlockMaskCache[BB] = EdgeMask;
8006 return;
8007 }
8008
8009 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8010 BlockMask = EdgeMask;
8011 continue;
8012 }
8013
8014 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8015 }
8016
8017 BlockMaskCache[BB] = BlockMask;
8018}
8019
8021VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8022 VFRange &Range, VPlanPtr &Plan) {
8023 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8024 "Must be called with either a load or store");
8025
8026 auto willWiden = [&](ElementCount VF) -> bool {
8028 CM.getWideningDecision(I, VF);
8030 "CM decision should be taken at this point.");
8032 return true;
8033 if (CM.isScalarAfterVectorization(I, VF) ||
8034 CM.isProfitableToScalarize(I, VF))
8035 return false;
8037 };
8038
8040 return nullptr;
8041
8042 VPValue *Mask = nullptr;
8043 if (Legal->isMaskRequired(I))
8044 Mask = getBlockInMask(I->getParent());
8045
8046 // Determine if the pointer operand of the access is either consecutive or
8047 // reverse consecutive.
8049 CM.getWideningDecision(I, Range.Start);
8051 bool Consecutive =
8053
8054 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8055 if (Consecutive) {
8056 auto *GEP = dyn_cast<GetElementPtrInst>(
8057 Ptr->getUnderlyingValue()->stripPointerCasts());
8058 auto *VectorPtr = new VPVectorPointerRecipe(
8059 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8060 I->getDebugLoc());
8061 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8062 Ptr = VectorPtr;
8063 }
8064 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8065 return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8066 Reverse);
8067
8068 StoreInst *Store = cast<StoreInst>(I);
8069 return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8070 Consecutive, Reverse);
8071}
8072
8073/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8074/// insert a recipe to expand the step for the induction recipe.
8077 VPValue *Start, const InductionDescriptor &IndDesc,
8078 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8079 VFRange &Range) {
8080 assert(IndDesc.getStartValue() ==
8081 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8082 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8083 "step must be loop invariant");
8084
8085 VPValue *Step =
8087 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8088 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8089 }
8090 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8091 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8092}
8093
8094VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8095 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8096
8097 // Check if this is an integer or fp induction. If so, build the recipe that
8098 // produces its scalar and vector values.
8099 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8100 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8101 *PSE.getSE(), *OrigLoop, Range);
8102
8103 // Check if this is pointer induction. If so, build the recipe for it.
8104 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8105 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8106 *PSE.getSE());
8108 Phi, Operands[0], Step, *II,
8110 [&](ElementCount VF) {
8111 return CM.isScalarAfterVectorization(Phi, VF);
8112 },
8113 Range));
8114 }
8115 return nullptr;
8116}
8117
8118VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8120 // Optimize the special case where the source is a constant integer
8121 // induction variable. Notice that we can only optimize the 'trunc' case
8122 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8123 // (c) other casts depend on pointer size.
8124
8125 // Determine whether \p K is a truncation based on an induction variable that
8126 // can be optimized.
8127 auto isOptimizableIVTruncate =
8128 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8129 return [=](ElementCount VF) -> bool {
8130 return CM.isOptimizableIVTruncate(K, VF);
8131 };
8132 };
8133
8135 isOptimizableIVTruncate(I), Range)) {
8136
8137 auto *Phi = cast<PHINode>(I->getOperand(0));
8138 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8139 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8140 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8141 *OrigLoop, Range);
8142 }
8143 return nullptr;
8144}
8145
8146VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8148 VPlanPtr &Plan) {
8149 unsigned NumIncoming = Phi->getNumIncomingValues();
8150
8151 // We know that all PHIs in non-header blocks are converted into selects, so
8152 // we don't have to worry about the insertion order and we can just use the
8153 // builder. At this point we generate the predication tree. There may be
8154 // duplications since this is a simple recursive scan, but future
8155 // optimizations will clean it up.
8156 SmallVector<VPValue *, 2> OperandsWithMask;
8157
8158 for (unsigned In = 0; In < NumIncoming; In++) {
8159 OperandsWithMask.push_back(Operands[In]);
8160 VPValue *EdgeMask =
8161 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8162 if (!EdgeMask) {
8163 assert(In == 0 && "Both null and non-null edge masks found");
8165 "Distinct incoming values with one having a full mask");
8166 break;
8167 }
8168 OperandsWithMask.push_back(EdgeMask);
8169 }
8170 return new VPBlendRecipe(Phi, OperandsWithMask);
8171}
8172
8173VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8175 VFRange &Range,
8176 VPlanPtr &Plan) {
8178 [this, CI](ElementCount VF) {
8179 return CM.isScalarWithPredication(CI, VF);
8180 },
8181 Range);
8182
8183 if (IsPredicated)
8184 return nullptr;
8185
8187 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8188 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8189 ID == Intrinsic::pseudoprobe ||
8190 ID == Intrinsic::experimental_noalias_scope_decl))
8191 return nullptr;
8192
8193 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8194
8195 // Is it beneficial to perform intrinsic call compared to lib call?
8196 bool ShouldUseVectorIntrinsic =
8198 [&](ElementCount VF) -> bool {
8199 return CM.getCallWideningDecision(CI, VF).Kind ==
8201 },
8202 Range);
8203 if (ShouldUseVectorIntrinsic)
8204 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8205 CI->getDebugLoc());
8206
8207 Function *Variant = nullptr;
8208 std::optional<unsigned> MaskPos;
8209 // Is it better to call a vectorized version of the function than to
8210 // scalarize the call?
8211 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8212 [&](ElementCount VF) -> bool {
8213 // The following case may be scalarized depending on the VF.
8214 // The flag shows whether we can use a usual Call for the vectorized
8215 // version of the instruction.
8216
8217 // If we've found a variant at a previous VF, then stop looking. A
8218 // vectorized variant of a function expects input in a certain shape
8219 // -- basically the number of input registers, the number of lanes
8220 // per register, and whether there's a mask required.
8221 // We store a pointer to the variant in the VPWidenCallRecipe, so
8222 // once we have an appropriate variant it's only valid for that VF.
8223 // This will force a different vplan to be generated for each VF that
8224 // finds a valid variant.
8225 if (Variant)
8226 return false;
8228 CM.getCallWideningDecision(CI, VF);
8230 Variant = Decision.Variant;
8231 MaskPos = Decision.MaskPos;
8232 return true;
8233 }
8234
8235 return false;
8236 },
8237 Range);
8238 if (ShouldUseVectorCall) {
8239 if (MaskPos.has_value()) {
8240 // We have 2 cases that would require a mask:
8241 // 1) The block needs to be predicated, either due to a conditional
8242 // in the scalar loop or use of an active lane mask with
8243 // tail-folding, and we use the appropriate mask for the block.
8244 // 2) No mask is required for the block, but the only available
8245 // vector variant at this VF requires a mask, so we synthesize an
8246 // all-true mask.
8247 VPValue *Mask = nullptr;
8248 if (Legal->isMaskRequired(CI))
8249 Mask = getBlockInMask(CI->getParent());
8250 else
8251 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8252 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8253
8254 Ops.insert(Ops.begin() + *MaskPos, Mask);
8255 }
8256
8257 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8259 Variant);
8260 }
8261
8262 return nullptr;
8263}
8264
8265bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8266 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8267 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8268 // Instruction should be widened, unless it is scalar after vectorization,
8269 // scalarization is profitable or it is predicated.
8270 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8271 return CM.isScalarAfterVectorization(I, VF) ||
8272 CM.isProfitableToScalarize(I, VF) ||
8273 CM.isScalarWithPredication(I, VF);
8274 };
8276 Range);
8277}
8278
8279VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8281 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8282 switch (I->getOpcode()) {
8283 default:
8284 return nullptr;
8285 case Instruction::SDiv:
8286 case Instruction::UDiv:
8287 case Instruction::SRem:
8288 case Instruction::URem: {
8289 // If not provably safe, use a select to form a safe divisor before widening the
8290 // div/rem operation itself. Otherwise fall through to general handling below.
8291 if (CM.isPredicatedInst(I)) {
8292 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8293 VPValue *Mask = getBlockInMask(I->getParent());
8294 VPValue *One = Plan->getVPValueOrAddLiveIn(
8295 ConstantInt::get(I->getType(), 1u, false));
8296 auto *SafeRHS =
8297 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8298 I->getDebugLoc());
8299 VPBB->appendRecipe(SafeRHS);
8300 Ops[1] = SafeRHS;
8301 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8302 }
8303 [[fallthrough]];
8304 }
8305 case Instruction::Add:
8306 case Instruction::And:
8307 case Instruction::AShr:
8308 case Instruction::FAdd:
8309 case Instruction::FCmp:
8310 case Instruction::FDiv:
8311 case Instruction::FMul:
8312 case Instruction::FNeg:
8313 case Instruction::FRem:
8314 case Instruction::FSub:
8315 case Instruction::ICmp:
8316 case Instruction::LShr:
8317 case Instruction::Mul:
8318 case Instruction::Or:
8319 case Instruction::Select:
8320 case Instruction::Shl:
8321 case Instruction::Sub:
8322 case Instruction::Xor:
8323 case Instruction::Freeze:
8324 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8325 };
8326}
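// A lane-wise model as a standalone sketch, not LLVM API (the helper name is
// made up), of the safe-divisor transform above: masked-off lanes get a
// divisor of 1 through the select, so the widened division cannot trap on
// them; their quotients are dead because those lanes are discarded anyway.
#if 0 // Illustrative sketch only.
#include <cstdint>
#include <vector>

static std::vector<uint64_t> predicatedUDiv(const std::vector<uint64_t> &Num,
                                            const std::vector<uint64_t> &Den,
                                            const std::vector<bool> &Mask) {
  std::vector<uint64_t> Res(Num.size());
  for (size_t L = 0; L < Num.size(); ++L) {
    uint64_t SafeDen = Mask[L] ? Den[L] : 1; // select Mask, Den, 1
    Res[L] = Num[L] / SafeDen;               // result unused where !Mask[L]
  }
  return Res;
}
#endif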
8327
8329 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8330 for (VPHeaderPHIRecipe *R : PhisToFix) {
8331 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8332 VPRecipeBase *IncR =
8333 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8334 R->addOperand(IncR->getVPSingleValue());
8335 }
8336}
8337
8339 VFRange &Range,
8340 VPlan &Plan) {
8342 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8343 Range);
8344
8345 bool IsPredicated = CM.isPredicatedInst(I);
8346
8347 // Even if the instruction is not marked as uniform, there are certain
8348 // intrinsic calls that can be effectively treated as such, so we check for
8349 // them here. Conservatively, we only do this for scalable vectors, since
8350 // for fixed-width VFs we can always fall back on full scalarization.
8351 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8352 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8353 case Intrinsic::assume:
8354 case Intrinsic::lifetime_start:
8355 case Intrinsic::lifetime_end:
8356 // For scalable vectors if one of the operands is variant then we still
8357 // want to mark as uniform, which will generate one instruction for just
8358 // the first lane of the vector. We can't scalarize the call in the same
8359 // way as for fixed-width vectors because we don't know how many lanes
8360 // there are.
8361 //
8362 // The reasons for doing it this way for scalable vectors are:
8363 // 1. For the assume intrinsic generating the instruction for the first
8364 // lane is still better than not generating any at all. For
8365 // example, the input may be a splat across all lanes.
8366 // 2. For the lifetime start/end intrinsics the pointer operand only
8367 // does anything useful when the input comes from a stack object,
8368 // which suggests it should always be uniform. For non-stack objects
8369 // the effect is to poison the object, which still allows us to
8370 // remove the call.
8371 IsUniform = true;
8372 break;
8373 default:
8374 break;
8375 }
8376 }
8377 VPValue *BlockInMask = nullptr;
8378 if (!IsPredicated) {
8379 // Finalize the recipe for Instr, first if it is not predicated.
8380 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8381 } else {
8382 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8383 // Instructions marked for predication are replicated and a mask operand is
8384 // added initially. Masked replicate recipes will later be placed under an
8385 // if-then construct to prevent side-effects. Generate recipes to compute
8386 // the block mask for this region.
8387 BlockInMask = getBlockInMask(I->getParent());
8388 }
8389
8390 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8391 IsUniform, BlockInMask);
8392 return Recipe;
8393}
8394
8397 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8398 // First, check for specific widening recipes that deal with inductions, Phi
8399 // nodes, calls and memory operations.
8400 VPRecipeBase *Recipe;
8401 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8402 if (Phi->getParent() != OrigLoop->getHeader())
8403 return tryToBlend(Phi, Operands, Plan);
8404
8405 // Always record recipes for header phis. Later first-order recurrence phis
8406 // can have earlier phis as incoming values.
8407 recordRecipeOf(Phi);
8408
8409 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8410 return Recipe;
8411
8412 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8413 assert((Legal->isReductionVariable(Phi) ||
8414 Legal->isFixedOrderRecurrence(Phi)) &&
8415 "can only widen reductions and fixed-order recurrences here");
8416 VPValue *StartV = Operands[0];
8417 if (Legal->isReductionVariable(Phi)) {
8418 const RecurrenceDescriptor &RdxDesc =
8419 Legal->getReductionVars().find(Phi)->second;
8420 assert(RdxDesc.getRecurrenceStartValue() ==
8421 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8422 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8423 CM.isInLoopReduction(Phi),
8424 CM.useOrderedReductions(RdxDesc));
8425 } else {
8426 // TODO: Currently fixed-order recurrences are modeled as chains of
8427 // first-order recurrences. If there are no users of the intermediate
8428 // recurrences in the chain, the fixed order recurrence should be modeled
8429 // directly, enabling more efficient codegen.
8430 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8431 }
8432
8433 // Record the incoming value from the backedge, so we can add the incoming
8434 // value from the backedge after all recipes have been created.
8435 auto *Inc = cast<Instruction>(
8436 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8437 auto RecipeIter = Ingredient2Recipe.find(Inc);
8438 if (RecipeIter == Ingredient2Recipe.end())
8439 recordRecipeOf(Inc);
8440
8441 PhisToFix.push_back(PhiRecipe);
8442 return PhiRecipe;
8443 }
8444
8445 if (isa<TruncInst>(Instr) &&
8446 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8447 Range, *Plan)))
8448 return Recipe;
8449
8450 // All widen recipes below deal only with VF > 1.
8452 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8453 return nullptr;
8454
8455 if (auto *CI = dyn_cast<CallInst>(Instr))
8456 return tryToWidenCall(CI, Operands, Range, Plan);
8457
8458 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8459 return tryToWidenMemory(Instr, Operands, Range, Plan);
8460
8461 if (!shouldWiden(Instr, Range))
8462 return nullptr;
8463
8464 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8465 return new VPWidenGEPRecipe(GEP,
8466 make_range(Operands.begin(), Operands.end()));
8467
8468 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8469 return new VPWidenSelectRecipe(
8470 *SI, make_range(Operands.begin(), Operands.end()));
8471 }
8472
8473 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8474 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8475 *CI);
8476 }
8477
8478 return tryToWiden(Instr, Operands, VPBB, Plan);
8479}
8480
8481void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8482 ElementCount MaxVF) {
8483 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8484
8485 auto MaxVFTimes2 = MaxVF * 2;
8486 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8487 VFRange SubRange = {VF, MaxVFTimes2};
8488 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8489 // Now optimize the initial VPlan.
8490 if (!Plan->hasVF(ElementCount::getFixed(1)))
8492 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8493 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8494 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8495 VPlans.push_back(std::move(Plan));
8496 }
8497 VF = SubRange.End;
8498 }
8499}
8500
8501// Add the necessary canonical IV and branch recipes required to control the
8502// loop.
8503static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8504 DebugLoc DL) {
8505 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8506 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8507
8508 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8509 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8510 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8511 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8512 Header->insert(CanonicalIVPHI, Header->begin());
8513
8514 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8515 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8516 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8517 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8518 "index.next");
8519 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8520
8521 // Add the BranchOnCount VPInstruction to the latch.
8523 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8524}
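// A scalar model as a standalone sketch, not LLVM API (the helper name is made
// up), of the control recipes above: the canonical IV starts at 0, is bumped
// by VF * UF per vector iteration, and BranchOnCount exits once the
// incremented IV reaches the vector trip count, which is a multiple of
// VF * UF; the earlier min.iters.check guarantees at least one iteration.
#if 0 // Illustrative sketch only.
#include <cstdint>

static void canonicalIVShape(uint64_t VectorTripCount, uint64_t VFxUF) {
  for (uint64_t Index = 0;;) {
    // ... the vector loop body runs here for lanes [Index, Index + VFxUF) ...
    uint64_t IndexNext = Index + VFxUF; // 'index.next' (nuw when not folding)
    if (IndexNext == VectorTripCount)   // BranchOnCount
      break;
    Index = IndexNext;
  }
}
#endif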
8525
8526// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8527// original exit block.
8528static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8529 VPlan &Plan) {
8530 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8531 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8532 // Only handle single-exit loops with unique exit blocks for now.
8533 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8534 return;
8535
8536 // Introduce VPUsers modeling the exit values.
8537 for (PHINode &ExitPhi : ExitBB->phis()) {
8538 Value *IncomingValue =
8539 ExitPhi.getIncomingValueForBlock(ExitingBB);
8540 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8541 Plan.addLiveOut(&ExitPhi, V);
8542 }
8543}
8544
8546LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8547
8549
8550 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8551
8552 // ---------------------------------------------------------------------------
8553 // Pre-construction: record ingredients whose recipes we'll need to further
8554 // process after constructing the initial VPlan.
8555 // ---------------------------------------------------------------------------
8556
8557 // For each interleave group which is relevant for this (possibly trimmed)
8558 // Range, add it to the set of groups to be later applied to the VPlan and add
8559 // placeholders for its members' Recipes which we'll be replacing with a
8560 // single VPInterleaveRecipe.
8562 auto applyIG = [IG, this](ElementCount VF) -> bool {
8563 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8564 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8566 // For scalable vectors, the only interleave factor currently supported
8567 // is 2 since we require the (de)interleave2 intrinsics instead of
8568 // shufflevectors.
8569 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8570 "Unsupported interleave factor for scalable vectors");
8571 return Result;
8572 };
8573 if (!getDecisionAndClampRange(applyIG, Range))
8574 continue;
8575 InterleaveGroups.insert(IG);
8576 for (unsigned i = 0; i < IG->getFactor(); i++)
8577 if (Instruction *Member = IG->getMember(i))
8578 RecipeBuilder.recordRecipeOf(Member);
8579 };
8580
8581 // ---------------------------------------------------------------------------
8582 // Build initial VPlan: Scan the body of the loop in a topological order to
8583 // visit each basic block after having visited its predecessor basic blocks.
8584 // ---------------------------------------------------------------------------
8585
8586 // Create initial VPlan skeleton, having a basic block for the pre-header
8587 // which contains SCEV expansions that need to happen before the CFG is
8588 // modified; a basic block for the vector pre-header, followed by a region for
8589 // the vector loop, followed by the middle basic block. The skeleton vector
8590 // loop region contains a header and latch basic blocks.
8592 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8593 *PSE.getSE());
8594 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8595 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8596 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8597 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8598 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8599
8600 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8601 // so it is better for this function to be conservative rather than to
8602 // split the range up into different VPlans.
8603 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8604 bool IVUpdateMayOverflow = false;
8605 for (ElementCount VF : Range)
8606 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8607
8609 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8610 // When not folding the tail, we know that the induction increment will not
8611 // overflow.
8612 bool HasNUW = Style == TailFoldingStyle::None;
8613 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8614
8615 // Scan the body of the loop in a topological order to visit each basic block
8616 // after having visited its predecessor basic blocks.
8617 LoopBlocksDFS DFS(OrigLoop);
8618 DFS.perform(LI);
8619
8620 VPBasicBlock *VPBB = HeaderVPBB;
8621 BasicBlock *HeaderBB = OrigLoop->getHeader();
8622 bool NeedsMasks =
8623 CM.foldTailByMasking() ||
8624 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8625 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8626 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8627 });
8628 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8629 // Relevant instructions from basic block BB will be grouped into VPRecipe
8630 // ingredients and fill a new VPBasicBlock.
8631 if (VPBB != HeaderVPBB)
8632 VPBB->setName(BB->getName());
8633 Builder.setInsertPoint(VPBB);
8634
8635 if (VPBB == HeaderVPBB)
8636 RecipeBuilder.createHeaderMask(*Plan);
8637 else if (NeedsMasks)
8638 RecipeBuilder.createBlockInMask(BB, *Plan);
8639
8640 // Introduce each ingredient into VPlan.
8641 // TODO: Model and preserve debug intrinsics in VPlan.
8642 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8643 Instruction *Instr = &I;
8645 auto *Phi = dyn_cast<PHINode>(Instr);
8646 if (Phi && Phi->getParent() == HeaderBB) {
8647 Operands.push_back(Plan->getVPValueOrAddLiveIn(
8648 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8649 } else {
8650 auto OpRange = Plan->mapToVPValues(Instr->operands());
8651 Operands = {OpRange.begin(), OpRange.end()};
8652 }
8653
8654 // Invariant stores inside the loop will be deleted, and a single store
8655 // with the final reduction value will be added to the exit block.
8656 StoreInst *SI;
8657 if ((SI = dyn_cast<StoreInst>(&I)) &&
8658 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8659 continue;
8660
8661 VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(
8662 Instr, Operands, Range, VPBB, Plan);
8663 if (!Recipe)
8664 Recipe = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8665 for (auto *Def : Recipe->definedValues()) {
8666 auto *UV = Def->getUnderlyingValue();
8667 Plan->addVPValue(UV, Def);
8668 }
8669
8670 RecipeBuilder.setRecipe(Instr, Recipe);
8671 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8672 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8673 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8674 // recipes and need to be moved to the phi section of HeaderVPBB:
8675 // * tail-folding (non-phi recipes computing the header mask are
8676 // introduced earlier than regular header phi recipes, and should appear
8677 // after them)
8678 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8679
8680 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8681 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8682 "unexpected recipe needs moving");
8683 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8684 } else
8685 VPBB->appendRecipe(Recipe);
8686 }
8687
8689 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8690 }
8691
8692 // After here, VPBB should not be used.
8693 VPBB = nullptr;
8694
8695 if (CM.requiresScalarEpilogue(Range)) {
8696 // No edge from the middle block to the unique exit block has been inserted
8697 // and there is nothing to fix from the vector loop; phis should have
8698 // incoming values from the scalar loop only.
8699 } else
8700 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8701
8702 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8703 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8704 "entry block must be set to a VPRegionBlock having a non-empty entry "
8705 "VPBasicBlock");
8706 RecipeBuilder.fixHeaderPhis();
8707
8708 // ---------------------------------------------------------------------------
8709 // Transform initial VPlan: Apply previously taken decisions, in order, to
8710 // bring the VPlan to its final state.
8711 // ---------------------------------------------------------------------------
8712
8713 // Adjust the recipes for any inloop reductions.
8714 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8715
8716 // Interleave memory: for each Interleave Group we marked earlier as relevant
8717 // for this VPlan, replace the Recipes widening its memory instructions with a
8718 // single VPInterleaveRecipe at its insertion point.
8719 for (const auto *IG : InterleaveGroups) {
8720 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8721 RecipeBuilder.getRecipe(IG->getInsertPos()));
8722 SmallVector<VPValue *, 4> StoredValues;
8723 for (unsigned i = 0; i < IG->getFactor(); ++i)
8724 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8725 auto *StoreR =
8726 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8727 StoredValues.push_back(StoreR->getStoredValue());
8728 }
8729
8730 bool NeedsMaskForGaps =
8731 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8732 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8733 Recipe->getMask(), NeedsMaskForGaps);
8734 VPIG->insertBefore(Recipe);
8735 unsigned J = 0;
8736 for (unsigned i = 0; i < IG->getFactor(); ++i)
8737 if (Instruction *Member = IG->getMember(i)) {
8738 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8739 if (!Member->getType()->isVoidTy()) {
8740 VPValue *OriginalV = MemberR->getVPSingleValue();
8741 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8742 J++;
8743 }
8744 MemberR->eraseFromParent();
8745 }
8746 }
8747
8748 for (ElementCount VF : Range)
8749 Plan->addVF(VF);
8750 Plan->setName("Initial VPlan");
8751
8752 // Replace VPValues for known constant strides guaranteed by predicated scalar
8753 // evolution.
8754 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8755 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8756 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8757 // Only handle constant strides for now.
8758 if (!ScevStride)
8759 continue;
8760 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8761
8762 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8763 // The versioned value may not be used in the loop directly, so just add a
8764 // new live-in in those cases.
8765 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8766 }
8767
8768 // From this point onwards, VPlan-to-VPlan transformations may change the plan
8769 // in ways that make accessing values through original IR values incorrect.
8770 Plan->disableValue2VPValue();
8771
8773 return Legal->blockNeedsPredication(BB);
8774 });
8775
8776 // Sink users of fixed-order recurrences past the recipe defining the previous
8777 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8779 return nullptr;
8780
8781 if (useActiveLaneMask(Style)) {
8782 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8783 // TailFoldingStyle is visible there.
8784 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8785 bool WithoutRuntimeCheck =
8787 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8788 WithoutRuntimeCheck);
8789 }
8790 return Plan;
8791}
8792
8793VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8794 // Outer loop handling: They may require CFG and instruction level
8795 // transformations before even evaluating whether vectorization is profitable.
8796 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8797 // the vectorization pipeline.
8798 assert(!OrigLoop->isInnermost());
8799 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8800
8801 // Create new empty VPlan
8802 auto Plan = VPlan::createInitialVPlan(
8803 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8804 *PSE.getSE());
8805
8806 // Build hierarchical CFG
8807 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8808 HCFGBuilder.buildHierarchicalCFG();
8809
8810 for (ElementCount VF : Range)
8811 Plan->addVF(VF);
8812
8814 Plan,
8815 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8816 *PSE.getSE(), *TLI);
8817
8818 // Remove the existing terminator of the exiting block of the top-most region.
8819 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8820 auto *Term =
8821 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8822 Term->eraseFromParent();
8823
8824 // Tail folding is not supported for outer loops, so the induction increment
8825 // is guaranteed to not wrap.
8826 bool HasNUW = true;
8827 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8828 DebugLoc());
8829 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8830 return Plan;
8831}
8832
8833// Adjust the recipes for reductions. For in-loop reductions the chain of
8834// instructions leading from the loop exit instr to the phi need to be converted
8835// to reductions, with one operand being vector and the other being the scalar
8836// reduction chain. For other reductions, a select is introduced between the phi
8837// and live-out recipes when folding the tail.
8838//
8839// A ComputeReductionResult recipe is added to the middle block, also for
8840// in-loop reductions which compute their result in-loop, because generating
8841// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8842void LoopVectorizationPlanner::adjustRecipesForReductions(
8843 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8844 ElementCount MinVF) {
8845 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8846 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8847 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8848 // sunk outside of the loop keep the same order as they had in the
8849 // original loop.
8850 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8851 for (VPRecipeBase &R : Header->phis()) {
8852 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8853 ReductionPHIList.emplace_back(ReductionPhi);
8854 }
8855 bool HasIntermediateStore = false;
8856 stable_sort(ReductionPHIList,
8857 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8858 const VPReductionPHIRecipe *R2) {
8859 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8860 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8861 HasIntermediateStore |= IS1 || IS2;
8862
8863 // If neither of the recipes has an intermediate store, keep the
8864 // order the same.
8865 if (!IS1 && !IS2)
8866 return false;
8867
8868 // If only one of the recipes has an intermediate store, then
8869 // move it towards the beginning of the list.
8870 if (IS1 && !IS2)
8871 return true;
8872
8873 if (!IS1 && IS2)
8874 return false;
8875
8876 // If both recipes have an intermediate store, then the recipe
8877 // with the later store should be processed earlier. So it
8878 // should go to the beginning of the list.
8879 return DT->dominates(IS2, IS1);
8880 });
8881
8882 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8883 for (VPRecipeBase *R : ReductionPHIList)
8884 R->moveBefore(*Header, Header->getFirstNonPhi());
8885
8886 for (VPRecipeBase &R : Header->phis()) {
8887 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8888 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8889 continue;
8890
8891 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8892 RecurKind Kind = RdxDesc.getRecurrenceKind();
8894 "AnyOf reductions are not allowed for in-loop reductions");
8895
8896 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8898 Worklist.insert(PhiR);
8899 for (unsigned I = 0; I != Worklist.size(); ++I) {
8900 VPSingleDefRecipe *Cur = Worklist[I];
8901 for (VPUser *U : Cur->users()) {
8902 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8903 if (!UserRecipe) {
8904 assert(isa<VPLiveOut>(U) &&
8905 "U must either be a VPSingleDef or VPLiveOut");
8906 continue;
8907 }
8908 Worklist.insert(UserRecipe);
8909 }
8910 }
8911
8912 // Visit operation "Links" along the reduction chain top-down starting from
8913 // the phi until LoopExitValue. We keep track of the previous item
8914 // (PreviousLink) to tell which of the two operands of a Link will remain
8915 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8916 // the select instructions. Blend recipes of in-loop reduction phis will
8917 // get folded to their non-phi operand, as the reduction recipe handles the
8918 // condition directly.
8919 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8920 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8921 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8922
8923 // Index of the first operand which holds a non-mask vector operand.
8924 unsigned IndexOfFirstOperand;
8925 // Recognize a call to the llvm.fmuladd intrinsic.
8926 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8927 VPValue *VecOp;
8928 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8929 if (IsFMulAdd) {
8930 assert(
8932 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8933 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8934 isa<VPWidenCallRecipe>(CurrentLink)) &&
8935 CurrentLink->getOperand(2) == PreviousLink &&
8936 "expected a call where the previous link is the added operand");
8937
8938 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8939 // need to create an fmul recipe (multiplying the first two operands of
8940 // the fmuladd together) to use as the vector operand for the fadd
8941 // reduction.
8942 VPInstruction *FMulRecipe = new VPInstruction(
8943 Instruction::FMul,
8944 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8945 CurrentLinkI->getFastMathFlags());
8946 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8947 VecOp = FMulRecipe;
8948 } else {
8949 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
8950 if (PhiR->isInLoop() && Blend) {
8951 assert(Blend->getNumIncomingValues() == 2 &&
8952 "Blend must have 2 incoming values");
8953 if (Blend->getIncomingValue(0) == PhiR)
8954 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8955 else {
8956 assert(Blend->getIncomingValue(1) == PhiR &&
8957 "PhiR must be an operand of the blend");
8958 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8959 }
8960 continue;
8961 }
8962
8964 if (isa<VPWidenRecipe>(CurrentLink)) {
8965 assert(isa<CmpInst>(CurrentLinkI) &&
8966 "need to have the compare of the select");
8967 continue;
8968 }
8969 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8970 "must be a select recipe");
8971 IndexOfFirstOperand = 1;
8972 } else {
8973 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8974 "Expected to replace a VPWidenSC");
8975 IndexOfFirstOperand = 0;
8976 }
8977 // Note that for non-commutable operands (cmp-selects), the semantics of
8978 // the cmp-select are captured in the recurrence kind.
8979 unsigned VecOpId =
8980 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8981 ? IndexOfFirstOperand + 1
8982 : IndexOfFirstOperand;
8983 VecOp = CurrentLink->getOperand(VecOpId);
8984 assert(VecOp != PreviousLink &&
8985 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8986 (VecOpId - IndexOfFirstOperand)) ==
8987 PreviousLink &&
8988 "PreviousLink must be the operand other than VecOp");
8989 }
8990
8991 BasicBlock *BB = CurrentLinkI->getParent();
8992 VPValue *CondOp = nullptr;
8994 CondOp = RecipeBuilder.getBlockInMask(BB);
8995
8996 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8997 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
8998 // Append the recipe to the end of the VPBasicBlock because we need to
8999 // ensure that it comes after all of its inputs, including CondOp.
9000 // Note that this transformation may leave over dead recipes (including
9001 // CurrentLink), which will be cleaned by a later VPlan transform.
9002 LinkVPBB->appendRecipe(RedRecipe);
9003 CurrentLink->replaceAllUsesWith(RedRecipe);
9004 PreviousLink = RedRecipe;
9005 }
9006 }
9007 Builder.setInsertPoint(&*LatchVPBB->begin());
9008 for (VPRecipeBase &R :
9009 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9010 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9011 if (!PhiR)
9012 continue;
9013
9014 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9015 // If tail is folded by masking, introduce selects between the phi
9016 // and the live-out instruction of each reduction, at the beginning of the
9017 // dedicated latch block.
9018 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9019 auto *NewExitingVPV = PhiR->getBackedgeValue();
9020 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9021 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9022 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9023 "reduction recipe must be defined before latch");
9024 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9025 std::optional<FastMathFlags> FMFs =
9026 PhiTy->isFloatingPointTy()
9027 ? std::make_optional(RdxDesc.getFastMathFlags())
9028 : std::nullopt;
9029 NewExitingVPV =
9030 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9031 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9032 return isa<VPInstruction>(&U) &&
9033 cast<VPInstruction>(&U)->getOpcode() ==
9035 });
9038 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9040 PhiR->setOperand(1, NewExitingVPV);
9041 }
9042
9043 // If the vector reduction can be performed in a smaller type, we truncate
9044 // then extend the loop exit value to enable InstCombine to evaluate the
9045 // entire expression in the smaller type.
9046 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9047 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9048 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9049 Type *RdxTy = RdxDesc.getRecurrenceType();
9050 auto *Trunc =
9051 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9052 auto *Extnd =
9053 RdxDesc.isSigned()
9054 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9055 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9056
9057 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9058 Extnd->insertAfter(Trunc);
9059 if (PhiR->getOperand(1) == NewExitingVPV)
9060 PhiR->setOperand(1, Extnd->getVPSingleValue());
9061 NewExitingVPV = Extnd;
9062 }
9063
9064 // We want code in the middle block to appear to execute on the location of
9065 // the scalar loop's latch terminator because: (a) it is all compiler
9066 // generated, (b) these instructions are always executed after evaluating
9067 // the latch conditional branch, and (c) other passes may add new
9068 // predecessors which terminate on this line. This is the easiest way to
9069 // ensure we don't accidentally cause an extra step back into the loop while
9070 // debugging.
9071 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9072
9073 // TODO: At the moment ComputeReductionResult also drives creation of the
9074 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9075 // even for in-loop reductions, until the reduction resume value handling is
9076 // also modeled in VPlan.
9077 auto *FinalReductionResult = new VPInstruction(
9078 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9079 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9080 ->appendRecipe(FinalReductionResult);
9081 OrigExitingVPV->replaceUsesWithIf(
9082 FinalReductionResult,
9083 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9084 }
9085
9087}
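// A lane-wise model as a standalone sketch, not LLVM API (the helper name is
// made up), of the tail-folding select introduced above for a reduction that
// is not computed in-loop: active lanes take the updated partial result,
// while masked-off tail lanes keep the phi's previous value, so stray lanes
// never leak into the final ComputeReductionResult.
#if 0 // Illustrative sketch only.
#include <cstdint>
#include <vector>

static std::vector<int64_t>
foldTailOfReduction(const std::vector<bool> &HeaderMask,
                    const std::vector<int64_t> &Exiting, // updated partial sums
                    const std::vector<int64_t> &Phi) {   // values from last iter
  std::vector<int64_t> NewExiting(Exiting.size());
  for (size_t L = 0; L < Exiting.size(); ++L)
    NewExiting[L] = HeaderMask[L] ? Exiting[L] : Phi[L]; // select(Cond, X, Phi)
  return NewExiting;
}
#endif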
9088
9089#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9091 VPSlotTracker &SlotTracker) const {
9092 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9093 IG->getInsertPos()->printAsOperand(O, false);
9094 O << ", ";
9096 VPValue *Mask = getMask();
9097 if (Mask) {
9098 O << ", ";
9099 Mask->printAsOperand(O, SlotTracker);
9100 }
9101
9102 unsigned OpIdx = 0;
9103 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9104 if (!IG->getMember(i))
9105 continue;
9106 if (getNumStoreOperands() > 0) {
9107 O << "\n" << Indent << " store ";
9108 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9109 O << " to index " << i;
9110 } else {
9111 O << "\n" << Indent << " ";
9113 O << " = load from index " << i;
9114 }
9115 ++OpIdx;
9116 }
9117}
9118#endif
9119
9122 "Not a pointer induction according to InductionDescriptor!");
9123 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9124 "Unexpected type.");
9125
9126 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9127 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9128
9129 if (onlyScalarsGenerated(State.VF.isScalable())) {
9130 // This is the normalized GEP that starts counting at zero.
9131 Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9132 CanonicalIV, IndDesc.getStep()->getType());
9133 // Determine the number of scalars we need to generate for each unroll
9134 // iteration. If the instruction is uniform, we only need to generate the
9135 // first lane. Otherwise, we generate all VF values.
9136 bool IsUniform = vputils::onlyFirstLaneUsed(this);
9137 assert((IsUniform || !State.VF.isScalable()) &&
9138 "Cannot scalarize a scalable VF");
9139 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9140
9141 for (unsigned Part = 0; Part < State.UF; ++Part) {
9142 Value *PartStart =
9143 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9144
9145 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9146 Value *Idx = State.Builder.CreateAdd(
9147 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9148 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9149
9150 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9151 Value *SclrGep = emitTransformedIndex(
9152 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9153 IndDesc.getKind(), IndDesc.getInductionBinOp());
9154 SclrGep->setName("next.gep");
9155 State.set(this, SclrGep, VPIteration(Part, Lane));
9156 }
9157 }
9158 return;
9159 }
9160
9161 Type *PhiType = IndDesc.getStep()->getType();
9162
9163 // Build a pointer phi
9164 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9165 Type *ScStValueType = ScalarStartValue->getType();
9166 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9167 CanonicalIV->getIterator());
9168
9169 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9170 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9171
9172 // A pointer induction, performed by using a gep
9173 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9174
9175 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9176 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9177 Value *NumUnrolledElems =
9178 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9179 Value *InductionGEP = GetElementPtrInst::Create(
9180 State.Builder.getInt8Ty(), NewPointerPhi,
9181 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9182 InductionLoc);
9183 // Add induction update using an incorrect block temporarily. The phi node
9184 // will be fixed after VPlan execution. Note that at this point the latch
9185 // block cannot be used, as it does not exist yet.
9186 // TODO: Model increment value in VPlan, by turning the recipe into a
9187 // multi-def and a subclass of VPHeaderPHIRecipe.
9188 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9189
9190 // Create UF many actual address geps that use the pointer
9191 // phi as base and a vectorized version of the step value
9192 // (<step*0, ..., step*N>) as offset.
9193 for (unsigned Part = 0; Part < State.UF; ++Part) {
9194 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9195 Value *StartOffsetScalar =
9196 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9197 Value *StartOffset =
9198 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9199 // Create a vector of consecutive numbers from zero to VF.
9200 StartOffset = State.Builder.CreateAdd(
9201 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9202
9203 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9204 "scalar step must be the same across all parts");
9205 Value *GEP = State.Builder.CreateGEP(
9206 State.Builder.getInt8Ty(), NewPointerPhi,
9207 State.Builder.CreateMul(
9208 StartOffset,
9209 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9210 "vector.gep"));
9211 State.set(this, GEP, Part);
9212 }
9213}
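// Worked example (illustrative, fixed VF = 4, UF = 2, scalar step S in bytes):
// RuntimeVF is 4, so Part 0 uses start offsets <0,1,2,3> and Part 1 uses
// <4,5,6,7>; each per-part GEP then advances pointer.phi by offset * S bytes,
// since the GEPs above are built with an i8 element type.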
9214
9215void VPDerivedIVRecipe::execute(VPTransformState &State) {
9216 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9217
9218 // Fast-math-flags propagate from the original induction instruction.
9220 if (FPBinOp)
9221 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9222
9223 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9224 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9225 Value *DerivedIV = emitTransformedIndex(
9226 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9227 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9228 DerivedIV->setName("offset.idx");
9229 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9230
9231 State.set(this, DerivedIV, VPIteration(0, 0));
9232}
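// Worked example (illustrative): for an integer induction of the form
// start + iv * step with start 16 and step 4, a canonical IV value of 3 is
// transformed into offset.idx = 16 + 3 * 4 = 28 by emitTransformedIndex.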
9233
9234void VPInterleaveRecipe::execute(VPTransformState &State) {
9235 assert(!State.Instance && "Interleave group being replicated.");
9236 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9237 getStoredValues(), getMask(),
9238 NeedsMaskForGaps);
9239}
9240
9241void VPReductionRecipe::execute(VPTransformState &State) {
9242 assert(!State.Instance && "Reduction being replicated.");
9243 Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
9244 RecurKind Kind = RdxDesc.getRecurrenceKind();
9245 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9246 // Propagate the fast-math flags carried by the underlying instruction.
9248 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9249 for (unsigned Part = 0; Part < State.UF; ++Part) {
9250 Value *NewVecOp = State.get(getVecOp(), Part);
9251 if (VPValue *Cond = getCondOp()) {
9252 Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
9253 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9254 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9255 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9256 RdxDesc.getFastMathFlags());
9257 if (State.VF.isVector()) {
9258 Iden =
9259 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9260 }
9261
9262 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9263 NewVecOp = Select;
9264 }
9265 Value *NewRed;
9266 Value *NextInChain;
9267 if (IsOrdered) {
9268 if (State.VF.isVector())
9269 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9270 PrevInChain);
9271 else
9272 NewRed = State.Builder.CreateBinOp(
9273 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9274 NewVecOp);
9275 PrevInChain = NewRed;
9276 } else {
9277 PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
9278 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9279 }
9280 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9281 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9282 NewRed, PrevInChain);
9283 } else if (IsOrdered)
9284 NextInChain = NewRed;
9285 else
9286 NextInChain = State.Builder.CreateBinOp(
9287 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9288 State.set(this, NextInChain, Part, /*IsScalar*/ true);
9289 }
9290}
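// Worked example (illustrative): for an integer add reduction at fixed VF = 4
// with condition mask <1,0,1,1>, the select above replaces the masked-off lane
// with the identity 0, e.g. <5,9,2,4> becomes <5,0,2,4>, so inactive lanes do
// not affect the reduced value.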
9291
9292void VPReplicateRecipe::execute(VPTransformState &State) {
9293 Instruction *UI = getUnderlyingInstr();
9294 if (State.Instance) { // Generate a single instance.
9295 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9296 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9297 // Insert the scalar instance, packing it into a vector.
9298 if (State.VF.isVector() && shouldPack()) {
9299 // If we're constructing lane 0, initialize to start from poison.
9300 if (State.Instance->Lane.isFirstLane()) {
9301 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9302 Value *Poison = PoisonValue::get(
9303 VectorType::get(UI->getType(), State.VF));
9304 State.set(this, Poison, State.Instance->Part);
9305 }
9306 State.packScalarIntoVectorValue(this, *State.Instance);
9307 }
9308 return;
9309 }
9310
9311 if (IsUniform) {
9312 // If the recipe is uniform across all parts (instead of just per VF), only
9313 // generate a single instance.
9314 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9315 all_of(operands(), [](VPValue *Op) {
9316 return Op->isDefinedOutsideVectorRegions();
9317 })) {
9318 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9319 if (user_begin() != user_end()) {
9320 for (unsigned Part = 1; Part < State.UF; ++Part)
9321 State.set(this, State.get(this, VPIteration(0, 0)),
9322 VPIteration(Part, 0));
9323 }
9324 return;
9325 }
9326
9327 // Uniform within VL means we need to generate lane 0 only for each
9328 // unrolled copy.
9329 for (unsigned Part = 0; Part < State.UF; ++Part)
9330 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9331 return;
9332 }
9333
9334 // A store of a loop varying value to a uniform address only needs the last
9335 // copy of the store.
9336 if (isa<StoreInst>(UI) &&
9338 auto Lane = VPLane::getLastLaneForVF(State.VF);
9339 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9340 State);
9341 return;
9342 }
9343
9344 // Generate scalar instances for all VF lanes of all UF parts.
9345 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9346 const unsigned EndLane = State.VF.getKnownMinValue();
9347 for (unsigned Part = 0; Part < State.UF; ++Part)
9348 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9349 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9350}
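// Worked example (illustrative): for a replicated store like "*p = a[i]" with
// a loop-invariant address p, only the final value is observable, so with
// UF = 2 and fixed VF = 4 a single scalar store is emitted for the last lane
// of the last part instead of eight copies.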
9351
9353 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9354
9355 // Attempt to issue a wide load.
9356 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9357 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9358
9359 assert((LI || SI) && "Invalid Load/Store instruction");
9360 assert((!SI || StoredValue) && "No stored value provided for widened store");
9361 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9362
9363 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9364
9365 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9366 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9367 bool CreateGatherScatter = !isConsecutive();
9368
9369 auto &Builder = State.Builder;
9370 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9371 bool isMaskRequired = getMask();
9372 if (isMaskRequired) {
9373 // Mask reversal is only needed for non-all-one masks; a null (all-one) mask
9374 // reversed is still an all-one mask, so it needs no handling here.
9375 for (unsigned Part = 0; Part < State.UF; ++Part) {
9376 Value *Mask = State.get(getMask(), Part);
9377 if (isReverse())
9378 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9379 BlockInMaskParts[Part] = Mask;
9380 }
9381 }
9382
9383 // Handle Stores:
9384 if (SI) {
9385 State.setDebugLocFrom(SI->getDebugLoc());
9386
9387 for (unsigned Part = 0; Part < State.UF; ++Part) {
9388 Instruction *NewSI = nullptr;
9389 Value *StoredVal = State.get(StoredValue, Part);
9390 if (CreateGatherScatter) {
9391 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9392 Value *VectorGep = State.get(getAddr(), Part);
9393 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9394 MaskPart);
9395 } else {
9396 if (isReverse()) {
9397 // If we store to reverse consecutive memory locations, then we need
9398 // to reverse the order of elements in the stored value.
9399 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9400 // We don't want to update the value in the map as it might be used in
9401 // another expression. So don't call resetVectorValue(StoredVal).
9402 }
9403 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9404 if (isMaskRequired)
9405 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9406 BlockInMaskParts[Part]);
9407 else
9408 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9409 }
9410 State.addMetadata(NewSI, SI);
9411 }
9412 return;
9413 }
9414
9415 // Handle loads.
9416 assert(LI && "Must have a load instruction");
9417 State.setDebugLocFrom(LI->getDebugLoc());
9418 for (unsigned Part = 0; Part < State.UF; ++Part) {
9419 Value *NewLI;
9420 if (CreateGatherScatter) {
9421 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9422 Value *VectorGep = State.get(getAddr(), Part);
9423 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9424 nullptr, "wide.masked.gather");
9425 State.addMetadata(NewLI, LI);
9426 } else {
9427 auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
9428 if (isMaskRequired)
9429 NewLI = Builder.CreateMaskedLoad(
9430 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9431 PoisonValue::get(DataTy), "wide.masked.load");
9432 else
9433 NewLI =
9434 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9435
9436 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9437 State.addMetadata(NewLI, LI);
9438 if (Reverse)
9439 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9440 }
9441
9442 State.set(getVPSingleValue(), NewLI, Part);
9443 }
9444}
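// Worked example (illustrative): for a consecutive but reversed access such as
// "for (i = n - 1; i >= 0; --i) ... a[i] ...", the widened load above reads a
// contiguous vector and then applies a "reverse" shuffle (and reverses any
// mask), while a non-consecutive access is emitted as a gather or scatter.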
9445
9446// Determine how to lower the scalar epilogue, which depends on 1) optimising
9447// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9448// predication, and 4) a TTI hook that analyses whether the loop is suitable
9449// for predication.
9454 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9455 // don't look at hints or options, and don't request a scalar epilogue.
9456 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9457 // LoopAccessInfo (due to code dependency and not being able to reliably get
9458 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9459 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9460 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9461 // back to the old way and vectorize with versioning when forced. See D81345.)
9462 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9466
9467 // 2) If set, obey the directives
9468 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9476 };
9477 }
9478
9479 // 3) If set, obey the hints
9480 switch (Hints.getPredicate()) {
9485 };
9486
9487 // 4) if the TTI hook indicates this is profitable, request predication.
9488 TailFoldingInfo TFI(TLI, &LVL, IAI);
9491
9493}
9494
9495// Process the loop in the VPlan-native vectorization path. This path builds
9496 // VPlan upfront in the vectorization pipeline, which allows applying
9497// VPlan-to-VPlan transformations from the very beginning without modifying the
9498// input LLVM IR.
9505 LoopVectorizationRequirements &Requirements) {
9506
9507 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9508 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9509 return false;
9510 }
9511 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9512 Function *F = L->getHeader()->getParent();
9513 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9514
9516 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9517
9518 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9519 &Hints, IAI);
9520 // Use the planner for outer loop vectorization.
9521 // TODO: CM is not used at this point inside the planner. Turn CM into an
9522 // optional argument if we don't need it in the future.
9523 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9524 ORE);
9525
9526 // Get user vectorization factor.
9527 ElementCount UserVF = Hints.getWidth();
9528
9530
9531 // Plan how to best vectorize, return the best VF and its cost.
9532 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9533
9534 // If we are stress testing VPlan builds, do not attempt to generate vector
9535 // code. Masked vector code generation support will follow soon.
9536 // Also, do not attempt to vectorize if no vector code will be produced.
9538 return false;
9539
9540 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9541
9542 {
9543 bool AddBranchWeights =
9544 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9545 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9546 F->getParent()->getDataLayout(), AddBranchWeights);
9547 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9548 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9549 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9550 << L->getHeader()->getParent()->getName() << "\"\n");
9551 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9552 }
9553
9554 reportVectorization(ORE, L, VF, 1);
9555
9556 // Mark the loop as already vectorized to avoid vectorizing again.
9557 Hints.setAlreadyVectorized();
9558 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9559 return true;
9560}
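// Usage sketch (illustrative): the VPlan-native path is off by default and is
// gated behind the existing -enable-vplan-native-path flag, e.g.
//   opt -passes=loop-vectorize -enable-vplan-native-path -S input.ll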
9561
9562// Emit a remark if there are stores to floats that required a floating point
9563 // extension. If the vectorized loop was generated with floating point, there
9564// will be a performance penalty from the conversion overhead and the change in
9565// the vector width.
9568 for (BasicBlock *BB : L->getBlocks()) {
9569 for (Instruction &Inst : *BB) {
9570 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9571 if (S->getValueOperand()->getType()->isFloatTy())
9572 Worklist.push_back(S);
9573 }
9574 }
9575 }
9576
9577 // Traverse the floating point stores upwards, searching for floating point
9578 // conversions.
9581 while (!Worklist.empty()) {
9582 auto *I = Worklist.pop_back_val();
9583 if (!L->contains(I))
9584 continue;
9585 if (!Visited.insert(I).second)
9586 continue;
9587
9588 // Emit a remark if the floating point store required a floating
9589 // point conversion.
9590 // TODO: More work could be done to identify the root cause such as a
9591 // constant or a function return type and point the user to it.
9592 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9593 ORE->emit([&]() {
9594 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9595 I->getDebugLoc(), L->getHeader())
9596 << "floating point conversion changes vector width. "
9597 << "Mixed floating point precision requires an up/down "
9598 << "cast that will negatively impact performance.";
9599 });
9600
9601 for (Use &Op : I->operands())
9602 if (auto *OpI = dyn_cast<Instruction>(Op))
9603 Worklist.push_back(OpI);
9604 }
9605}
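// Illustrative (hypothetical) source pattern that triggers this remark:
//   float A[N]; double D;
//   for (int I = 0; I < N; ++I)
//     A[I] = A[I] + D; // A[I] is fpext'ed to double and truncated back,
//                      // halving the effective vector width.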
9606
9607static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9609 std::optional<unsigned> VScale, Loop *L,
9610 ScalarEvolution &SE,
9612 InstructionCost CheckCost = Checks.getCost();
9613 if (!CheckCost.isValid())
9614 return false;
9615
9616 // When only interleaving (VF = 1), the scalar and vector costs will be equal,
9617 // which would lead to a divide by 0. Fall back to a hard threshold.
9618 if (VF.Width.isScalar()) {
9619 if (CheckCost > VectorizeMemoryCheckThreshold) {
9620 LLVM_DEBUG(
9621 dbgs()
9622 << "LV: Interleaving only is not profitable due to runtime checks\n");
9623 return false;
9624 }
9625 return true;
9626 }
9627
9628 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9629 double ScalarC = *VF.ScalarCost.getValue();
9630 if (ScalarC == 0)
9631 return true;
9632
9633 // First, compute the minimum iteration count required so that the vector
9634 // loop outperforms the scalar loop.
9635 // The total cost of the scalar loop is
9636 // ScalarC * TC
9637 // where
9638 // * TC is the actual trip count of the loop.
9639 // * ScalarC is the cost of a single scalar iteration.
9640 //
9641 // The total cost of the vector loop is
9642 // RtC + VecC * (TC / VF) + EpiC
9643 // where
9644 // * RtC is the cost of the generated runtime checks
9645 // * VecC is the cost of a single vector iteration.
9646 // * TC is the actual trip count of the loop
9647 // * VF is the vectorization factor
9648 // * EpiC is the cost of the generated epilogue, including the cost
9649 // of the remaining scalar operations.
9650 //
9651 // Vectorization is profitable once the total vector cost is less than the
9652 // total scalar cost:
9653 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9654 //
9655 // Now we can compute the minimum required trip count TC as
9656 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9657 //
9658 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9659 // the computations are performed on doubles, not integers, and the result
9660 // is rounded up, hence we get an upper estimate of the TC.
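// Worked example (illustrative, hypothetical costs): with ScalarC = 4,
// VecC = 10, VF = 4 and RtC = 30, VecC / VF = 2.5 and
// MinTC1 = 30 / (4 - 2.5) = 20, i.e. the runtime checks pay for themselves
// once the loop runs at least about 20 iterations.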
9661 unsigned IntVF = VF.Width.getKnownMinValue();
9662 if (VF.Width.isScalable()) {
9663 unsigned AssumedMinimumVscale = 1;
9664 if (VScale)
9665 AssumedMinimumVscale = *VScale;
9666 IntVF *= AssumedMinimumVscale;
9667 }
9668 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9669 double RtC = *CheckCost.getValue();
9670 double MinTC1 = RtC / (ScalarC - VecCOverVF);
9671
9672 // Second, compute a minimum iteration count so that the cost of the
9673 // runtime checks is only a fraction of the total scalar loop cost. This
9674 // adds a loop-dependent bound on the overhead incurred if the runtime
9675 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9676 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9677 // cost, compute
9678 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9679 double MinTC2 = RtC * 10 / ScalarC;
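// Continuing the illustrative numbers above with X = 10 (the constant used
// here): MinTC2 = 30 * 10 / 4 = 75, which caps the runtime-check overhead at
// roughly a tenth of the scalar loop cost when the checks fail.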
9680
9681 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9682 // epilogue is allowed, choose the next closest multiple of VF. This should
9683 // partly compensate for ignoring the epilogue cost.
9684 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9685 if (SEL == CM_ScalarEpilogueAllowed)
9686 MinTC = alignTo(MinTC, IntVF);
9688
9689 LLVM_DEBUG(
9690 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9691 << VF.MinProfitableTripCount << "\n");
9692
9693 // Skip vectorization if the expected trip count is less than the minimum
9694 // required trip count.
9695 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9698 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9699 "trip count < minimum profitable VF ("
9700 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9701 << ")\n");
9702
9703 return false;
9704 }
9705 }
9706 return true;
9707}
9708
9710 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9712 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9714
9716 assert((EnableVPlanNativePath || L->isInnermost()) &&
9717 "VPlan-native path is not enabled. Only process inner loops.");
9718
9719#ifndef NDEBUG
9720 const std::string DebugLocStr = getDebugLocString(L);
9721#endif /* NDEBUG */
9722
9723 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9724 << L->getHeader()->getParent()->getName() << "' from "
9725 << DebugLocStr << "\n");
9726
9727 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9728
9729 LLVM_DEBUG(
9730 dbgs() << "LV: Loop hints:"
9731 << " force="
9733 ? "disabled"
9735 ? "enabled"
9736 : "?"))
9737 << " width=" << Hints.getWidth()
9738 << " interleave=" << Hints.getInterleave() << "\n");
9739
9740 // Function containing loop
9741 Function *F = L->getHeader()->getParent();
9742
9743 // Looking at the diagnostic output is the only way to determine if a loop
9744 // was vectorized (other than looking at the IR or machine code), so it
9745 // is important to generate an optimization remark for each loop. Most of
9746 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9747 // generated as OptimizationRemark and OptimizationRemarkMissed are
9748 // less verbose reporting vectorized loops and unvectorized loops that may
9749 // benefit from vectorization, respectively.
9750
9751 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9752 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9753 return false;
9754 }
9755
9756 PredicatedScalarEvolution PSE(*SE, *L);
9757
9758 // Check if it is legal to vectorize the loop.
9759 LoopVectorizationRequirements Requirements;
9760 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9761 &Requirements, &Hints, DB, AC, BFI, PSI);
9763 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9764 Hints.emitRemarkWithHints();
9765 return false;
9766 }
9767
9768 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9769 // here. They may require CFG and instruction level transformations before
9770 // even evaluating whether vectorization is profitable. Since we cannot modify
9771 // the incoming IR, we need to build VPlan upfront in the vectorization
9772 // pipeline.
9773 if (!L->isInnermost())
9774 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9775 ORE, BFI, PSI, Hints, Requirements);
9776
9777 assert(L->isInnermost() && "Inner loop expected.");
9778
9779 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9780 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9781
9782 // If an override option has been passed in for interleaved accesses, use it.
9783 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9784 UseInterleaved = EnableInterleavedMemAccesses;
9785
9786 // Analyze interleaved memory accesses.
9787 if (UseInterleaved)
9789
9790 // Check the function attributes and profiles to find out if this function
9791 // should be optimized for size.
9793 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9794
9795 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9796 // count by optimizing for size, to minimize overheads.
9797 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9798 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9799 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9800 << "This loop is worth vectorizing only if no scalar "
9801 << "iteration overheads are incurred.");
9803 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9804 else {
9805 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9806 LLVM_DEBUG(dbgs() << "\n");
9807 // Predicate tail-folded loops are efficient even when the loop
9808 // iteration count is low. However, setting the epilogue policy to
9809 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9810 // with runtime checks. It's more effective to let
9811 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9812 // for the loop.
9815 } else {
9816 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9817 "small to consider vectorizing.\n");
9819 "The trip count is below the minial threshold value.",
9820 "loop trip count is too low, avoiding vectorization",
9821 "LowTripCount", ORE, L);
9822 Hints.emitRemarkWithHints();
9823 return false;
9824 }
9825 }
9826 }
9827
9828 // Check the function attributes to see if implicit floats or vectors are
9829 // allowed.
9830 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9832 "Can't vectorize when the NoImplicitFloat attribute is used",
9833 "loop not vectorized due to NoImplicitFloat attribute",
9834 "NoImplicitFloat", ORE, L);
9835 Hints.emitRemarkWithHints();
9836 return false;
9837 }
9838
9839 // Check if the target supports potentially unsafe FP vectorization.
9840 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9841 // for the target we're vectorizing for, to make sure none of the
9842 // additional fp-math flags can help.
9843 if (Hints.isPotentiallyUnsafe() &&
9846 "Potentially unsafe FP op prevents vectorization",
9847 "loop not vectorized due to unsafe FP support.",
9848 "UnsafeFP", ORE, L);
9849 Hints.emitRemarkWithHints();
9850 return false;
9851 }
9852
9853 bool AllowOrderedReductions;
9854 // If the flag is set, use that instead and override the TTI behaviour.
9855 if (ForceOrderedReductions.getNumOccurrences() > 0)
9856 AllowOrderedReductions = ForceOrderedReductions;
9857 else
9858 AllowOrderedReductions = TTI->enableOrderedReductions();
9859 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9860 ORE->emit([&]() {
9861 auto *ExactFPMathInst = Requirements.getExactFPInst();
9862 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9863 ExactFPMathInst->getDebugLoc(),
9864 ExactFPMathInst->getParent())
9865 << "loop not vectorized: cannot prove it is safe to reorder "
9866 "floating-point operations";
9867 });
9868 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9869 "reorder floating-point operations\n");
9870 Hints.emitRemarkWithHints();
9871 return false;
9872 }
9873
9874 // Use the cost model.
9875 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9876 F, &Hints, IAI);
9877 // Use the planner for vectorization.
9878 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9879 ORE);
9880
9881 // Get user vectorization factor and interleave count.
9882 ElementCount UserVF = Hints.getWidth();
9883 unsigned UserIC = Hints.getInterleave();
9884
9885 // Plan how to best vectorize, return the best VF and its cost.
9886 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9887
9889 unsigned IC = 1;
9890
9891 bool AddBranchWeights =
9892 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9893 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9894 F->getParent()->getDataLayout(), AddBranchWeights);
9895 if (MaybeVF) {
9896 VF = *MaybeVF;
9897 // Select the interleave count.
9898 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9899
9900 unsigned SelectedIC = std::max(IC, UserIC);
9901 // Optimistically generate runtime checks if they are needed. Drop them if
9902 // they turn out to not be profitable.
9903 if (VF.Width.isVector() || SelectedIC > 1)
9904 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9905
9906 // Check if it is profitable to vectorize with runtime checks.
9907 bool ForceVectorization =
9909 if (!ForceVectorization &&
9911 *PSE.getSE(), SEL)) {
9912 ORE->emit([&]() {
9914 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9915 L->getHeader())
9916 << "loop not vectorized: cannot prove it is safe to reorder "
9917 "memory operations";
9918 });
9919 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9920 Hints.emitRemarkWithHints();
9921 return false;
9922 }
9923 }
9924
9925 // Identify the diagnostic messages that should be produced.
9926 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9927 bool VectorizeLoop = true, InterleaveLoop = true;
9928 if (VF.Width.isScalar()) {
9929 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9930 VecDiagMsg = std::make_pair(
9931 "VectorizationNotBeneficial",
9932 "the cost-model indicates that vectorization is not beneficial");
9933 VectorizeLoop = false;
9934 }
9935
9936 if (!MaybeVF && UserIC > 1) {
9937 // Tell the user interleaving was avoided up-front, despite being explicitly
9938 // requested.
9939 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9940 "interleaving should be avoided up front\n");
9941 IntDiagMsg = std::make_pair(
9942 "InterleavingAvoided",
9943 "Ignoring UserIC, because interleaving was avoided up front");
9944 InterleaveLoop = false;
9945 } else if (IC == 1 && UserIC <= 1) {
9946 // Tell the user interleaving is not beneficial.
9947 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9948 IntDiagMsg = std::make_pair(
9949 "InterleavingNotBeneficial",
9950 "the cost-model indicates that interleaving is not beneficial");
9951 InterleaveLoop = false;
9952 if (UserIC == 1) {
9953 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9954 IntDiagMsg.second +=
9955 " and is explicitly disabled or interleave count is set to 1";
9956 }
9957 } else if (IC > 1 && UserIC == 1) {
9958 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9959 LLVM_DEBUG(
9960 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9961 IntDiagMsg = std::make_pair(
9962 "InterleavingBeneficialButDisabled",
9963 "the cost-model indicates that interleaving is beneficial "
9964 "but is explicitly disabled or interleave count is set to 1");
9965 InterleaveLoop = false;
9966 }
9967
9968 // Override IC if user provided an interleave count.
9969 IC = UserIC > 0 ? UserIC : IC;
9970
9971 // Emit diagnostic messages, if any.
9972 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9973 if (!VectorizeLoop && !InterleaveLoop) {
9974 // Do not vectorize or interleave the loop.
9975 ORE->emit([&]() {
9976 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9977 L->getStartLoc(), L->getHeader())
9978 << VecDiagMsg.second;
9979 });
9980 ORE->emit([&]() {
9981 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9982 L->getStartLoc(), L->getHeader())
9983 << IntDiagMsg.second;
9984 });
9985 return false;
9986 } else if (!VectorizeLoop && InterleaveLoop) {
9987 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9988 ORE->emit([&]() {
9989 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9990 L->getStartLoc(), L->getHeader())
9991 << VecDiagMsg.second;
9992 });
9993 } else if (VectorizeLoop && !InterleaveLoop) {
9994 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9995 << ") in " << DebugLocStr << '\n');
9996 ORE->emit([&]() {
9997 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9998 L->getStartLoc(), L->getHeader())
9999 << IntDiagMsg.second;
10000 });
10001 } else if (VectorizeLoop && InterleaveLoop) {
10002 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10003 << ") in " << DebugLocStr << '\n');
10004 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10005 }
10006
10007 bool DisableRuntimeUnroll = false;
10008 MDNode *OrigLoopID = L->getLoopID();
10009 {
10010 using namespace ore;
10011 if (!VectorizeLoop) {
10012 assert(IC > 1 && "interleave count should not be 1 or 0");
10013 // If we decided that it is not legal to vectorize the loop, then
10014 // interleave it.
10015 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10016 &CM, BFI, PSI, Checks);
10017
10018 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10019 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10020
10021 ORE->emit([&]() {
10022 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10023 L->getHeader())
10024 << "interleaved loop (interleaved count: "
10025 << NV("InterleaveCount", IC) << ")";
10026 });
10027 } else {
10028 // If we decided that it is *legal* to vectorize the loop, then do it.
10029
10030 // Consider vectorizing the epilogue too if it's profitable.
10031 VectorizationFactor EpilogueVF =
10033 if (EpilogueVF.Width.isVector()) {
10034
10035 // The first pass vectorizes the main loop and creates a scalar epilogue
10036 // to be vectorized by executing the plan (potentially with a different
10037 // factor) again shortly afterwards.
10038 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10039 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10040 EPI, &LVL, &CM, BFI, PSI, Checks);
10041
10042 std::unique_ptr<VPlan> BestMainPlan(
10044 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10045 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10046 ++LoopsVectorized;
10047
10048 // Second pass vectorizes the epilogue and adjusts the control flow
10049 // edges from the first pass.
10050 EPI.MainLoopVF = EPI.EpilogueVF;
10051 EPI.MainLoopUF = EPI.EpilogueUF;
10052 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10053 ORE, EPI, &LVL, &CM, BFI, PSI,
10054 Checks);
10055
10056 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10057 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10058 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10059 Header->setName("vec.epilog.vector.body");
10060
10061 // Re-use the trip count and steps expanded for the main loop, as
10062 // skeleton creation needs it as a value that dominates both the scalar
10063 // and vector epilogue loops
10064 // TODO: This is a workaround needed for epilogue vectorization and it
10065 // should be removed once induction resume value creation is done
10066 // directly in VPlan.
10067 EpilogILV.setTripCount(MainILV.getTripCount());
10068 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10069 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10070 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10071 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10072 ExpandR->replaceAllUsesWith(ExpandedVal);
10073 if (BestEpiPlan.getTripCount() == ExpandR)
10074 BestEpiPlan.resetTripCount(ExpandedVal);
10075 ExpandR->eraseFromParent();
10076 }
10077
10078 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10079 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10080 // before vectorizing the epilogue loop.
10081 for (VPRecipeBase &R : Header->phis()) {
10082 if (isa<VPCanonicalIVPHIRecipe>(&R))
10083 continue;
10084
10085 Value *ResumeV = nullptr;
10086 // TODO: Move setting of resume values to prepareToExecute.
10087 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10088 ResumeV = ReductionResumeValues
10089 .find(&ReductionPhi->getRecurrenceDescriptor())
10090 ->second;
10091 } else {
10092 // Create induction resume values for both widened pointer and
10093 // integer/fp inductions and update the start value of the induction
10094 // recipes to use the resume value.
10095 PHINode *IndPhi = nullptr;
10096 const InductionDescriptor *ID;
10097 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10098 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10099 ID = &Ind->getInductionDescriptor();
10100 } else {
10101 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10102 IndPhi = WidenInd->getPHINode();
10103 ID = &WidenInd->getInductionDescriptor();
10104 }
10105
10106 ResumeV = MainILV.createInductionResumeValue(
10107 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10109 }
10110 assert(ResumeV && "Must have a resume value");
10111 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10112 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10113 }
10114
10115 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10116 DT, true, &ExpandedSCEVs);
10117 ++LoopsEpilogueVectorized;
10118
10119 if (!MainILV.areSafetyChecksAdded())
10120 DisableRuntimeUnroll = true;
10121 } else {
10122 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10123 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10124 PSI, Checks);
10125
10126 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10127 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10128 ++LoopsVectorized;
10129
10130 // Add metadata to disable runtime unrolling a scalar loop when there
10131 // are no runtime checks about strides and memory. A scalar loop that is
10132 // rarely used is not worth unrolling.
10133 if (!LB.areSafetyChecksAdded())
10134 DisableRuntimeUnroll = true;
10135 }
10136 // Report the vectorization decision.
10137 reportVectorization(ORE, L, VF, IC);
10138 }
10139
10142 }
10143
10144 std::optional<MDNode *> RemainderLoopID =
10147 if (RemainderLoopID) {
10148 L->setLoopID(*RemainderLoopID);
10149 } else {
10150 if (DisableRuntimeUnroll)
10152
10153 // Mark the loop as already vectorized to avoid vectorizing again.
10154 Hints.setAlreadyVectorized();
10155 }
10156
10157 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10158 return true;
10159}
10160
10166 SE = &SE_;
10167 LI = &LI_;
10168 TTI = &TTI_;
10169 DT = &DT_;
10170 BFI = BFI_;
10171 TLI = TLI_;
10172 AC = &AC_;
10173 LAIs = &LAIs_;
10174 DB = &DB_;
10175 ORE = &ORE_;
10176 PSI = PSI_;
10177
10178 // Don't attempt if
10179 // 1. the target claims to have no vector registers, and
10180 // 2. interleaving won't help ILP.
10181 //
10182 // The second condition is necessary because, even if the target has no
10183 // vector registers, loop vectorization may still enable scalar
10184 // interleaving.
10187 return LoopVectorizeResult(false, false);
10188
10189 bool Changed = false, CFGChanged = false;
10190
10191 // The vectorizer requires loops to be in simplified form.
10192 // Since simplification may add new inner loops, it has to run before the
10193 // legality and profitability checks. This means running the loop vectorizer
10194 // will simplify all loops, regardless of whether anything ends up being
10195 // vectorized.
10196 for (const auto &L : *LI)
10197 Changed |= CFGChanged |=
10198 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10199
10200 // Build up a worklist of inner-loops to vectorize. This is necessary as
10201 // the act of vectorizing or partially unrolling a loop creates new loops
10202 // and can invalidate iterators across the loops.
10203 SmallVector<Loop *, 8> Worklist;
10204
10205 for (Loop *L : *LI)
10206 collectSupportedLoops(*L, LI, ORE, Worklist);
10207
10208 LoopsAnalyzed += Worklist.size();
10209
10210 // Now walk the identified inner loops.
10211 while (!Worklist.empty()) {
10212 Loop *L = Worklist.pop_back_val();
10213
10214 // For the inner loops we actually process, form LCSSA to simplify the
10215 // transform.
10216 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10217
10218 Changed |= CFGChanged |= processLoop(L);
10219
10220 if (Changed) {
10221 LAIs->clear();
10222
10223#ifndef NDEBUG
10224 if (VerifySCEV)
10225 SE->verify();
10226#endif
10227 }
10228 }
10229
10230 // Process each loop nest in the function.
10231 return LoopVectorizeResult(Changed, CFGChanged);
10232}
10233
10236 auto &LI = AM.getResult<LoopAnalysis>(F);
10237 // There are no loops in the function. Return before computing other expensive
10238 // analyses.
10239 if (LI.empty())
10240 return PreservedAnalyses::all();
10242 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10243 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10244 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10245 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10246 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10248
10250 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10252 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10253 BlockFrequencyInfo *BFI = nullptr;
10254 if (PSI && PSI->hasProfileSummary())
10256 LoopVectorizeResult Result =
10257 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10258 if (!Result.MadeAnyChange)
10259 return PreservedAnalyses::all();
10261
10262 if (isAssignmentTrackingEnabled(*F.getParent())) {
10263 for (auto &BB : F)
10265 }
10266
10267 // We currently do not preserve loopinfo/dominator analyses with outer loop
10268 // vectorization. Until this is addressed, mark these analyses as preserved
10269 // only for non-VPlan-native path.
10270 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10271 if (!EnableVPlanNativePath) {
10272 PA.preserve<LoopAnalysis>();
10275 }
10276
10277 if (Result.MadeCFGChange) {
10278 // Making CFG changes likely means a loop got vectorized. Indicate that
10279 // extra simplification passes should be run.
10280 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10281 // be run if runtime checks have been added.
10284 } else {
10286 }
10287 return PA;
10288}
10289
10291 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10292 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10293 OS, MapClassName2PassName);
10294
10295 OS << '<';
10296 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10297 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10298 OS << '>';
10299}
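// Illustrative output of printPipeline above with both options at their
// defaults:
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>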
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
uint64_t Addr
std::string Name
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check")))
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static std::optional< unsigned > getSmallBestKnownTC(ScalarEvolution &SE, Loop *L)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void createAndCollectMergePhiForReduction(VPInstruction *RedResult, DenseMap< const RecurrenceDescriptor *, Value * > &ReductionResumeValues, VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock)
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::string getDebugLocString(const Loop *L)
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
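The command-line knobs above all follow the same cl::opt pattern. A minimal sketch of such a declaration, using a hypothetical flag name purely for illustration (not an option the pass actually defines):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical flag, shown only to illustrate the cl::opt pattern used above.
static cl::opt<unsigned> ExampleInterleaveLimit(
    "example-interleave-limit", cl::init(4), cl::Hidden,
    cl::desc("Illustrative only: an upper bound on the interleave count."));

// The option reads like a plain unsigned wherever it is needed, e.g.
//   if (IC > ExampleInterleaveLimit) IC = ExampleInterleaveLimit;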
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
This file contains the declarations for profiling metadata utility functions.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:411
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:429
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:498
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:354
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:347
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:439
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:164
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:155
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:220
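A small sketch of how the BasicBlock accessors listed above are typically combined (assumes an existing block BB; illustrative only, not code from this pass):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void inspectBlock(BasicBlock *BB) {
  // Walk the PHI nodes that form the prefix of the block.
  for (PHINode &Phi : BB->phis())
    (void)Phi;
  // First non-PHI instruction and the block terminator.
  Instruction *FirstNonPhi = BB->getFirstNonPHI();
  Instruction *Term = BB->getTerminator();
  // Unique predecessor, or null if the block has zero or several.
  BasicBlock *Pred = BB->getSinglePredecessor();
  (void)FirstNonPhi; (void)Term; (void)Pred;
}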
BinaryOps getOpcode() const
Definition: InstrTypes.h:491
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2179
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1703
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1648
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1639
unsigned arg_size() const
Definition: InstrTypes.h:1646
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:965
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:996
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:988
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:990
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:991
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
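A minimal DenseMap usage sketch covering the members listed above (key and value types chosen arbitrarily for illustration):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

static void denseMapSketch() {
  DenseMap<unsigned, unsigned> Widths;
  Widths.insert({0, 32});          // returns std::pair<iterator, bool>
  if (!Widths.contains(1))
    Widths[1] = 64;
  unsigned W = Widths.lookup(2);   // default-constructed (0) if absent
  auto It = Widths.find(0);        // end() if not found
  if (It != Widths.end())
    W += It->second;
  (void)W;
}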
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
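A hedged sketch of how dominator-tree updates like those listed above are applied when a new block is spliced into the CFG (assumes NewBB, its dominator DomBB, and a reparented Child already exist; illustrative only):

#include <cassert>
#include "llvm/IR/Dominators.h"
using namespace llvm;

static void updateDomTree(DominatorTree &DT, BasicBlock *NewBB,
                          BasicBlock *DomBB, BasicBlock *Child) {
  // Register the freshly created block under its immediate dominator.
  DT.addNewBlock(NewBB, DomBB);
  // NewBB now dominates Child, so reparent Child's tree node.
  DT.changeImmediateDominator(DT.getNode(Child), DT.getNode(NewBB));
  assert(DT.verify() && "dominator tree left inconsistent");
}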
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:311
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:302
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
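ElementCount is how the vectorizer represents both fixed and scalable vectorization factors. A minimal sketch:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void elementCountSketch() {
  ElementCount Fixed4 = ElementCount::getFixed(4);     // <4 x ty>
  ElementCount Scal2  = ElementCount::getScalable(2);  // <vscale x 2 x ty>
  ElementCount One    = ElementCount::get(1, /*Scalable=*/false);
  (void)Fixed4.isVector();   // true: more than one element
  (void)Scal2.isScalable();  // true: a multiple of vscale
  (void)One.isScalar();      // true: exactly one element
}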
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:318
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:200
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:695
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:677
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:669
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1715
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:505
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2443
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1806
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1214
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2499
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:460
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:520
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:305
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1170
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:480
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2188
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2224
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1338
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2477
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1321
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:465
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1660
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1825
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1865
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2334
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:510
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1398
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:109
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2043
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1355
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
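A hedged sketch tying together a few of the IRBuilder calls listed above (assumes an insertion block BB plus Ptr and Mask values of suitable types; illustrative only, not code from this pass):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitMaskedSum(BasicBlock *BB, Value *Ptr, Value *Mask) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append new instructions to the end of BB
  Type *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  // Load <4 x i32> under Mask, with poison as the pass-through value.
  Value *Vec = Builder.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask,
                                        PoisonValue::get(VecTy));
  // Splat a constant and add it lane-wise.
  Value *Splat = Builder.CreateVectorSplat(4, Builder.getInt32(1));
  return Builder.CreateAdd(Vec, Splat, "sum");
}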
A struct for saving information about induction variables.
BinaryOperator * getInductionBinOp() const
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitablity analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State)
Create code for the loop exit value of the reduction.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc)
Returns true if the reordering of FP operations is not allowed, but we are able to vectorize with str...
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State)
Create the exit value of first order recurrences in the middle block and update their users.
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:453
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:80
const BasicBlock * getParent() const
Definition: Instruction.h:151
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:148
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:251
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:450
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:444
uint32_t getFactor() const
Definition: VectorUtils.h:460
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:514
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:521
bool isReverse() const
Definition: VectorUtils.h:459
InstTy * getInsertPos() const
Definition: VectorUtils.h:530
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
Align getAlign() const
Definition: VectorUtils.h:461
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:586
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:631
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:642
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:623
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:606
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:636
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
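A hedged sketch of querying an already-populated InterleavedAccessInfo for the group a memory instruction belongs to (assumes analyzeInterleaving has run; illustrative only):

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void walkGroup(InterleavedAccessInfo &IAI, Instruction *I) {
  if (!IAI.isInterleaved(I))
    return;
  const InterleaveGroup<Instruction> *Group = IAI.getInterleaveGroup(I);
  // Visit every member of the group; gaps show up as null members.
  for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx)
    if (Instruction *Member = Group->getMember(Idx))
      (void)Member;
  // Groups with gaps may force a scalar epilogue to avoid accessing
  // memory past the end of the underlying object.
  (void)IAI.requiresScalarEpilogue();
}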
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
Definition: Interval.h:36
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1222
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
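A small sketch of the LoopInfo and Loop queries listed above (assumes LI and a block BB; illustrative only):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

static void inspectLoop(LoopInfo &LI, BasicBlock *BB) {
  Loop *L = LI.getLoopFor(BB); // innermost loop containing BB, or null
  if (!L || !L->isInnermost())
    return;
  BasicBlock *Preheader = L->getLoopPreheader(); // null if no preheader
  BasicBlock *Latch = L->getLoopLatch();         // null if several latches
  BasicBlock *Header = L->getHeader();
  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  (void)Preheader; (void)Latch; (void)Header;
}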
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
bool isAccessInterleaved(Instruction *Instr)
Check if Instr belongs to any interleaved access group.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::pair< InstructionCost, bool > VectorizationCostTy
The vectorization cost is a combination of the cost itself and a boolean indicating whether any of th...
DemandedBits * DB
Demanded bits analysis.
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
VectorizationCostTy expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
A memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr)
Get the interleaved access group that Instr belongs to.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void setTailFoldingStyles()
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool prepareToFoldTailByMasking()
Return true if we can vectorize this loop while folding its tail by masking, and mark all respective ...
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:66
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:501
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1068
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:597
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:191
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:783
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
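The PHINode operations listed above are what building a resume value boils down to. A hedged sketch (the helper name and parameters are hypothetical, not the pass's actual code):

#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *makeResumePhi(Type *Ty, BasicBlock *ScalarPreheader,
                              Value *FromVectorLoop, BasicBlock *MiddleBlock,
                              Value *OriginalStart, BasicBlock *BypassBlock) {
  // Reserve two incoming edges and insert at the top of the preheader.
  PHINode *Resume = PHINode::Create(Ty, /*NumReservedValues=*/2, "resume",
                                    ScalarPreheader->begin());
  Resume->addIncoming(FromVectorLoop, MiddleBlock);
  Resume->addIncoming(OriginalStart, BypassBlock);
  return Resume;
}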
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF) const
Returns identity corresponding to the RecurrenceKind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
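A hedged sketch of the kind of queries made on a RecurrenceDescriptor once legality analysis has identified a reduction (the predicate and its name are illustrative, not the pass's actual heuristic):

#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;

static bool needsSpecialLowering(const RecurrenceDescriptor &RdxDesc) {
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Strict (in-order) FP reductions must not be reassociated.
  if (RdxDesc.isOrdered())
    return true;
  // Min/max recurrences are handled separately from plain add/mul chains.
  return RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind);
}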
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
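A minimal sketch of the ScalarEvolution trip-count queries listed above (assumes SE and a loop L; illustrative only):

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static void tripCountSketch(ScalarEvolution &SE, const Loop *L) {
  // Exact trip count if it is a known small constant, otherwise 0.
  unsigned Exact = SE.getSmallConstantTripCount(L);
  // Upper bound on the trip count, otherwise 0.
  unsigned Max = SE.getSmallConstantMaxTripCount(L);
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  (void)Exact; (void)Max; (void)BTC;
}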
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:690
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
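An illustrative sketch (not from this pass) of the small-size-optimized ADT containers referenced above; values and names are arbitrary.

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Count distinct non-null pointers; SmallPtrSet avoids heap allocation while
// the set holds at most 8 elements.
static unsigned countUniqueNonNull(const SmallVectorImpl<int *> &Ptrs) {
  SmallPtrSet<int *, 8> Seen;
  unsigned Unique = 0;
  for (int *P : Ptrs)
    if (P && Seen.insert(P).second) // insert() reports whether P was new.
      ++Unique;
  return Unique;
}

// SmallSetVector: set semantics (no duplicates) plus stable insertion order,
// a common worklist idiom.
static void processWorklist() {
  SmallSetVector<int, 4> Worklist;
  Worklist.insert(1);
  Worklist.insert(2);
  Worklist.insert(1); // ignored, already present
  while (!Worklist.empty())
    (void)Worklist.pop_back_val(); // visits 2, then 1
}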
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
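A rough illustration (assumptions, not code from this file) of how the cost-model queries above are combined when choosing between a contiguous masked load and a gather; the helper name and policy are made up.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Hypothetical helper: cost of the cheaper legal way to vectorize LI under a
// mask, either as a wide masked load of VecTy or as a gather.
static InstructionCost chooseMaskedLoadCost(const TargetTransformInfo &TTI,
                                            LoadInst *LI, VectorType *VecTy) {
  const Align Alignment = LI->getAlign();
  const unsigned AS = LI->getPointerAddressSpace();
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;

  InstructionCost Contiguous = InstructionCost::getInvalid();
  if (TTI.isLegalMaskedLoad(VecTy, Alignment))
    Contiguous = TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                           AS, CostKind);

  InstructionCost Gather = InstructionCost::getInvalid();
  if (TTI.isLegalMaskedGather(VecTy, Alignment))
    Gather = TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                        LI->getPointerOperand(),
                                        /*VariableMask=*/true, Alignment,
                                        CostKind, LI);

  // Prefer the contiguous masked load when it is legal and no more expensive.
  if (Contiguous.isValid() && (!Gather.isValid() || Contiguous <= Gather))
    return Contiguous;
  return Gather;
}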
Value handle that tracks a Value across RAUW.
Definition: ValueHandle.h:331
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2594
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:2662
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:452
iterator end()
Definition: VPlan.h:2625
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:2623
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:2672
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:210
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:2653
bool empty() const
Definition: VPlan.h:2634
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:1882
VPRegionBlock * getParent()
Definition: VPlan.h:493
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:175
void setName(const Twine &newName)
Definition: VPlan.h:486
VPlan * getPlan()
Definition: VPlan.cpp:148
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:153
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:528
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3178
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
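A sketch of the VPBuilder interface listed above. VPBuilder and the VP* classes are vectorizer-internal and evolve quickly, so this is illustrative only; the helper name is hypothetical, and the declaring header is one of the pass's local headers.

#include "VPlan.h" // vectorizer-internal header(s) declare VPBuilder and VP* classes
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Append Cmp = icmp eq A, B and Sel = select Cmp, X, Y at the end of VPBB.
static VPValue *emitSelectOnEquality(VPBuilder &Builder, VPBasicBlock *VPBB,
                                     VPValue *A, VPValue *B, VPValue *X,
                                     VPValue *Y) {
  Builder.setInsertPoint(VPBB);
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, A, B);
  return Builder.createSelect(Cmp, X, Y, DebugLoc());
}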
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2363
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:420
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:398
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:410
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPCanonicalIVPHIRecipe * getCanonicalIV() const
Definition: VPlan.h:2529
VPValue * getStepValue() const
Definition: VPlan.h:2532
VPValue * getStartValue() const
Definition: VPlan.h:2528
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1569
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1613
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1602
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1139
@ FirstOrderRecurrenceSplice
Definition: VPlan.h:1145
unsigned getOpcode() const
Definition: VPlan.h:1209
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition: VPlan.h:1936
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:1977
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:1983
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:1990
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2010
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:169
static VPLane getFirstLane()
Definition: VPlan.h:167
A value that is used outside the VPlan.
Definition: VPlan.h:673
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:713
VPBasicBlock * getParent()
Definition: VPlan.h:738
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
void createHeaderMask(VPlan &Plan)
Create the mask for the vector loop header block.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan)
A helper function that computes the predicate of the edge between SRC and DST.
void createBlockInMask(BasicBlock *BB, VPlan &Plan)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void recordRecipeOf(Instruction *I)
Mark given ingredient for recording its recipe once one is created for it.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range, VPlan &Plan)
Build a VPReplicateRecipe for I.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1068
A recipe for handling reduction phis.
Definition: VPlan.h:1823
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:1877
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:1869
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2025
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition: VPlan.h:2060
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition: VPlan.h:2062
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition: VPlan.h:2058
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:2727
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:2798
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2071
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:830
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:887
This class can be used to assign consecutive numbers to all VPValues in a VPlan and allows querying t...
Definition: VPlanValue.h:448
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:204
operand_range operands()
Definition: VPlanValue.h:279
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:259
unsigned getNumOperands() const
Definition: VPlanValue.h:253
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:254
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:248
Value * getUnderlyingValue()
Return the underlying Value attached to this VPValue.
Definition: VPlanValue.h:78
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1312
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1280
user_iterator user_begin()
Definition: VPlanValue.h:130
unsigned getNumUsers() const
Definition: VPlanValue.h:113
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:174
user_iterator user_end()
Definition: VPlanValue.h:132
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:169
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1284
user_range users()
Definition: VPlanValue.h:134
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1513
A recipe for widening Call instructions.
Definition: VPlan.h:1398
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2449
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1309
A recipe for handling GEP instructions.
Definition: VPlan.h:1471
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1626
A Recipe for widening load/store operations.
Definition: VPlan.h:2230
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2287
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2281
void execute(VPTransformState &State) override
Generate the wide load/store.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2296
bool isStore() const
Returns true if this recipe is a store.
Definition: VPlan.h:2293
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1751
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1790
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1787
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a widened (vector) copy of its ingredient.
Definition: VPlan.h:1277
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:2828
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:792
VPBasicBlock * getEntry()
Definition: VPlan.h:2925
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:2950
void setName(const Twine &newName)
Definition: VPlan.h:2985
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:2953
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:2929
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:2943
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3061
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1003
VPBasicBlock * getPreheader()
Definition: VPlan.h:3080
VPValue * getVPValueOrAddLiveIn(Value *V)
Gets the VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3006
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3042
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE)
Create initial VPlan skeleton, having an "entry" VPBasicBlock (wrapping original scalar pre-header) w...
Definition: VPlan.cpp:778
bool hasVF(ElementCount VF)
Definition: VPlan.h:2967
bool hasUF(unsigned UF) const
Definition: VPlan.h:2974
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:2936
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1000
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values in the range Operands to their corresponding VPValues.
Definition: VPlan.h:3034
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:834
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3050
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3066
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3070
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1084
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
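A small illustrative sketch of the Type and VectorType queries above, widening a scalar type to a vector type; the helper loosely mirrors the ToVectorTy utility also listed on this page but is not taken from it.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"
#include <cassert>

using namespace llvm;

// Wrap a scalar type in a vector type with EC elements; vector and void
// types are returned unchanged for simplicity.
static Type *widenToVector(Type *Scalar, ElementCount EC) {
  if (Scalar->isVectorTy() || Scalar->isVoidTy())
    return Scalar;
  assert(VectorType::isValidElementType(Scalar) &&
         "element type is not valid for vectors");
  return VectorType::get(Scalar, EC);
}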
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:217
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:243
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:210
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:224
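The ElementCount comparators above are used roughly as in this illustrative snippet; the clamping policy is an assumption, not the vectorizer's actual logic.

#include "llvm/Support/TypeSize.h"
#include <cassert>

using namespace llvm;

// Clamp a candidate VF to MaxVF of the same scalability; the "isKnown"
// comparators are needed because fixed and scalable counts are only
// partially ordered.
static ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
  assert(VF.isScalable() == MaxVF.isScalable() && "mixed scalability");
  return ElementCount::isKnownLE(VF, MaxVF) ? VF : MaxVF;
}

// Example: clampVF(ElementCount::getFixed(16), ElementCount::getFixed(8))
// yields a fixed VF of 8; ElementCount::getScalable(4) denotes vscale x 4.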
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:765
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
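An illustrative use of the PatternMatch helpers above (names arbitrary): recognize a single-use multiply of two sign- or zero-extended values, the shape an extended multiply-accumulate reduction looks for.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool matchExtendedMul(Value *V, Value *&A, Value *&B) {
  // Captures the pre-extension operands into A and B on success.
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}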
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1422
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3402
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
Definition: VPlan.cpp:1412
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:456
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1820
void stable_sort(R &&Range)
Definition: STLExtras.h:2004
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
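For concreteness, divideCeil is the kind of rounding-up division used when, for example, converting a scalar trip count into a number of vector iterations; a tiny sketch with arbitrary values.

#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

// Number of vector iterations needed to cover TC scalar iterations at a
// fixed VF, rounding up: e.g. divideCeil(10, 4) == 3.
static uint64_t vectorIterationsFor(uint64_t TC, uint64_t VF) {
  return divideCeil(TC, VF);
}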
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2415
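An illustrative snippet (arbitrary data) combining the range helpers listed on this page: all_of, any_of, count_if and enumerate over a SmallVector.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

static bool rangeHelpersDemo() {
  SmallVector<int, 4> Widths = {8, 16, 32, 64};

  bool AllPow2 = all_of(Widths, [](int W) { return isPowerOf2_32(W); });
  bool AnyWide = any_of(Widths, [](int W) { return W > 32; });
  auto NumSmall = count_if(Widths, [](int W) { return W <= 16; });

  // enumerate pairs each element with its index: (0,8), (1,16), ...
  unsigned Sum = 0;
  for (auto [Idx, W] : enumerate(Widths))
    Sum += unsigned(Idx) + unsigned(W);

  return AllPow2 && AnyWide && NumSmall == 2 && Sum > 0;
}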
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:6966
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2082
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
Definition: LoopUtils.cpp:1046
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:134
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1745
cl::opt< bool > EnableLoopVectorization
Value * createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence descriptor Desc.
Definition: LoopUtils.cpp:1223
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:581
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:133
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2322
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
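The shuffle-mask helpers above build the constant masks used when lowering interleave groups; the contents shown in the comments follow their documented behavior (a sketch, not code from this pass).

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

static void interleaveMaskExamples() {
  // Interleave NumVecs=2 vectors of VF=4: <0, 4, 1, 5, 2, 6, 3, 7>.
  SmallVector<int, 16> IL = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);

  // Start at 0 and step by Stride=2 for VF=4 entries: <0, 2, 4, 6>.
  SmallVector<int, 16> Stride =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);

  // Replicate each of 4 lanes 3 times: <0,0,0, 1,1,1, 2,2,2, 3,3,3>.
  SmallVector<int, 16> Rep =
      createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/4);

  (void)IL; (void)Stride; (void)Rep;
}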
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1628
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
@ Invalid
Denotes invalid value.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1930
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1880
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2048
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
cl::opt< bool > EnableLoopInterleaving
Value * createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, PHINode *OrigPhi=nullptr)
Create a generic target reduction using a recurrence descriptor Desc The target is queried to determi...
Definition: LoopUtils.cpp:1207
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
#define OP(n)
Definition: regex2.h:73
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
ElementCountComparator creates a total ordering for ElementCount for the purposes of using it in a se...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:91
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:85
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:87
ElementCount End
Definition: VPlan.h:92
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1796
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:219
bool isFirstIteration() const
Definition: VPlan.h:231
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:369
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:377
BasicBlock * ExitBB
The last IR BasicBlock in the output IR.
Definition: VPlan.h:373
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:348
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:247
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:413
void addMetadata(Instruction *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:361
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:416
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:409
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:402
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:288
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:248
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:393
VPlan * Plan
Pointer to the VPlan that code is generated for.
Definition: VPlan.h:399
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:396
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:242
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:381
A recipe for widening select instructions.
Definition: VPlan.h:1437
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.