1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
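//
// For illustration only (not part of the pass itself), a sketch of the
// transformation on a simple loop in C-like pseudocode, assuming a vector
// width (VF) of 4:
//
//   // Scalar loop:
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
//   // Vectorized loop: the index advances by VF, and a scalar epilogue or a
//   // predicated tail handles the remaining n % 4 iterations.
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one wide SIMD add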
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
73#include "llvm/ADT/SmallSet.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
82#include "llvm/Analysis/CFG.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/ValueHandle.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
133#include "llvm/Support/Debug.h"
146#include <algorithm>
147#include <cassert>
148#include <cmath>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <map>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168/// @{
169/// Metadata attribute names
170const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171const char LLVMLoopVectorizeFollowupVectorized[] =
172 "llvm.loop.vectorize.followup_vectorized";
173const char LLVMLoopVectorizeFollowupEpilogue[] =
174 "llvm.loop.vectorize.followup_epilogue";
175/// @}
176
177STATISTIC(LoopsVectorized, "Number of loops vectorized");
178STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180
182 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
183 cl::desc("Enable vectorization of epilogue loops."));
184
186 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
187 cl::desc("When epilogue vectorization is enabled, and a value greater than "
188 "1 is specified, forces the given VF for all applicable epilogue "
189 "loops."));
190
192 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
193 cl::desc("Only loops with vectorization factor equal to or larger than "
194 "the specified value are considered for epilogue vectorization."));
195
196/// Loops with a known constant trip count below this number are vectorized only
197/// if no scalar iteration overheads are incurred.
199 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
200 cl::desc("Loops with a constant trip count that is smaller than this "
201 "value are vectorized only if no scalar iteration overheads "
202 "are incurred."));
203
205 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
206 cl::desc("The maximum allowed number of runtime memory checks"));
207
209 "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden,
210 cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
211 "This option will be removed in the future."));
212
213// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
214// and that predication is preferred; the values below list the available
215// options. That is, the vectorizer will try to fold the tail loop (epilogue)
216// into the vector body and predicate the instructions accordingly. If
217// tail-folding fails, the fallback strategy depends on these values:
219 enum Option {
223 };
224} // namespace PreferPredicateTy
225
227 "prefer-predicate-over-epilogue",
230 cl::desc("Tail-folding and predication preferences over creating a scalar "
231 "epilogue loop."),
233 "scalar-epilogue",
234 "Don't tail-predicate loops, create scalar epilogue"),
236 "predicate-else-scalar-epilogue",
237 "Prefer tail-folding, create scalar epilogue if tail "
238 "folding fails."),
240 "predicate-dont-vectorize",
241 "Prefer tail-folding, don't attempt vectorization if "
242 "tail-folding fails.")));
243
245 "force-tail-folding-style", cl::desc("Force the tail folding style"),
246 cl::init(TailFoldingStyle::None),
248 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
250 TailFoldingStyle::Data, "data",
251 "Create lane mask for data only, using active.lane.mask intrinsic"),
252 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
253 "data-without-lane-mask",
254 "Create lane mask with compare/stepvector"),
255 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
256 "Create lane mask using active.lane.mask intrinsic, and use "
257 "it for both data and control flow"),
258 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
259 "data-and-control-without-rt-check",
260 "Similar to data-and-control, but remove the runtime check"),
261 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
262 "Use predicated EVL instructions for tail folding. If EVL "
263 "is unsupported, fallback to data-without-lane-mask.")));
264
266 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
267 cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
268 "will be determined by the smallest type in the loop."));
269
271 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
272 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
273
274/// An interleave-group may need masking if it resides in a block that needs
275/// predication, or in order to mask away gaps.
277 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
278 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
279
281 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's number of scalar registers."));
283
285 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
286 cl::desc("A flag that overrides the target's number of vector registers."));
287
289 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
290 cl::desc("A flag that overrides the target's max interleave factor for "
291 "scalar loops."));
292
294 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
295 cl::desc("A flag that overrides the target's max interleave factor for "
296 "vectorized loops."));
297
299 "force-target-instruction-cost", cl::init(0), cl::Hidden,
300 cl::desc("A flag that overrides the target's expected cost for "
301 "an instruction to a single constant value. Mostly "
302 "useful for getting consistent testing."));
303
305 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
306 cl::desc(
307 "Pretend that scalable vectors are supported, even if the target does "
308 "not support them. This flag should only be used for testing."));
309
311 "small-loop-cost", cl::init(20), cl::Hidden,
312 cl::desc(
313 "The cost of a loop that is considered 'small' by the interleaver."));
314
316 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
317 cl::desc("Enable the use of the block frequency analysis to access PGO "
318 "heuristics minimizing code growth in cold regions and being more "
319 "aggressive in hot regions."));
320
321// Runtime interleave loops for load/store throughput.
323 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
324 cl::desc(
325 "Enable runtime interleaving until load/store ports are saturated"));
326
327/// The number of stores in a loop that are allowed to need predication.
329 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
330 cl::desc("Max number of stores to be predicated behind an if."));
331
333 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
334 cl::desc("Count the induction variable only once when interleaving"));
335
337 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
338 cl::desc("Enable if predication of stores during vectorization."));
339
341 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
342 cl::desc("The maximum interleave count to use when interleaving a scalar "
343 "reduction in a nested loop."));
344
345static cl::opt<bool>
346 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348 cl::desc("Prefer in-loop vector reductions, "
349 "overriding the target's preference."));
350
352 "force-ordered-reductions", cl::init(false), cl::Hidden,
353 cl::desc("Enable the vectorization of loops with in-order (strict) "
354 "FP reductions"));
355
357 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
358 cl::desc(
359 "Prefer predicating a reduction operation over an after loop select."));
360
361namespace llvm {
363 "enable-vplan-native-path", cl::Hidden,
364 cl::desc("Enable VPlan-native vectorization path with "
365 "support for outer loop vectorization."));
366}
367
368// This flag enables the stress testing of the VPlan H-CFG construction in the
369// VPlan-native vectorization path. It must be used in conjunction with
370// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
371// verification of the H-CFGs built.
373 "vplan-build-stress-test", cl::init(false), cl::Hidden,
374 cl::desc(
375 "Build VPlan for every supported loop nest in the function and bail "
376 "out right after the build (stress test the VPlan H-CFG construction "
377 "in the VPlan-native vectorization path)."));
378
380 "interleave-loops", cl::init(true), cl::Hidden,
381 cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 "vectorize-loops", cl::init(true), cl::Hidden,
384 cl::desc("Run the Loop vectorization passes"));
385
387 "vplan-print-in-dot-format", cl::Hidden,
388 cl::desc("Use dot format instead of plain text when dumping VPlans"));
389
391 "force-widen-divrem-via-safe-divisor", cl::Hidden,
392 cl::desc(
393 "Override cost based safe divisor widening for div/rem instructions"));
394
396 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398 cl::desc("Try wider VFs if they enable the use of vector variants"));
399
400// Likelihood of bypassing the vectorized loop because assumptions about SCEV
401// variables not overflowing do not hold. See `emitSCEVChecks`.
402static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
403// Likelihood of bypassing the vectorized loop because pointers overlap. See
404// `emitMemRuntimeChecks`.
405static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
406// Likelihood of bypassing the vectorized loop because there are zero trips left
407// after prolog. See `emitIterationCountCheck`.
408static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
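// For example, a weight pair of {1, 127} encodes an assumed probability of
// roughly 1 / (1 + 127), i.e. about 0.8%, for taking the bypass edge.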
409
410/// A helper function that returns true if the given type is irregular. The
411/// type is irregular if its allocated size doesn't equal the store size of an
412/// element of the corresponding vector type.
413static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
414 // Determine if an array of N elements of type Ty is "bitcast compatible"
415 // with a <N x Ty> vector.
416 // This is only true if there is no padding between the array elements.
417 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
418}
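// A minimal usage sketch (illustrative values, assuming a typical data layout):
//   hasIrregularType(Type::getInt32Ty(Ctx), DL)    -> false (32-bit alloc, 32-bit size)
//   hasIrregularType(Type::getIntNTy(Ctx, 20), DL) -> true  (32-bit alloc, 20-bit size)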
419
420/// Returns "best known" trip count for the specified loop \p L as defined by
421/// the following procedure:
422/// 1) Returns exact trip count if it is known.
423/// 2) Returns expected trip count according to profile data if any.
424/// 3) Returns upper bound estimate if it is known.
425/// 4) Returns std::nullopt if all of the above failed.
426static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
427 Loop *L) {
428 // Check if exact trip count is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
430 return ExpectedTC;
431
432 // Check if there is an expected trip count available from profile data.
434 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
435 return *EstimatedTC;
436
437 // Check if upper bound estimate is known.
438 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
439 return ExpectedTC;
440
441 return std::nullopt;
442}
443
444/// Return a vector containing interleaved elements from multiple
445/// smaller input vectors.
446static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
447 const Twine &Name) {
448 unsigned Factor = Vals.size();
449 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
450
451 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
452#ifndef NDEBUG
453 for (Value *Val : Vals)
454 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
455#endif
456
457 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
458 // must use intrinsics to interleave.
459 if (VecTy->isScalableTy()) {
460 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
461 return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
462 Vals,
463 /*FMFSource=*/nullptr, Name);
464 }
465
466 // Fixed length. Start by concatenating all vectors into a wide vector.
467 Value *WideVec = concatenateVectors(Builder, Vals);
468
469 // Interleave the elements into the wide vector.
470 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
471 return Builder.CreateShuffleVector(
472 WideVec, createInterleaveMask(NumElts, Factor), Name);
473}
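// Illustrative example: for two fixed-width <4 x i32> inputs
//   Vals[0] = <A0, A1, A2, A3>, Vals[1] = <B0, B1, B2, B3>
// the result is
//   <A0, B0, A1, B1, A2, B2, A3, B3>.
// For scalable inputs the same element-wise interleaving is produced with the
// llvm.vector.interleave2 intrinsic rather than a shufflevector.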
474
475namespace {
476// Forward declare GeneratedRTChecks.
477class GeneratedRTChecks;
478
479using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
480} // namespace
481
482namespace llvm {
483
485
486/// InnerLoopVectorizer vectorizes loops which contain only one basic
487/// block to a specified vectorization factor (VF).
488/// This class performs the widening of scalars into vectors, or multiple
489/// scalars. This class also implements the following features:
490/// * It inserts an epilogue loop for handling loops that don't have iteration
491/// counts that are known to be a multiple of the vectorization factor.
492/// * It handles the code generation for reduction variables.
493/// * Scalarization (implementation using scalars) of un-vectorizable
494/// instructions.
495/// InnerLoopVectorizer does not perform any vectorization-legality
496/// checks, and relies on the caller to check for the different legality
497/// aspects. The InnerLoopVectorizer relies on the
498/// LoopVectorizationLegality class to provide information about the induction
499/// and reduction variables that were found to a given vectorization factor.
501public:
504 const TargetLibraryInfo *TLI,
508 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
510 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
511 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
512 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
513 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
515 // Query this against the original loop and save it here because the profile
516 // of the original loop header may change as the transformation happens.
519
521 this->MinProfitableTripCount = VecWidth;
522 else
523 this->MinProfitableTripCount = MinProfitableTripCount;
524 }
525
526 virtual ~InnerLoopVectorizer() = default;
527
528 /// Create a new empty loop that will contain vectorized instructions later
529 /// on, while the old loop will be used as the scalar remainder. Control flow
530 /// is generated around the vectorized (and scalar epilogue) loops consisting
531 /// of various checks and bypasses. Return the pre-header block of the new
532 /// loop and the start value for the canonical induction, if it is != 0. The
533 /// latter is the case when vectorizing the epilogue loop. In the case of
535 /// epilogue vectorization, this function is overridden to handle the more
535 /// complex control flow around the loops. \p ExpandedSCEVs is used to
536 /// look up SCEV expansions for expressions needed during skeleton creation.
537 virtual std::pair<BasicBlock *, Value *>
538 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
539
540 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
541 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
542
543 // Return true if any runtime check is added.
545
546 /// A helper function to scalarize a single Instruction in the innermost loop.
547 /// Generates a sequence of scalar instances for each lane between \p MinLane
548 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
549 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
550 /// Instr's operands.
551 void scalarizeInstruction(const Instruction *Instr,
552 VPReplicateRecipe *RepRecipe,
553 const VPIteration &Instance,
554 VPTransformState &State);
555
556 /// Try to vectorize interleaved access group \p Group with the base address
557 /// given in \p Addr, optionally masking the vector operations if \p
558 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
559 /// values in the vectorized loop.
561 ArrayRef<VPValue *> VPDefs,
563 ArrayRef<VPValue *> StoredValues,
564 VPValue *BlockInMask, bool NeedsMaskForGaps);
565
566 /// Fix the non-induction PHIs in \p Plan.
567 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
568
569 /// Create a new phi node for the induction variable \p OrigPhi to resume
570 /// iteration count in the scalar epilogue, from where the vectorized loop
571 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
572 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
573 /// and the resume values can come from an additional bypass block, the \p
574 /// AdditionalBypass pair provides information about the bypass block and the
575 /// end value on the edge from bypass to this loop.
577 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
578 ArrayRef<BasicBlock *> BypassBlocks,
579 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
580
581 /// Returns the original loop trip count.
582 Value *getTripCount() const { return TripCount; }
583
584 /// Used to set the trip count after ILV's construction and after the
585 /// preheader block has been executed. Note that this always holds the trip
586 /// count of the original loop for both main loop and epilogue vectorization.
587 void setTripCount(Value *TC) { TripCount = TC; }
588
589protected:
591
592 /// A small list of PHINodes.
594
595 /// A type for scalarized values in the new loop. Each value from the
596 /// original loop, when scalarized, is represented by UF x VF scalar values
597 /// in the new unrolled loop, where UF is the unroll factor and VF is the
598 /// vectorization factor.
600
601 /// Set up the values of the IVs correctly when exiting the vector loop.
602 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
603 Value *VectorTripCount, Value *EndValue,
604 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
605 VPlan &Plan, VPTransformState &State);
606
607 /// Iteratively sink the scalarized operands of a predicated instruction into
608 /// the block that was created for it.
609 void sinkScalarOperands(Instruction *PredInst);
610
611 /// Returns (and creates if needed) the trip count of the widened loop.
613
614 /// Returns a bitcasted value to the requested vector type.
615 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
617 const DataLayout &DL);
618
619 /// Emit a bypass check to see if the vector trip count is zero, including if
620 /// it overflows.
622
623 /// Emit a bypass check to see if all of the SCEV assumptions we've
624 /// had to make are correct. Returns the block containing the checks or
625 /// nullptr if no checks have been added.
627
628 /// Emit bypass checks to check any memory assumptions we may have made.
629 /// Returns the block containing the checks or nullptr if no checks have been
630 /// added.
632
633 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
634 /// vector loop preheader, middle block and scalar preheader.
636
637 /// Create new phi nodes for the induction variables to resume iteration count
638 /// in the scalar epilogue, from where the vectorized loop left off.
639 /// In cases where the loop skeleton is more complicated (e.g. epilogue
640 /// vectorization) and the resume values can come from an additional bypass
641 /// block, the \p AdditionalBypass pair provides information about the bypass
642 /// block and the end value on the edge from bypass to this loop.
644 const SCEV2ValueTy &ExpandedSCEVs,
645 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
646
647 /// Complete the loop skeleton by adding debug MDs, creating appropriate
648 /// conditional branches in the middle block, preparing the builder and
649 /// running the verifier. Return the preheader of the completed vector loop.
651
652 /// Allow subclasses to override and print debug traces before/after vplan
653 /// execution, when trace information is requested.
654 virtual void printDebugTracesAtStart() {}
655 virtual void printDebugTracesAtEnd() {}
656
657 /// The original loop.
659
660 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
661 /// dynamic knowledge to simplify SCEV expressions and converts them to a
662 /// more usable form.
664
665 /// Loop Info.
667
668 /// Dominator Tree.
670
671 /// Target Library Info.
673
674 /// Target Transform Info.
676
677 /// Assumption Cache.
679
680 /// Interface to emit optimization remarks.
682
683 /// The vectorization SIMD factor to use. Each vector will have this many
684 /// vector elements.
686
688
689 /// The vectorization unroll factor to use. Each scalar is vectorized to this
690 /// many different vector instructions.
691 unsigned UF;
692
693 /// The builder that we use
695
696 // --- Vectorization state ---
697
698 /// The vector-loop preheader.
700
701 /// The scalar-loop preheader.
703
704 /// Middle Block between the vector and the scalar.
706
707 /// The unique ExitBlock of the scalar loop if one exists. Note that
708 /// there can be multiple exiting edges reaching this block.
710
711 /// The scalar loop body.
713
714 /// A list of all bypass blocks. The first block is the entry of the loop.
716
717 /// Store instructions that were predicated.
719
720 /// Trip count of the original loop.
721 Value *TripCount = nullptr;
722
723 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
725
726 /// The legality analysis.
728
729 /// The profitability analysis.
731
732 // Record whether runtime checks are added.
733 bool AddedSafetyChecks = false;
734
735 // Holds the end values for each induction variable. We save the end values
736 // so we can later fix-up the external users of the induction variables.
738
739 /// BFI and PSI are used to check for profile guided size optimizations.
742
743 // Whether this loop should be optimized for size based on profile guided size
744 // optimizations.
746
747 /// Structure to hold information about generated runtime checks, responsible
748 /// for cleaning up the checks if vectorization turns out to be unprofitable.
749 GeneratedRTChecks &RTChecks;
750
751 // Holds the resume values for reductions in the loops, used to set the
752 // correct start value of reduction PHIs when vectorizing the epilogue.
755};
756
758public:
761 const TargetLibraryInfo *TLI,
763 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
766 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
768 ElementCount::getFixed(1),
769 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
770 BFI, PSI, Check) {}
771};
772
773/// Encapsulate information regarding vectorization of a loop and its epilogue.
774/// This information is meant to be updated and used across two stages of
775/// epilogue vectorization.
778 unsigned MainLoopUF = 0;
780 unsigned EpilogueUF = 0;
785 Value *TripCount = nullptr;
787
789 ElementCount EVF, unsigned EUF)
790 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
791 assert(EUF == 1 &&
792 "A high UF for the epilogue loop is likely not beneficial.");
793 }
794};
795
796/// An extension of the inner loop vectorizer that creates a skeleton for a
797/// vectorized loop that has its epilogue (residual) also vectorized.
798/// The idea is to run the VPlan on a given loop twice: first to set up the
799/// skeleton and vectorize the main loop, and second to complete the skeleton
800/// from the first step and vectorize the epilogue. This is achieved by
801/// deriving two concrete strategy classes from this base class and invoking
802/// them in succession from the loop vectorizer planner.
804public:
812 GeneratedRTChecks &Checks)
814 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
815 CM, BFI, PSI, Checks),
816 EPI(EPI) {}
817
818 // Override this function to handle the more complex control flow around the
819 // three loops.
820 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
821 const SCEV2ValueTy &ExpandedSCEVs) final {
822 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
823 }
824
825 /// The interface for creating a vectorized skeleton using one of two
826 /// different strategies, each corresponding to one execution of the vplan
827 /// as described above.
828 virtual std::pair<BasicBlock *, Value *>
829 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
830
831 /// Holds and updates state information required to vectorize the main loop
832 /// and its epilogue in two separate passes. This setup helps us avoid
833 /// regenerating and recomputing runtime safety checks. It also helps us to
834 /// shorten the iteration-count-check path length for the cases where the
835 /// iteration count of the loop is so small that the main vector loop is
836 /// completely skipped.
838};
839
840/// A specialized derived class of inner loop vectorizer that performs
841/// vectorization of *main* loops in the process of vectorizing loops and their
842/// epilogues.
844public:
852 GeneratedRTChecks &Check)
854 EPI, LVL, CM, BFI, PSI, Check) {}
855 /// Implements the interface for creating a vectorized skeleton using the
856 /// *main loop* strategy (i.e., the first pass of VPlan execution).
857 std::pair<BasicBlock *, Value *>
858 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
859
860protected:
861 /// Emits an iteration count bypass check once for the main loop (when \p
862 /// ForEpilogue is false) and once for the epilogue loop (when \p
863 /// ForEpilogue is true).
864 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
865 void printDebugTracesAtStart() override;
866 void printDebugTracesAtEnd() override;
867};
868
869// A specialized derived class of inner loop vectorizer that performs
870// vectorization of *epilogue* loops in the process of vectorizing loops and
871// their epilogues.
873public:
881 GeneratedRTChecks &Checks)
883 EPI, LVL, CM, BFI, PSI, Checks) {
885 }
886 /// Implements the interface for creating a vectorized skeleton using the
887 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
888 std::pair<BasicBlock *, Value *>
889 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
890
891protected:
892 /// Emits an iteration count bypass check after the main vector loop has
893 /// finished to see if there are any iterations left to execute by either
894 /// the vector epilogue or the scalar epilogue.
896 BasicBlock *Bypass,
897 BasicBlock *Insert);
898 void printDebugTracesAtStart() override;
899 void printDebugTracesAtEnd() override;
900};
901} // end namespace llvm
902
903/// Look for a meaningful debug location on the instruction or its
904/// operands.
906 if (!I)
907 return DebugLoc();
908
910 if (I->getDebugLoc() != Empty)
911 return I->getDebugLoc();
912
913 for (Use &Op : I->operands()) {
914 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
915 if (OpInst->getDebugLoc() != Empty)
916 return OpInst->getDebugLoc();
917 }
918
919 return I->getDebugLoc();
920}
921
922/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
923/// is passed, the message relates to that particular instruction.
924#ifndef NDEBUG
925static void debugVectorizationMessage(const StringRef Prefix,
926 const StringRef DebugMsg,
927 Instruction *I) {
928 dbgs() << "LV: " << Prefix << DebugMsg;
929 if (I != nullptr)
930 dbgs() << " " << *I;
931 else
932 dbgs() << '.';
933 dbgs() << '\n';
934}
935#endif
936
937/// Create an analysis remark that explains why vectorization failed
938///
939/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
940/// RemarkName is the identifier for the remark. If \p I is passed it is an
941/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
942/// the location of the remark. \return the remark object that can be
943/// streamed to.
945 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
946 Value *CodeRegion = TheLoop->getHeader();
947 DebugLoc DL = TheLoop->getStartLoc();
948
949 if (I) {
950 CodeRegion = I->getParent();
951 // If there is no debug location attached to the instruction, fall back to
952 // using the loop's.
953 if (I->getDebugLoc())
954 DL = I->getDebugLoc();
955 }
956
957 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
958}
959
960namespace llvm {
961
962/// Return a value for Step multiplied by VF.
963Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
964 int64_t Step) {
965 assert(Ty->isIntegerTy() && "Expected an integer step");
966 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
967}
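// Illustrative examples (assuming an i64 step type):
//   VF = 4 (fixed),    Step = 2  ->  the constant i64 8
//   VF = vscale x 4,   Step = 2  ->  8 * vscale, materialized at runtime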
968
969/// Return the runtime value for VF.
970Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
971 return B.CreateElementCount(Ty, VF);
972}
973
975 Loop *OrigLoop) {
976 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
977 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
978
979 ScalarEvolution &SE = *PSE.getSE();
980 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
981}
982
984 const StringRef OREMsg, const StringRef ORETag,
985 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
986 Instruction *I) {
987 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
988 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
989 ORE->emit(
990 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
991 << "loop not vectorized: " << OREMsg);
992}
993
994/// Reports an informative message: print \p Msg for debugging purposes as well
995/// as an optimization remark. Uses either \p I as location of the remark, or
996/// otherwise \p TheLoop.
997static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
998 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
999 Instruction *I = nullptr) {
1001 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1002 ORE->emit(
1003 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1004 << Msg);
1005}
1006
1007/// Report successful vectorization of the loop. In case an outer loop is
1008/// vectorized, prepend "outer" to the vectorization remark.
1010 VectorizationFactor VF, unsigned IC) {
1012 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1013 nullptr));
1014 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1015 ORE->emit([&]() {
1016 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1017 TheLoop->getHeader())
1018 << "vectorized " << LoopType << "loop (vectorization width: "
1019 << ore::NV("VectorizationFactor", VF.Width)
1020 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1021 });
1022}
1023
1024} // end namespace llvm
1025
1026namespace llvm {
1027
1028// Loop vectorization cost-model hints how the scalar epilogue loop should be
1029// lowered.
1031
1032 // The default: allowing scalar epilogues.
1034
1035 // Vectorization with OptForSize: don't allow epilogues.
1037
1038 // A special case of vectorization with OptForSize: loops with a very small
1039 // trip count are considered for vectorization under OptForSize, thereby
1040 // making sure the cost of their loop body is dominant, free of runtime
1041 // guards and scalar iteration overheads.
1043
1044 // Loop hint predicate indicating an epilogue is undesired.
1046
1047 // Directive indicating we must either tail fold or not vectorize
1050
1051using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1052
1053/// LoopVectorizationCostModel - estimates the expected speedups due to
1054/// vectorization.
1055/// In many cases vectorization is not profitable. This can happen because of
1056/// a number of reasons. In this class we mainly attempt to predict the
1057/// expected speedup/slowdowns due to the supported instruction set. We use the
1058/// TargetTransformInfo to query the different backends for the cost of
1059/// different operations.
1061public:
1065 const TargetTransformInfo &TTI,
1071 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1072 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1073 Hints(Hints), InterleaveInfo(IAI) {}
1074
1075 /// \return An upper bound for the vectorization factors (both fixed and
1076 /// scalable). If the factors are 0, vectorization and interleaving should be
1077 /// avoided up front.
1078 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1079
1080 /// \return True if runtime checks are required for vectorization, and false
1081 /// otherwise.
1082 bool runtimeChecksRequired();
1083
1084 /// Setup cost-based decisions for user vectorization factor.
1085 /// \return true if the UserVF is a feasible VF to be chosen.
1089 return expectedCost(UserVF).isValid();
1090 }
1091
1092 /// \return The size (in bits) of the smallest and widest types in the code
1093 /// that needs to be vectorized. We ignore values that remain scalar such as
1094 /// 64 bit loop indices.
1095 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1096
1097 /// \return The desired interleave count.
1098 /// If interleave count has been specified by metadata it will be returned.
1099 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1100 /// are the selected vectorization factor and the cost of the selected VF.
1101 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1102
1103 /// A memory access instruction may be vectorized in more than one way.
1104 /// The form of the instruction after vectorization depends on cost.
1105 /// This function takes cost-based decisions for Load/Store instructions
1106 /// and collects them in a map. This decisions map is used for building
1107 /// the lists of loop-uniform and loop-scalar instructions.
1108 /// The calculated cost is saved with widening decision in order to
1109 /// avoid redundant calculations.
1111
1112 /// A call may be vectorized in different ways depending on whether we have
1113 /// vectorized variants available and whether the target supports masking.
1114 /// This function analyzes all calls in the function at the supplied VF,
1115 /// makes a decision based on the costs of available options, and stores that
1116 /// decision in a map for use in planning and plan execution.
1118
1119 /// A struct that represents some properties of the register usage
1120 /// of a loop.
1122 /// Holds the number of loop invariant values that are used in the loop.
1123 /// The key is ClassID of target-provided register class.
1125 /// Holds the maximum number of concurrent live intervals in the loop.
1126 /// The key is ClassID of target-provided register class.
1128 };
1129
1130 /// \return Returns information about the register usages of the loop for the
1131 /// given vectorization factors.
1134
1135 /// Collect values we want to ignore in the cost model.
1136 void collectValuesToIgnore();
1137
1138 /// Collect all element types in the loop for which widening is needed.
1140
1141 /// Split reductions into those that happen in the loop, and those that happen
1142 /// outside. In-loop reductions are collected into InLoopReductions.
1144
1145 /// Returns true if we should use strict in-order reductions for the given
1146 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1147 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1148 /// of FP operations.
1149 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1150 return !Hints->allowReordering() && RdxDesc.isOrdered();
1151 }
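  // For example, an FP add reduction whose descriptor is marked ordered must be
  // kept in source order because floating-point addition is not associative:
  // (a + b) + c may differ from a + (b + c). Such a reduction is therefore only
  // vectorized with strict, in-order reduction operations unless the hints
  // allow reordering.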
1152
1153 /// \returns The smallest bitwidth each instruction can be represented with.
1154 /// The vector equivalents of these instructions should be truncated to this
1155 /// type.
1157 return MinBWs;
1158 }
1159
1160 /// \returns True if it is more profitable to scalarize instruction \p I for
1161 /// vectorization factor \p VF.
1163 assert(VF.isVector() &&
1164 "Profitable to scalarize relevant only for VF > 1.");
1165 assert(
1166 TheLoop->isInnermost() &&
1167 "cost-model should not be used for outer loops (in VPlan-native path)");
1168
1169 auto Scalars = InstsToScalarize.find(VF);
1170 assert(Scalars != InstsToScalarize.end() &&
1171 "VF not yet analyzed for scalarization profitability");
1172 return Scalars->second.contains(I);
1173 }
1174
1175 /// Returns true if \p I is known to be uniform after vectorization.
1177 assert(
1178 TheLoop->isInnermost() &&
1179 "cost-model should not be used for outer loops (in VPlan-native path)");
1180 // Pseudo probe needs to be duplicated for each unrolled iteration and
1181 // vector lane so that profiled loop trip count can be accurately
1182 // accumulated instead of being undercounted.
1183 if (isa<PseudoProbeInst>(I))
1184 return false;
1185
1186 if (VF.isScalar())
1187 return true;
1188
1189 auto UniformsPerVF = Uniforms.find(VF);
1190 assert(UniformsPerVF != Uniforms.end() &&
1191 "VF not yet analyzed for uniformity");
1192 return UniformsPerVF->second.count(I);
1193 }
1194
1195 /// Returns true if \p I is known to be scalar after vectorization.
1197 assert(
1198 TheLoop->isInnermost() &&
1199 "cost-model should not be used for outer loops (in VPlan-native path)");
1200 if (VF.isScalar())
1201 return true;
1202
1203 auto ScalarsPerVF = Scalars.find(VF);
1204 assert(ScalarsPerVF != Scalars.end() &&
1205 "Scalar values are not calculated for VF");
1206 return ScalarsPerVF->second.count(I);
1207 }
1208
1209 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1210 /// for vectorization factor \p VF.
1212 return VF.isVector() && MinBWs.contains(I) &&
1213 !isProfitableToScalarize(I, VF) &&
1215 }
1216
1217 /// Decision that was taken during cost calculation for memory instruction.
1220 CM_Widen, // For consecutive accesses with stride +1.
1221 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1228
1229 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1230 /// instruction \p I and vector width \p VF.
1233 assert(VF.isVector() && "Expected VF >=2");
1234 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1235 }
1236
1237 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1238 /// interleaving group \p Grp and vector width \p VF.
1242 assert(VF.isVector() && "Expected VF >=2");
1243 /// Broadcast this decision to all instructions inside the group,
1244 /// but the cost will be assigned to one instruction only.
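 /// For example, for a group {A, B, C} whose insert position is B, the full
 /// group cost is recorded for B, while A and C are recorded with a cost of 0,
 /// so the group cost is counted exactly once.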
1245 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1246 if (auto *I = Grp->getMember(i)) {
1247 if (Grp->getInsertPos() == I)
1248 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1249 else
1250 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1251 }
1252 }
1253 }
1254
1255 /// Return the cost model decision for the given instruction \p I and vector
1256 /// width \p VF. Return CM_Unknown if this instruction did not pass
1257 /// through the cost modeling.
1259 assert(VF.isVector() && "Expected VF to be a vector VF");
1260 assert(
1261 TheLoop->isInnermost() &&
1262 "cost-model should not be used for outer loops (in VPlan-native path)");
1263
1264 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1265 auto Itr = WideningDecisions.find(InstOnVF);
1266 if (Itr == WideningDecisions.end())
1267 return CM_Unknown;
1268 return Itr->second.first;
1269 }
1270
1271 /// Return the vectorization cost for the given instruction \p I and vector
1272 /// width \p VF.
1274 assert(VF.isVector() && "Expected VF >=2");
1275 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1276 assert(WideningDecisions.contains(InstOnVF) &&
1277 "The cost is not calculated");
1278 return WideningDecisions[InstOnVF].second;
1279 }
1280
1285 std::optional<unsigned> MaskPos;
1287 };
1288
1290 Function *Variant, Intrinsic::ID IID,
1291 std::optional<unsigned> MaskPos,
1293 assert(!VF.isScalar() && "Expected vector VF");
1294 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1295 MaskPos, Cost};
1296 }
1297
1299 ElementCount VF) const {
1300 assert(!VF.isScalar() && "Expected vector VF");
1301 return CallWideningDecisions.at(std::make_pair(CI, VF));
1302 }
1303
1304 /// Return True if instruction \p I is an optimizable truncate whose operand
1305 /// is an induction variable. Such a truncate will be removed by adding a new
1306 /// induction variable with the destination type.
1308 // If the instruction is not a truncate, return false.
1309 auto *Trunc = dyn_cast<TruncInst>(I);
1310 if (!Trunc)
1311 return false;
1312
1313 // Get the source and destination types of the truncate.
1314 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1315 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1316
1317 // If the truncate is free for the given types, return false. Replacing a
1318 // free truncate with an induction variable would add an induction variable
1319 // update instruction to each iteration of the loop. We exclude from this
1320 // check the primary induction variable since it will need an update
1321 // instruction regardless.
1322 Value *Op = Trunc->getOperand(0);
1323 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1324 return false;
1325
1326 // If the truncated value is not an induction variable, return false.
1327 return Legal->isInductionPhi(Op);
1328 }
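  // Illustrative example: for an induction %iv = phi i64 ... with a use
  //   %t = trunc i64 %iv to i32
  // the truncate is optimizable (when %iv is the primary induction, or when the
  // truncate is not free on the target): it can be removed by introducing a new
  // i32 induction variable with the destination type.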
1329
1330 /// Collects the instructions to scalarize for each predicated instruction in
1331 /// the loop.
1333
1334 /// Collect Uniform and Scalar values for the given \p VF.
1335 /// The sets depend on CM decision for Load/Store instructions
1336 /// that may be vectorized as interleave, gather-scatter or scalarized.
1337 /// Also make a decision on what to do about call instructions in the loop
1338 /// at that VF -- scalarize, call a known vector routine, or call a
1339 /// vector intrinsic.
1341 // Do the analysis once.
1342 if (VF.isScalar() || Uniforms.contains(VF))
1343 return;
1346 collectLoopUniforms(VF);
1347 collectLoopScalars(VF);
1348 }
1349
1350 /// Returns true if the target machine supports masked store operation
1351 /// for the given \p DataType and kind of access to \p Ptr.
1352 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1353 return Legal->isConsecutivePtr(DataType, Ptr) &&
1354 TTI.isLegalMaskedStore(DataType, Alignment);
1355 }
1356
1357 /// Returns true if the target machine supports masked load operation
1358 /// for the given \p DataType and kind of access to \p Ptr.
1359 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1360 return Legal->isConsecutivePtr(DataType, Ptr) &&
1361 TTI.isLegalMaskedLoad(DataType, Alignment);
1362 }
1363
1364 /// Returns true if the target machine can represent \p V as a masked gather
1365 /// or scatter operation.
1367 bool LI = isa<LoadInst>(V);
1368 bool SI = isa<StoreInst>(V);
1369 if (!LI && !SI)
1370 return false;
1371 auto *Ty = getLoadStoreType(V);
1373 if (VF.isVector())
1374 Ty = VectorType::get(Ty, VF);
1375 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1376 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1377 }
1378
1379 /// Returns true if the target machine supports all of the reduction
1380 /// variables found for the given VF.
1382 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1383 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1384 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1385 }));
1386 }
1387
1388 /// Given costs for both strategies, return true if the scalar predication
1389 /// lowering should be used for div/rem. This incorporates an override
1390 /// option so it is not simply a cost comparison.
1392 InstructionCost SafeDivisorCost) const {
1393 switch (ForceSafeDivisor) {
1394 case cl::BOU_UNSET:
1395 return ScalarCost < SafeDivisorCost;
1396 case cl::BOU_TRUE:
1397 return false;
1398 case cl::BOU_FALSE:
1399 return true;
1400 };
1401 llvm_unreachable("impossible case value");
1402 }
1403
1404 /// Returns true if \p I is an instruction which requires predication and
1405 /// for which our chosen predication strategy is scalarization (i.e. we
1406 /// don't have an alternate strategy such as masking available).
1407 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1409
1410 /// Returns true if \p I is an instruction that needs to be predicated
1411 /// at runtime. The result is independent of the predication mechanism.
1412 /// Superset of instructions that return true for isScalarWithPredication.
1413 bool isPredicatedInst(Instruction *I) const;
1414
1415 /// Return the costs for our two available strategies for lowering a
1416 /// div/rem operation which requires speculating at least one lane.
1417 /// First result is for scalarization (will be invalid for scalable
1418 /// vectors); second is for the safe-divisor strategy.
1419 std::pair<InstructionCost, InstructionCost>
1421 ElementCount VF) const;
1422
1423 /// Returns true if \p I is a memory instruction with consecutive memory
1424 /// access that can be widened.
1426
1427 /// Returns true if \p I is a memory instruction in an interleaved-group
1428 /// of memory accesses that can be vectorized with wide vector loads/stores
1429 /// and shuffles.
1431
1432 /// Check if \p Instr belongs to any interleaved access group.
1434 return InterleaveInfo.isInterleaved(Instr);
1435 }
1436
1437 /// Get the interleaved access group that \p Instr belongs to.
1440 return InterleaveInfo.getInterleaveGroup(Instr);
1441 }
1442
1443 /// Returns true if we're required to use a scalar epilogue for at least
1444 /// the final iteration of the original loop.
1445 bool requiresScalarEpilogue(bool IsVectorizing) const {
1446 if (!isScalarEpilogueAllowed()) {
1447 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1448 return false;
1449 }
1450 // If we might exit from anywhere but the latch, must run the exiting
1451 // iteration in scalar form.
1453 LLVM_DEBUG(
1454 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1455 return true;
1456 }
1457 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1458 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1459 "interleaved group requires scalar epilogue\n");
1460 return true;
1461 }
1462 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1463 return false;
1464 }
1465
1466 /// Returns true if we're required to use a scalar epilogue for at least
1467 /// the final iteration of the original loop for all VFs in \p Range.
1468 /// A scalar epilogue must either be required for all VFs in \p Range or for
1469 /// none.
1471 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1472 return requiresScalarEpilogue(VF.isVector());
1473 };
1474 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1475 assert(
1476 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1477 "all VFs in range must agree on whether a scalar epilogue is required");
1478 return IsRequired;
1479 }
1480
1481 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1482 /// loop hint annotation.
1484 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1485 }
1486
1487 /// Returns the TailFoldingStyle that is best for the current loop.
1488 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1489 if (!ChosenTailFoldingStyle)
1491 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1492 : ChosenTailFoldingStyle->second;
1493 }
1494
1495 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1496 /// overflow or not.
1497 /// \param IsScalableVF true if scalable vector factors are enabled.
1498 /// \param UserIC User-specified interleave count.
1499 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1500 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1501 if (!Legal->canFoldTailByMasking()) {
1502 ChosenTailFoldingStyle =
1504 return;
1505 }
1506
1507 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1508 ChosenTailFoldingStyle = std::make_pair(
1509 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1510 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1511 return;
1512 }
1513
1514 // Set styles when forced.
1515 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1516 ForceTailFoldingStyle.getValue());
1518 return;
1519 // Override forced styles if needed.
1520 // FIXME: use actual opcode/data type for analysis here.
1521 // FIXME: Investigate opportunity for fixed vector factor.
1522 bool EVLIsLegal =
1523 IsScalableVF && UserIC <= 1 &&
1524 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1526 // FIXME: implement support for max safe dependency distance.
1528 if (!EVLIsLegal) {
1529 // If for some reason EVL mode is unsupported, fallback to
1530 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1531 // in a generic way.
1532 ChosenTailFoldingStyle =
1535 LLVM_DEBUG(
1536 dbgs()
1537 << "LV: Preference for VP intrinsics indicated. Will "
1538 "not try to generate VP Intrinsics "
1539 << (UserIC > 1
1540 ? "since interleave count specified is greater than 1.\n"
1541 : "due to non-interleaving reasons.\n"));
1542 }
1543 }
1544
1545 /// Returns true if all loop blocks should be masked to fold tail loop.
1546 bool foldTailByMasking() const {
1547 // TODO: check if it is possible to check for None style independent of
1548 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1550 }
1551
1552 /// Returns true if the instructions in this block requires predication
1553 /// for any reason, e.g. because tail folding now requires a predicate
1554 /// or because the block in the original loop was predicated.
1557 }
1558
1559 /// Returns true if VP intrinsics with explicit vector length support should
1560 /// be generated in the tail folded loop.
1561 bool foldTailWithEVL() const {
1563 }
1564
1565 /// Returns true if the Phi is part of an inloop reduction.
1566 bool isInLoopReduction(PHINode *Phi) const {
1567 return InLoopReductions.contains(Phi);
1568 }
1569
1570 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1571 /// with factor VF. Return the cost of the instruction, including
1572 /// scalarization overhead if it's needed.
1574
1575 /// Estimate cost of a call instruction CI if it were vectorized with factor
1576 /// VF. Return the cost of the instruction, including scalarization overhead
1577 /// if it's needed.
1579
1580 /// Invalidates decisions already taken by the cost model.
1582 WideningDecisions.clear();
1583 CallWideningDecisions.clear();
1584 Uniforms.clear();
1585 Scalars.clear();
1586 }
1587
1588 /// Returns the expected execution cost. The unit of the cost does
1589 /// not matter because we use the 'cost' units to compare different
1590 /// vector widths. The cost that is returned is *not* normalized by
1591 /// the factor width. If \p Invalid is not nullptr, this function
1592 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1593 /// each instruction that has an Invalid cost for the given VF.
1597
1598 bool hasPredStores() const { return NumPredStores > 0; }
1599
1600 /// Returns true if epilogue vectorization is considered profitable, and
1601 /// false otherwise.
1602 /// \p VF is the vectorization factor chosen for the original loop.
1604
1605 /// Returns the execution time cost of an instruction for a given vector
1606 /// width. Vector width of one means scalar.
1608
1609 /// Return the cost of instructions in an inloop reduction pattern, if I is
1610 /// part of that pattern.
1611 std::optional<InstructionCost>
1614
1615private:
1616 unsigned NumPredStores = 0;
1617
1618 /// \return An upper bound for the vectorization factors for both
1619 /// fixed and scalable vectorization, where the minimum-known number of
1620 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1621 /// disabled or unsupported, then the scalable part will be equal to
1622 /// ElementCount::getScalable(0).
1623 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1624 ElementCount UserVF,
1625 bool FoldTailByMasking);
1626
1627 /// \return the maximized element count based on the target's vector
1628 /// registers and the loop trip-count, but limited to a maximum safe VF.
1629 /// This is a helper function of computeFeasibleMaxVF.
1630 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1631 unsigned SmallestType,
1632 unsigned WidestType,
1633 ElementCount MaxSafeVF,
1634 bool FoldTailByMasking);
1635
1636 /// Checks if scalable vectorization is supported and enabled. Caches the
1637 /// result to avoid repeated debug dumps for repeated queries.
1638 bool isScalableVectorizationAllowed();
1639
1640 /// \return the maximum legal scalable VF, based on the safe max number
1641 /// of elements.
1642 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1643
1644 /// Calculate vectorization cost of memory instruction \p I.
1645 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1646
1647 /// The cost computation for scalarized memory instruction.
1648 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1649
1650 /// The cost computation for interleaving group of memory instructions.
1651 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1652
1653 /// The cost computation for Gather/Scatter instruction.
1654 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1655
1656 /// The cost computation for widening instruction \p I with consecutive
1657 /// memory access.
1658 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1659
1660 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1661 /// Load: scalar load + broadcast.
1662 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1663 /// element)
1664 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
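  // E.g. for a load from a loop-invariant address,
  //   for (i = 0; i < n; i++) sum += A[0];
  // the cost modeled here is, roughly, one scalar load of A[0] plus a
  // broadcast of the loaded value into a vector (assuming the widening
  // decision keeps the access uniform), rather than VF per-lane loads.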
1665
1666 /// Estimate the overhead of scalarizing an instruction. This is a
1667 /// convenience wrapper for the type-based getScalarizationOverhead API.
1668 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1670
1671 /// Returns true if an artificially high cost for emulated masked memrefs
1672 /// should be used.
1673 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1674
1675 /// Map of scalar integer values to the smallest bitwidth they can be legally
1676 /// represented as. The vector equivalents of these values should be truncated
1677 /// to this type.
1679
1680 /// A type representing the costs for instructions if they were to be
1681 /// scalarized rather than vectorized. The entries are Instruction-Cost
1682 /// pairs.
1683 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1684
1685 /// A set containing all BasicBlocks that are known to be present after
1686 /// vectorization as predicated blocks.
1688 PredicatedBBsAfterVectorization;
1689
1690 /// Records whether it is allowed to have the original scalar loop execute at
1691 /// least once. This may be needed as a fallback loop in case runtime
1692 /// aliasing/dependence checks fail, or to handle the tail/remainder
1693 /// iterations when the trip count is unknown or is not evenly divisible by the VF,
1694 /// or as a peel-loop to handle gaps in interleave-groups.
1695 /// Under optsize and when the trip count is very small we don't allow any
1696 /// iterations to execute in the scalar loop.
1697 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1698
1699 /// Controls the finally chosen tail-folding style. The first element is used
1700 /// if the IV update may overflow; the second is used if it does not.
1701 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1702 ChosenTailFoldingStyle;
1703
1704 /// True if scalable vectorization is supported and enabled.
1705 std::optional<bool> IsScalableVectorizationAllowed;
1706
1707 /// A map holding scalar costs for different vectorization factors. The
1708 /// presence of a cost for an instruction in the mapping indicates that the
1709 /// instruction will be scalarized when vectorizing with the associated
1710 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1712
1713 /// Holds the instructions known to be uniform after vectorization.
1714 /// The data is collected per VF.
1716
1717 /// Holds the instructions known to be scalar after vectorization.
1718 /// The data is collected per VF.
1720
1721 /// Holds the instructions (address computations) that are forced to be
1722 /// scalarized.
1724
1725 /// PHINodes of the reductions that should be expanded in-loop.
1726 SmallPtrSet<PHINode *, 4> InLoopReductions;
1727
1728 /// A Map of inloop reduction operations and their immediate chain operand.
1729 /// FIXME: This can be removed once reductions can be costed correctly in
1730 /// VPlan. This was added to allow quick lookup of the inloop operations.
1731 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1732
1733 /// Returns the expected difference in cost from scalarizing the expression
1734 /// feeding a predicated instruction \p PredInst. The instructions to
1735 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1736 /// non-negative return value implies the expression will be scalarized.
1737 /// Currently, only single-use chains are considered for scalarization.
1738 InstructionCost computePredInstDiscount(Instruction *PredInst,
1739 ScalarCostsTy &ScalarCosts,
1740 ElementCount VF);
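  // E.g. for a predicated division whose operand is a single-use add,
  //   if (c[i]) out[i] = (a[i] + b[i]) / d[i];
  // scalarizing the add together with the already-predicated division can be
  // cheaper than widening the add and extracting a lane for each scalar use;
  // the discount returned above is meant to capture that difference for the
  // chain feeding \p PredInst (an illustrative sketch, not an exact costing).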
1741
1742 /// Collect the instructions that are uniform after vectorization. An
1743 /// instruction is uniform if we represent it with a single scalar value in
1744 /// the vectorized loop corresponding to each vector iteration. Examples of
1745 /// uniform instructions include pointer operands of consecutive or
1746 /// interleaved memory accesses. Note that although uniformity implies an
1747 /// instruction will be scalar, the reverse is not true. In general, a
1748 /// scalarized instruction will be represented by VF scalar values in the
1749 /// vectorized loop, each corresponding to an iteration of the original
1750 /// scalar loop.
1751 void collectLoopUniforms(ElementCount VF);
1752
1753 /// Collect the instructions that are scalar after vectorization. An
1754 /// instruction is scalar if it is known to be uniform or will be scalarized
1755 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1756 /// to the list if they are used by a load/store instruction that is marked as
1757 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1758 /// VF values in the vectorized loop, each corresponding to an iteration of
1759 /// the original scalar loop.
1760 void collectLoopScalars(ElementCount VF);
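  // E.g. in
  //   for (i = 0; i < n; i++) B[i] = A[i] + 1;
  // the addresses of the consecutive accesses are uniform: one scalar address
  // per vector iteration suffices. An instruction feeding an access the cost
  // model decides to scalarize (CM_Scalarize) would instead be replicated VF
  // times, one copy per lane. This is only a rough illustration; the exact
  // classification depends on the widening decisions taken for the given VF.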
1761
1762 /// Keeps the cost model's vectorization decisions and costs for instructions.
1763 /// Right now it is used only for memory instructions.
1765 std::pair<InstWidening, InstructionCost>>;
1766
1767 DecisionList WideningDecisions;
1768
1769 using CallDecisionList =
1770 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1771
1772 CallDecisionList CallWideningDecisions;
1773
1774 /// Returns true if \p V is expected to be vectorized and it needs to be
1775 /// extracted.
1776 bool needsExtract(Value *V, ElementCount VF) const {
1777 Instruction *I = dyn_cast<Instruction>(V);
1778 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1780 return false;
1781
1782 // Assume we can vectorize V (and hence we need extraction) if the
1783 // scalars are not computed yet. This can happen, because it is called
1784 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1785 // the scalars are collected. That should be a safe assumption in most
1786 // cases, because we check if the operands have vectorizable types
1787 // beforehand in LoopVectorizationLegality.
1788 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1789 };
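  // E.g. if a call is scalarized while its operand is widened, every scalar
  // copy of the call needs an extractelement of the corresponding lane of the
  // widened operand; operands defined outside the loop, or scalar after
  // vectorization, need no such extract. (Illustrative only.)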
1790
1791 /// Returns a range containing only operands needing to be extracted.
1792 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1793 ElementCount VF) const {
1795 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1796 }
1797
1798public:
1799 /// The loop that we evaluate.
1801
1802 /// Predicated scalar evolution analysis.
1804
1805 /// Loop Info analysis.
1807
1808 /// Vectorization legality.
1810
1811 /// Vector target information.
1813
1814 /// Target Library Info.
1816
1817 /// Demanded bits analysis.
1819
1820 /// Assumption cache.
1822
1823 /// Interface to emit optimization remarks.
1825
1827
1828 /// Loop Vectorize Hint.
1830
1831 /// The interleave access information contains groups of interleaved accesses
1832 /// with the same stride and close to each other.
1834
1835 /// Values to ignore in the cost model.
1837
1838 /// Values to ignore in the cost model when VF > 1.
1840
1841 /// All element types found in the loop.
1843};
1844} // end namespace llvm
1845
1846namespace {
1847/// Helper struct to manage generating runtime checks for vectorization.
1848///
1849 /// The runtime checks are created up-front in temporary blocks, un-linked
1850 /// from the existing IR, to allow better estimation of their cost. After
1851 /// deciding to vectorize, the checks are moved back. If deciding not to vectorize, the
1852/// temporary blocks are completely removed.
1853class GeneratedRTChecks {
1854 /// Basic block which contains the generated SCEV checks, if any.
1855 BasicBlock *SCEVCheckBlock = nullptr;
1856
1857 /// The value representing the result of the generated SCEV checks. If it is
1858 /// nullptr, either no SCEV checks have been generated or they have been used.
1859 Value *SCEVCheckCond = nullptr;
1860
1861 /// Basic block which contains the generated memory runtime checks, if any.
1862 BasicBlock *MemCheckBlock = nullptr;
1863
1864 /// The value representing the result of the generated memory runtime checks.
1865 /// If it is nullptr, either no memory runtime checks have been generated or
1866 /// they have been used.
1867 Value *MemRuntimeCheckCond = nullptr;
1868
1869 DominatorTree *DT;
1870 LoopInfo *LI;
1872
1873 SCEVExpander SCEVExp;
1874 SCEVExpander MemCheckExp;
1875
1876 bool CostTooHigh = false;
1877 const bool AddBranchWeights;
1878
1879 Loop *OuterLoop = nullptr;
1880
1881public:
1882 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1884 bool AddBranchWeights)
1885 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1886 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1887
1888 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1889 /// accurately estimate the cost of the runtime checks. The blocks are
1890 /// un-linked from the IR and are added back during vector code generation. If
1891 /// there is no vector code generation, the check blocks are removed
1892 /// completely.
1893 void Create(Loop *L, const LoopAccessInfo &LAI,
1894 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1895
1896 // Hard cutoff to limit compile-time increase in case a very large number of
1897 // runtime checks needs to be generated.
1898 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1899 // profile info.
1900 CostTooHigh =
1902 if (CostTooHigh)
1903 return;
1904
1905 BasicBlock *LoopHeader = L->getHeader();
1906 BasicBlock *Preheader = L->getLoopPreheader();
1907
1908 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1909 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1910 // may be used by SCEVExpander. The blocks will be un-linked from their
1911 // predecessors and removed from LI & DT at the end of the function.
1912 if (!UnionPred.isAlwaysTrue()) {
1913 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1914 nullptr, "vector.scevcheck");
1915
1916 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1917 &UnionPred, SCEVCheckBlock->getTerminator());
1918 }
1919
1920 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1921 if (RtPtrChecking.Need) {
1922 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1923 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1924 "vector.memcheck");
1925
1926 auto DiffChecks = RtPtrChecking.getDiffChecks();
1927 if (DiffChecks) {
1928 Value *RuntimeVF = nullptr;
1929 MemRuntimeCheckCond = addDiffRuntimeChecks(
1930 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1931 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1932 if (!RuntimeVF)
1933 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1934 return RuntimeVF;
1935 },
1936 IC);
1937 } else {
1938 MemRuntimeCheckCond = addRuntimeChecks(
1939 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1941 }
1942 assert(MemRuntimeCheckCond &&
1943 "no RT checks generated although RtPtrChecking "
1944 "claimed checks are required");
1945 }
1946
1947 if (!MemCheckBlock && !SCEVCheckBlock)
1948 return;
1949
1950 // Unhook the temporary block with the checks, update various places
1951 // accordingly.
1952 if (SCEVCheckBlock)
1953 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1954 if (MemCheckBlock)
1955 MemCheckBlock->replaceAllUsesWith(Preheader);
1956
1957 if (SCEVCheckBlock) {
1958 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1959 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1960 Preheader->getTerminator()->eraseFromParent();
1961 }
1962 if (MemCheckBlock) {
1963 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1964 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1965 Preheader->getTerminator()->eraseFromParent();
1966 }
1967
1968 DT->changeImmediateDominator(LoopHeader, Preheader);
1969 if (MemCheckBlock) {
1970 DT->eraseNode(MemCheckBlock);
1971 LI->removeBlock(MemCheckBlock);
1972 }
1973 if (SCEVCheckBlock) {
1974 DT->eraseNode(SCEVCheckBlock);
1975 LI->removeBlock(SCEVCheckBlock);
1976 }
1977
1978 // Outer loop is used as part of the later cost calculations.
1979 OuterLoop = L->getParentLoop();
1980 }
1981
1982 InstructionCost getCost() {
1983 if (SCEVCheckBlock || MemCheckBlock)
1984 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1985
1986 if (CostTooHigh) {
1988 Cost.setInvalid();
1989 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1990 return Cost;
1991 }
1992
1993 InstructionCost RTCheckCost = 0;
1994 if (SCEVCheckBlock)
1995 for (Instruction &I : *SCEVCheckBlock) {
1996 if (SCEVCheckBlock->getTerminator() == &I)
1997 continue;
2000 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2001 RTCheckCost += C;
2002 }
2003 if (MemCheckBlock) {
2004 InstructionCost MemCheckCost = 0;
2005 for (Instruction &I : *MemCheckBlock) {
2006 if (MemCheckBlock->getTerminator() == &I)
2007 continue;
2010 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2011 MemCheckCost += C;
2012 }
2013
2014 // If the runtime memory checks are being created inside an outer loop
2015 // we should find out if these checks are outer loop invariant. If so,
2016 // the checks will likely be hoisted out and so the effective cost will
2017 // be reduced according to the outer loop trip count.
2018 if (OuterLoop) {
2019 ScalarEvolution *SE = MemCheckExp.getSE();
2020 // TODO: If profitable, we could refine this further by analysing every
2021 // individual memory check, since there could be a mixture of loop
2022 // variant and invariant checks that mean the final condition is
2023 // variant.
2024 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2025 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2026 // It seems reasonable to assume that we can reduce the effective
2027 // cost of the checks even when we know nothing about the trip
2028 // count. Assume that the outer loop executes at least twice.
2029 unsigned BestTripCount = 2;
2030
2031 // If exact trip count is known use that.
2032 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2033 BestTripCount = SmallTC;
2035 // Else use profile data if available.
2036 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2037 BestTripCount = *EstimatedTC;
2038 }
2039
2040 BestTripCount = std::max(BestTripCount, 1U);
2041 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2042
2043 // Let's ensure the cost is always at least 1.
2044 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2046
2047 if (BestTripCount > 1)
2049 << "We expect runtime memory checks to be hoisted "
2050 << "out of the outer loop. Cost reduced from "
2051 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2052
2053 MemCheckCost = NewMemCheckCost;
2054 }
2055 }
2056
2057 RTCheckCost += MemCheckCost;
2058 }
2059
2060 if (SCEVCheckBlock || MemCheckBlock)
2061 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2062 << "\n");
2063
2064 return RTCheckCost;
2065 }
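  // E.g. if the memory checks cost 20 and the combined check condition is
  // invariant in an outer loop with an estimated trip count of 10, the
  // effective cost charged above becomes 20 / 10 = 2, and it is never allowed
  // to drop below 1. (Numbers are purely illustrative.)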
2066
2067 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2068 /// unused.
2069 ~GeneratedRTChecks() {
2070 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2071 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2072 if (!SCEVCheckCond)
2073 SCEVCleaner.markResultUsed();
2074
2075 if (!MemRuntimeCheckCond)
2076 MemCheckCleaner.markResultUsed();
2077
2078 if (MemRuntimeCheckCond) {
2079 auto &SE = *MemCheckExp.getSE();
2080 // Memory runtime check generation creates compares that use expanded
2081 // values. Remove them before running the SCEVExpanderCleaners.
2082 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2083 if (MemCheckExp.isInsertedInstruction(&I))
2084 continue;
2085 SE.forgetValue(&I);
2086 I.eraseFromParent();
2087 }
2088 }
2089 MemCheckCleaner.cleanup();
2090 SCEVCleaner.cleanup();
2091
2092 if (SCEVCheckCond)
2093 SCEVCheckBlock->eraseFromParent();
2094 if (MemRuntimeCheckCond)
2095 MemCheckBlock->eraseFromParent();
2096 }
2097
2098 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2099 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2100 /// depending on the generated condition.
2101 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2102 BasicBlock *LoopVectorPreHeader,
2103 BasicBlock *LoopExitBlock) {
2104 if (!SCEVCheckCond)
2105 return nullptr;
2106
2107 Value *Cond = SCEVCheckCond;
2108 // Mark the check as used, to prevent it from being removed during cleanup.
2109 SCEVCheckCond = nullptr;
2110 if (auto *C = dyn_cast<ConstantInt>(Cond))
2111 if (C->isZero())
2112 return nullptr;
2113
2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115
2116 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2117 // Create new preheader for vector loop.
2118 if (OuterLoop)
2119 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2120
2121 SCEVCheckBlock->getTerminator()->eraseFromParent();
2122 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2123 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2124 SCEVCheckBlock);
2125
2126 DT->addNewBlock(SCEVCheckBlock, Pred);
2127 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2128
2129 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2130 if (AddBranchWeights)
2131 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2132 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2133 return SCEVCheckBlock;
2134 }
2135
2136 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2137 /// the branches to branch to the vector preheader or \p Bypass, depending on
2138 /// the generated condition.
2139 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2140 BasicBlock *LoopVectorPreHeader) {
2141 // Check if we generated code that checks at runtime whether arrays overlap.
2142 if (!MemRuntimeCheckCond)
2143 return nullptr;
2144
2145 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2146 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2147 MemCheckBlock);
2148
2149 DT->addNewBlock(MemCheckBlock, Pred);
2150 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2151 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2152
2153 if (OuterLoop)
2154 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2155
2156 BranchInst &BI =
2157 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2158 if (AddBranchWeights) {
2159 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2160 }
2161 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2162 MemCheckBlock->getTerminator()->setDebugLoc(
2163 Pred->getTerminator()->getDebugLoc());
2164
2165 // Mark the check as used, to prevent it from being removed during cleanup.
2166 MemRuntimeCheckCond = nullptr;
2167 return MemCheckBlock;
2168 }
2169};
2170} // namespace
2171
2173 return Style == TailFoldingStyle::Data ||
2174 Style == TailFoldingStyle::DataAndControlFlow ||
2175 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2176}
2177
2179 return Style == TailFoldingStyle::DataAndControlFlow ||
2180 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2181}
2182
2183// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2184// vectorization. The loop needs to be annotated with #pragma omp simd
2185 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2186// vector length information is not provided, vectorization is not considered
2187// explicit. Interleave hints are not allowed either. These limitations will be
2188// relaxed in the future.
2189 // Please note that we are currently forced to abuse the pragma 'clang
2190// vectorize' semantics. This pragma provides *auto-vectorization hints*
2191// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2192// provides *explicit vectorization hints* (LV can bypass legal checks and
2193// assume that vectorization is legal). However, both hints are implemented
2194// using the same metadata (llvm.loop.vectorize, processed by
2195// LoopVectorizeHints). This will be fixed in the future when the native IR
2196// representation for pragma 'omp simd' is introduced.
2197static bool isExplicitVecOuterLoop(Loop *OuterLp,
2199 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2200 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2201
2202 // Only outer loops with an explicit vectorization hint are supported.
2203 // Unannotated outer loops are ignored.
2205 return false;
2206
2207 Function *Fn = OuterLp->getHeader()->getParent();
2208 if (!Hints.allowVectorization(Fn, OuterLp,
2209 true /*VectorizeOnlyWhenForced*/)) {
2210 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2211 return false;
2212 }
2213
2214 if (Hints.getInterleave() > 1) {
2215 // TODO: Interleave support is future work.
2216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2217 "outer loops.\n");
2218 Hints.emitRemarkWithHints();
2219 return false;
2220 }
2221
2222 return true;
2223}
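// E.g. an outer loop written as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < M; ++i)
//     for (int j = 0; j < N; ++j)
//       A[i][j] += B[j];
// carries the required vector-width hint and, absent interleave hints, is
// accepted here for the VPlan-native outer-loop path. (A sketch; legality is
// still checked later, as explained above.)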
2224
2228 // Collect inner loops and outer loops without irreducible control flow. For
2229 // now, only collect outer loops that have explicit vectorization hints. If we
2230 // are stress testing the VPlan H-CFG construction, we collect the outermost
2231 // loop of every loop nest.
2232 if (L.isInnermost() || VPlanBuildStressTest ||
2234 LoopBlocksRPO RPOT(&L);
2235 RPOT.perform(LI);
2236 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2237 V.push_back(&L);
2238 // TODO: Collect inner loops inside marked outer loops in case
2239 // vectorization fails for the outer loop. Do not invoke
2240 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2241 // already known to be reducible. We can use an inherited attribute for
2242 // that.
2243 return;
2244 }
2245 }
2246 for (Loop *InnerL : L)
2247 collectSupportedLoops(*InnerL, LI, ORE, V);
2248}
2249
2250//===----------------------------------------------------------------------===//
2251// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2252// LoopVectorizationCostModel and LoopVectorizationPlanner.
2253//===----------------------------------------------------------------------===//
2254
2255/// Compute the transformed value of Index at offset StartValue using step
2256/// StepValue.
2257/// For integer induction, returns StartValue + Index * StepValue.
2258/// For pointer induction, returns StartValue[Index * StepValue].
2259/// FIXME: The newly created binary instructions should contain nsw/nuw
2260/// flags, which can be found from the original scalar operations.
2261static Value *
2263 Value *Step,
2265 const BinaryOperator *InductionBinOp) {
2266 Type *StepTy = Step->getType();
2267 Value *CastedIndex = StepTy->isIntegerTy()
2268 ? B.CreateSExtOrTrunc(Index, StepTy)
2269 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2270 if (CastedIndex != Index) {
2271 CastedIndex->setName(CastedIndex->getName() + ".cast");
2272 Index = CastedIndex;
2273 }
2274
2275 // Note: the IR at this point is broken. We cannot use SE to create any new
2276 // SCEV and then expand it, hoping that SCEV's simplification will give us
2277 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2278 // lead to various SCEV crashes. So all we can do is use the builder and rely
2279 // on InstCombine for future simplifications. Here we handle some trivial
2280 // cases only.
2281 auto CreateAdd = [&B](Value *X, Value *Y) {
2282 assert(X->getType() == Y->getType() && "Types don't match!");
2283 if (auto *CX = dyn_cast<ConstantInt>(X))
2284 if (CX->isZero())
2285 return Y;
2286 if (auto *CY = dyn_cast<ConstantInt>(Y))
2287 if (CY->isZero())
2288 return X;
2289 return B.CreateAdd(X, Y);
2290 };
2291
2292 // We allow X to be a vector type, in which case Y will potentially be
2293 // splatted into a vector with the same element count.
2294 auto CreateMul = [&B](Value *X, Value *Y) {
2295 assert(X->getType()->getScalarType() == Y->getType() &&
2296 "Types don't match!");
2297 if (auto *CX = dyn_cast<ConstantInt>(X))
2298 if (CX->isOne())
2299 return Y;
2300 if (auto *CY = dyn_cast<ConstantInt>(Y))
2301 if (CY->isOne())
2302 return X;
2303 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2304 if (XVTy && !isa<VectorType>(Y->getType()))
2305 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2306 return B.CreateMul(X, Y);
2307 };
2308
2309 switch (InductionKind) {
2311 assert(!isa<VectorType>(Index->getType()) &&
2312 "Vector indices not supported for integer inductions yet");
2313 assert(Index->getType() == StartValue->getType() &&
2314 "Index type does not match StartValue type");
2315 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2316 return B.CreateSub(StartValue, Index);
2317 auto *Offset = CreateMul(Index, Step);
2318 return CreateAdd(StartValue, Offset);
2319 }
2321 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2323 assert(!isa<VectorType>(Index->getType()) &&
2324 "Vector indices not supported for FP inductions yet");
2325 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2326 assert(InductionBinOp &&
2327 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2328 InductionBinOp->getOpcode() == Instruction::FSub) &&
2329 "Original bin op should be defined for FP induction");
2330
2331 Value *MulExp = B.CreateFMul(Step, Index);
2332 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2333 "induction");
2334 }
2336 return nullptr;
2337 }
2338 llvm_unreachable("invalid enum");
2339}
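// E.g. for an integer induction with StartValue = 7 and Step = 3, the
// transformed value at Index = 4 is 7 + 4 * 3 = 19; for a pointer induction
// it is, roughly, StartValue[4 * 3], emitted above as a ptradd of the scaled
// index. (Illustrative only.)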
2340
2341std::optional<unsigned> getMaxVScale(const Function &F,
2342 const TargetTransformInfo &TTI) {
2343 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2344 return MaxVScale;
2345
2346 if (F.hasFnAttribute(Attribute::VScaleRange))
2347 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2348
2349 return std::nullopt;
2350}
2351
2352 /// For the given VF and UF and maximum trip count computed for the loop, return
2353 /// whether the induction variable is known not to overflow in the vectorized
2354 /// loop, in which case the runtime overflow check always evaluates to false and
2355 /// can be removed.
2358 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2359 // Always be conservative if we don't know the exact unroll factor.
2360 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2361
2362 Type *IdxTy = Cost->Legal->getWidestInductionType();
2363 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2364
2365 // The runtime overflow check is known to be false iff the (max) trip-count
2366 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2367 // the vector loop induction variable.
2368 if (unsigned TC =
2369 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2370 uint64_t MaxVF = VF.getKnownMinValue();
2371 if (VF.isScalable()) {
2372 std::optional<unsigned> MaxVScale =
2373 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2374 if (!MaxVScale)
2375 return false;
2376 MaxVF *= *MaxVScale;
2377 }
2378
2379 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2380 }
2381
2382 return false;
2383}
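// E.g. with an i8 widest induction type (MaxUIntTripCount = 255), a known
// maximum trip count of 200, VF = 8 and UF = 2, the test above checks that
// 255 - 200 = 55 > 8 * 2 = 16, so the overflow check is known false and can
// be dropped; with a maximum trip count of 250 it could not be. (Numbers are
// illustrative only.)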
2384
2385// Return whether we allow using masked interleave-groups (for dealing with
2386// strided loads/stores that reside in predicated blocks, or for dealing
2387// with gaps).
2389 // If an override option has been passed in for interleaved accesses, use it.
2390 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2392
2394}
2395
2396// Try to vectorize the interleave group that \p Instr belongs to.
2397//
2398// E.g. Translate following interleaved load group (factor = 3):
2399// for (i = 0; i < N; i+=3) {
2400// R = Pic[i]; // Member of index 0
2401// G = Pic[i+1]; // Member of index 1
2402// B = Pic[i+2]; // Member of index 2
2403// ... // do something to R, G, B
2404// }
2405// To:
2406// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2407// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2408// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2409// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2410//
2411// Or translate following interleaved store group (factor = 3):
2412// for (i = 0; i < N; i+=3) {
2413// ... do something to R, G, B
2414// Pic[i] = R; // Member of index 0
2415// Pic[i+1] = G; // Member of index 1
2416// Pic[i+2] = B; // Member of index 2
2417// }
2418// To:
2419// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2420// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2421// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2422// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2423// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2426 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2427 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2428 Instruction *Instr = Group->getInsertPos();
2429 const DataLayout &DL = Instr->getDataLayout();
2430
2431 // Prepare for the vector type of the interleaved load/store.
2432 Type *ScalarTy = getLoadStoreType(Instr);
2433 unsigned InterleaveFactor = Group->getFactor();
2434 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
2435
2436 // Prepare for the new pointers.
2437 SmallVector<Value *, 2> AddrParts;
2438 unsigned Index = Group->getIndex(Instr);
2439
2440 // TODO: extend the masked interleaved-group support to reversed access.
2441 assert((!BlockInMask || !Group->isReverse()) &&
2442 "Reversed masked interleave-group not supported.");
2443
2444 Value *Idx;
2445 // If the group is reversed, adjust the index to refer to the last vector lane
2446 // instead of the first. We adjust the index from the first vector lane,
2447 // rather than directly getting the pointer for lane VF - 1, because the
2448 // pointer operand of the interleaved access is supposed to be uniform. For
2449 // uniform instructions, we're only required to generate a value for the
2450 // first vector lane in each unroll iteration.
2451 if (Group->isReverse()) {
2452 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
2453 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2457 } else
2459
2460 for (unsigned Part = 0; Part < State.UF; Part++) {
2461 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2462 if (auto *I = dyn_cast<Instruction>(AddrPart))
2463 State.setDebugLocFrom(I->getDebugLoc());
2464
2465 // Note that the current instruction could be at any member index. We need to
2466 // adjust the address to the member at index 0.
2467 //
2468 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2469 // b = A[i]; // Member of index 0
2470 // The current pointer points to A[i+1]; adjust it to A[i].
2471 //
2472 // E.g. A[i+1] = a; // Member of index 1
2473 // A[i] = b; // Member of index 0
2474 // A[i+2] = c; // Member of index 2 (Current instruction)
2475 // The current pointer points to A[i+2]; adjust it to A[i].
2476
2477 bool InBounds = false;
2478 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2479 InBounds = gep->isInBounds();
2480 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2481 AddrParts.push_back(AddrPart);
2482 }
2483
2484 State.setDebugLocFrom(Instr->getDebugLoc());
2485 Value *PoisonVec = PoisonValue::get(VecTy);
2486
2487 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2488 unsigned Part, Value *MaskForGaps) -> Value * {
2489 if (State.VF.isScalable()) {
2490 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2491 assert(InterleaveFactor == 2 &&
2492 "Unsupported deinterleave factor for scalable vectors");
2493 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2494 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2495 auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
2496 State.VF.getKnownMinValue() * 2, true);
2497 return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
2498 /*FMFSource=*/nullptr, "interleaved.mask");
2499 }
2500
2501 if (!BlockInMask)
2502 return MaskForGaps;
2503
2504 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2505 Value *ShuffledMask = Builder.CreateShuffleVector(
2506 BlockInMaskPart,
2507 createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
2508 "interleaved.mask");
2509 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2510 MaskForGaps)
2511 : ShuffledMask;
2512 };
2513
2514 // Vectorize the interleaved load group.
2515 if (isa<LoadInst>(Instr)) {
2516 Value *MaskForGaps = nullptr;
2517 if (NeedsMaskForGaps) {
2518 MaskForGaps =
2520 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2521 }
2522
2523 // For each unroll part, create a wide load for the group.
2524 SmallVector<Value *, 2> NewLoads;
2525 for (unsigned Part = 0; Part < State.UF; Part++) {
2526 Instruction *NewLoad;
2527 if (BlockInMask || MaskForGaps) {
2529 "masked interleaved groups are not allowed.");
2530 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2531 NewLoad =
2532 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2533 GroupMask, PoisonVec, "wide.masked.vec");
2534 }
2535 else
2536 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2537 Group->getAlign(), "wide.vec");
2538 Group->addMetadata(NewLoad);
2539 NewLoads.push_back(NewLoad);
2540 }
2541
2542 if (VecTy->isScalableTy()) {
2543 assert(InterleaveFactor == 2 &&
2544 "Unsupported deinterleave factor for scalable vectors");
2545
2546 for (unsigned Part = 0; Part < State.UF; ++Part) {
2547 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2548 // so must use intrinsics to deinterleave.
2550 Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
2551 /*FMFSource=*/nullptr, "strided.vec");
2552 unsigned J = 0;
2553 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2554 Instruction *Member = Group->getMember(I);
2555
2556 if (!Member)
2557 continue;
2558
2559 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2560 // If this member has a different type, cast the result to that type.
2561 if (Member->getType() != ScalarTy) {
2562 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2563 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2564 }
2565
2566 if (Group->isReverse())
2567 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2568
2569 State.set(VPDefs[J], StridedVec, Part);
2570 ++J;
2571 }
2572 }
2573
2574 return;
2575 }
2576
2577 // For each member in the group, shuffle out the appropriate data from the
2578 // wide loads.
2579 unsigned J = 0;
2580 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2581 Instruction *Member = Group->getMember(I);
2582
2583 // Skip the gaps in the group.
2584 if (!Member)
2585 continue;
2586
2587 auto StrideMask =
2588 createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
2589 for (unsigned Part = 0; Part < State.UF; Part++) {
2590 Value *StridedVec = Builder.CreateShuffleVector(
2591 NewLoads[Part], StrideMask, "strided.vec");
2592
2593 // If this member has a different type, cast the result to that type.
2594 if (Member->getType() != ScalarTy) {
2595 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2596 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
2597 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2598 }
2599
2600 if (Group->isReverse())
2601 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2602
2603 State.set(VPDefs[J], StridedVec, Part);
2604 }
2605 ++J;
2606 }
2607 return;
2608 }
2609
2610 // The sub-vector type for the current instruction.
2611 auto *SubVT = VectorType::get(ScalarTy, State.VF);
2612
2613 // Vectorize the interleaved store group.
2614 Value *MaskForGaps =
2616 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2617 "masked interleaved groups are not allowed.");
2618 assert((!MaskForGaps || !State.VF.isScalable()) &&
2619 "masking gaps for scalable vectors is not yet supported.");
2620 for (unsigned Part = 0; Part < State.UF; Part++) {
2621 // Collect the stored vector from each member.
2622 SmallVector<Value *, 4> StoredVecs;
2623 unsigned StoredIdx = 0;
2624 for (unsigned i = 0; i < InterleaveFactor; i++) {
2625 assert((Group->getMember(i) || MaskForGaps) &&
2626 "Fail to get a member from an interleaved store group");
2627 Instruction *Member = Group->getMember(i);
2628
2629 // Skip the gaps in the group.
2630 if (!Member) {
2631 Value *Undef = PoisonValue::get(SubVT);
2632 StoredVecs.push_back(Undef);
2633 continue;
2634 }
2635
2636 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2637 ++StoredIdx;
2638
2639 if (Group->isReverse())
2640 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2641
2642 // If this member has a different type, cast it to a unified type.
2643
2644 if (StoredVec->getType() != SubVT)
2645 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2646
2647 StoredVecs.push_back(StoredVec);
2648 }
2649
2650 // Interleave all the smaller vectors into one wider vector.
2651 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2652 Instruction *NewStoreInstr;
2653 if (BlockInMask || MaskForGaps) {
2654 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2655 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2656 Group->getAlign(), GroupMask);
2657 } else
2658 NewStoreInstr =
2659 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2660
2661 Group->addMetadata(NewStoreInstr);
2662 }
2663}
2664
2666 VPReplicateRecipe *RepRecipe,
2667 const VPIteration &Instance,
2668 VPTransformState &State) {
2669 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2670
2671 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2672 // the first lane and part.
2673 if (isa<NoAliasScopeDeclInst>(Instr))
2674 if (!Instance.isFirstIteration())
2675 return;
2676
2677 // Does this instruction return a value?
2678 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2679
2680 Instruction *Cloned = Instr->clone();
2681 if (!IsVoidRetTy) {
2682 Cloned->setName(Instr->getName() + ".cloned");
2683#if !defined(NDEBUG)
2684 // Verify that VPlan type inference results agree with the type of the
2685 // generated values.
2686 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2687 "inferred type and type from generated instructions do not match");
2688#endif
2689 }
2690
2691 RepRecipe->setFlags(Cloned);
2692
2693 if (auto DL = Instr->getDebugLoc())
2694 State.setDebugLocFrom(DL);
2695
2696 // Replace the operands of the cloned instructions with their scalar
2697 // equivalents in the new loop.
2698 for (const auto &I : enumerate(RepRecipe->operands())) {
2699 auto InputInstance = Instance;
2700 VPValue *Operand = I.value();
2702 InputInstance.Lane = VPLane::getFirstLane();
2703 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2704 }
2705 State.addNewMetadata(Cloned, Instr);
2706
2707 // Place the cloned scalar in the new loop.
2708 State.Builder.Insert(Cloned);
2709
2710 State.set(RepRecipe, Cloned, Instance);
2711
2712 // If we just cloned a new assumption, add it the assumption cache.
2713 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2715
2716 // End if-block.
2717 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2718 if (IfPredicateInstr)
2719 PredicatedInstructions.push_back(Cloned);
2720}
2721
2722Value *
2724 if (VectorTripCount)
2725 return VectorTripCount;
2726
2727 Value *TC = getTripCount();
2728 IRBuilder<> Builder(InsertBlock->getTerminator());
2729
2730 Type *Ty = TC->getType();
2731 // This is where we can make the step a runtime constant.
2732 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2733
2734 // If the tail is to be folded by masking, round the number of iterations N
2735 // up to a multiple of Step instead of rounding down. This is done by first
2736 // adding Step-1 and then rounding down. Note that it's ok if this addition
2737 // overflows: the vector induction variable will eventually wrap to zero given
2738 // that it starts at zero and its Step is a power of two; the loop will then
2739 // exit, with the last early-exit vector comparison also producing all-true.
2740 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2741 // is accounted for in emitIterationCountCheck that adds an overflow check.
2742 if (Cost->foldTailByMasking()) {
2744 "VF*UF must be a power of 2 when folding tail by masking");
2745 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2746 "n.rnd.up");
2747 }
2748
2749 // Now we need to generate the expression for the part of the loop that the
2750 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2751 // iterations are not required for correctness, or N - Step, otherwise. Step
2752 // is equal to the vectorization factor (number of SIMD elements) times the
2753 // unroll factor (number of SIMD instructions).
2754 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2755
2756 // There are cases where we *must* run at least one iteration in the remainder
2757 // loop. See the cost model for when this can happen. If the step evenly
2758 // divides the trip count, we set the remainder to be equal to the step. If
2759 // the step does not evenly divide the trip count, no adjustment is necessary
2760 // since there will already be scalar iterations. Note that the minimum
2761 // iterations check ensures that N >= Step.
2762 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2763 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2764 R = Builder.CreateSelect(IsZero, Step, R);
2765 }
2766
2767 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2768
2769 return VectorTripCount;
2770}
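// E.g. with a trip count of 10 and Step = VF * UF = 8: without tail folding,
// n.mod.vf = 2 and n.vec = 8, leaving 2 iterations for the scalar remainder;
// with tail folding, the count is first rounded up (10 + 7 = 17, so
// n.vec = 16) and the masked vector loop covers all 10 original iterations.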
2771
2773 const DataLayout &DL) {
2774 // Verify that V is a vector type with the same number of elements as DstVTy.
2775 auto VF = DstVTy->getElementCount();
2776 auto *SrcVecTy = cast<VectorType>(V->getType());
2777 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2778 Type *SrcElemTy = SrcVecTy->getElementType();
2779 Type *DstElemTy = DstVTy->getElementType();
2780 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2781 "Vector elements must have same size");
2782
2783 // Do a direct cast if element types are castable.
2784 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2785 return Builder.CreateBitOrPointerCast(V, DstVTy);
2786 }
2787 // V cannot be directly cast to the desired vector type.
2788 // This may happen when V is a floating point vector but DstVTy is a vector of
2789 // pointers, or vice-versa. Handle this with a two-step bitcast using an
2790 // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2791 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2792 "Only one type should be a pointer type");
2793 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2794 "Only one type should be a floating point type");
2795 Type *IntTy =
2796 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2797 auto *VecIntTy = VectorType::get(IntTy, VF);
2798 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2799 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2800}
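// E.g. casting <4 x double> to <4 x ptr> (assuming 64-bit pointers, so the
// element sizes match) is done as <4 x double> -> <4 x i64> -> <4 x ptr>,
// since a direct bitcast between floating-point and pointer vectors is not
// legal.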
2801
2803 Value *Count = getTripCount();
2804 // Reuse existing vector loop preheader for TC checks.
2805 // Note that a new preheader block is generated for the vector loop.
2806 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2807 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2808
2809 // Generate code to check if the loop's trip count is less than VF * UF, or
2810 // equal to it in case a scalar epilogue is required; this implies that the
2811 // vector trip count is zero. This check also covers the case where adding one
2812 // to the backedge-taken count overflowed leading to an incorrect trip count
2813 // of zero. In this case we will also jump to the scalar loop.
2814 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2816
2817 // If tail is to be folded, vector loop takes care of all iterations.
2818 Type *CountTy = Count->getType();
2819 Value *CheckMinIters = Builder.getFalse();
2820 auto CreateStep = [&]() -> Value * {
2821 // Create step with max(MinProTripCount, UF * VF).
2823 return createStepForVF(Builder, CountTy, VF, UF);
2824
2825 Value *MinProfTC =
2827 if (!VF.isScalable())
2828 return MinProfTC;
2830 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2831 };
2832
2833 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2834 if (Style == TailFoldingStyle::None)
2835 CheckMinIters =
2836 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2837 else if (VF.isScalable() &&
2840 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2841 // an overflow to zero when updating induction variables and so an
2842 // additional overflow check is required before entering the vector loop.
2843
2844 // Get the maximum unsigned value for the type.
2845 Value *MaxUIntTripCount =
2846 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2847 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2848
2849 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2850 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2851 }
2852
2853 // Create new preheader for vector loop.
2855 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2856 "vector.ph");
2857
2858 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2859 DT->getNode(Bypass)->getIDom()) &&
2860 "TC check is expected to dominate Bypass");
2861
2862 // Update dominator for Bypass & LoopExit (if needed).
2863 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2864 BranchInst &BI =
2865 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2867 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2868 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2869 LoopBypassBlocks.push_back(TCCheckBlock);
2870}
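// E.g. with VF = 4, UF = 2 and no tail folding, the minimum-iteration check
// sends execution to the scalar loop when the trip count is below 8 (or not
// above 8 when a scalar epilogue is required). This is a rough sketch: the
// step compared against is really max(MinProfTripCount, VF * UF), and for
// scalable VFs an additional overflow check may be emitted, as above.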
2871
2873 BasicBlock *const SCEVCheckBlock =
2874 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2875 if (!SCEVCheckBlock)
2876 return nullptr;
2877
2878 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2880 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2881 "Cannot SCEV check stride or overflow when optimizing for size");
2882
2883
2884 // Update dominator only if this is first RT check.
2885 if (LoopBypassBlocks.empty()) {
2886 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2887 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2888 // If there is an epilogue which must run, there's no edge from the
2889 // middle block to exit blocks and thus no need to update the immediate
2890 // dominator of the exit blocks.
2891 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2892 }
2893
2894 LoopBypassBlocks.push_back(SCEVCheckBlock);
2895 AddedSafetyChecks = true;
2896 return SCEVCheckBlock;
2897}
2898
2900 // VPlan-native path does not do any analysis for runtime checks currently.
2902 return nullptr;
2903
2904 BasicBlock *const MemCheckBlock =
2905 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2906
2907 // Check if we generated code that checks at runtime whether arrays overlap. We put
2908 // the checks into a separate block to make the more common case of few
2909 // elements faster.
2910 if (!MemCheckBlock)
2911 return nullptr;
2912
2913 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2914 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2915 "Cannot emit memory checks when optimizing for size, unless forced "
2916 "to vectorize.");
2917 ORE->emit([&]() {
2918 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2921 << "Code-size may be reduced by not forcing "
2922 "vectorization, or by source-code modifications "
2923 "eliminating the need for runtime checks "
2924 "(e.g., adding 'restrict').";
2925 });
2926 }
2927
2928 LoopBypassBlocks.push_back(MemCheckBlock);
2929
2930 AddedSafetyChecks = true;
2931
2932 return MemCheckBlock;
2933}
2934
2938 assert(LoopVectorPreHeader && "Invalid loop structure");
2939 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2940 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2941 "multiple exit loop without required epilogue?");
2942
2945 LI, nullptr, Twine(Prefix) + "middle.block");
2948 nullptr, Twine(Prefix) + "scalar.ph");
2949}
2950
2952 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2953 ArrayRef<BasicBlock *> BypassBlocks,
2954 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2956 assert(VectorTripCount && "Expected valid arguments");
2957
2958 Instruction *OldInduction = Legal->getPrimaryInduction();
2959 Value *&EndValue = IVEndValues[OrigPhi];
2960 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2961 if (OrigPhi == OldInduction) {
2962 // We know what the end value is.
2963 EndValue = VectorTripCount;
2964 } else {
2966
2967 // Fast-math-flags propagate from the original induction instruction.
2968 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
2969 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2970
2971 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2972 Step, II.getKind(), II.getInductionBinOp());
2973 EndValue->setName("ind.end");
2974
2975 // Compute the end value for the additional bypass (if applicable).
2976 if (AdditionalBypass.first) {
2977 B.SetInsertPoint(AdditionalBypass.first,
2978 AdditionalBypass.first->getFirstInsertionPt());
2979 EndValueFromAdditionalBypass =
2980 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
2981 Step, II.getKind(), II.getInductionBinOp());
2982 EndValueFromAdditionalBypass->setName("ind.end");
2983 }
2984 }
2985
2986 // Create phi nodes to merge from the backedge-taken check block.
2987 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2989 // Copy original phi DL over to the new one.
2990 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2991
2992 // The new PHI merges the original incoming value, in case of a bypass,
2993 // or the value at the end of the vectorized loop.
2994 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2995
2996 // Fix the scalar body counter (PHI node).
2997 // The old induction's phi node in the scalar body needs the truncated
2998 // value.
2999 for (BasicBlock *BB : BypassBlocks)
3000 BCResumeVal->addIncoming(II.getStartValue(), BB);
3001
3002 if (AdditionalBypass.first)
3003 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3004 EndValueFromAdditionalBypass);
3005 return BCResumeVal;
3006}
3007
3008/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3009/// expansion results.
3011 const SCEV2ValueTy &ExpandedSCEVs) {
3012 const SCEV *Step = ID.getStep();
3013 if (auto *C = dyn_cast<SCEVConstant>(Step))
3014 return C->getValue();
3015 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3016 return U->getValue();
3017 auto I = ExpandedSCEVs.find(Step);
3018 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3019 return I->second;
3020}
3021
3023 const SCEV2ValueTy &ExpandedSCEVs,
3024 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3025 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3026 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3027 "Inconsistent information about additional bypass.");
3028 // We are going to resume the execution of the scalar loop.
3029 // Go over all of the induction variables that we found and fix the
3030 // PHIs that are left in the scalar version of the loop.
3031 // The starting values of PHI nodes depend on the counter of the last
3032 // iteration in the vectorized loop.
3033 // If we come from a bypass edge then we need to start from the original
3034 // start value.
3035 for (const auto &InductionEntry : Legal->getInductionVars()) {
3036 PHINode *OrigPhi = InductionEntry.first;
3037 const InductionDescriptor &II = InductionEntry.second;
3038 PHINode *BCResumeVal = createInductionResumeValue(
3039 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3040 AdditionalBypass);
3041 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3042 }
3043}
3044
3045std::pair<BasicBlock *, Value *>
3047 const SCEV2ValueTy &ExpandedSCEVs) {
3048 /*
3049 In this function we generate a new loop. The new loop will contain
3050 the vectorized instructions while the old loop will continue to run the
3051 scalar remainder.
3052
3053 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3054 / | preheader are expanded here. Eventually all required SCEV
3055 / | expansion should happen here.
3056 / v
3057 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3058 | / |
3059 | / v
3060 || [ ] <-- vector pre header.
3061 |/ |
3062 | v
3063 | [ ] \
3064 | [ ]_| <-- vector loop (created during VPlan execution).
3065 | |
3066 | v
3067 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
3068 | | successors created during VPlan execution)
3069 \/ |
3070 /\ v
3071 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
3072 | |
3073 (opt) v <-- edge from middle to exit iff epilogue is not required.
3074 | [ ] \
3075 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3076 \ |
3077 \ v
3078 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
3079 ...
3080 */
3081
3082 // Create an empty vector loop, and prepare basic blocks for the runtime
3083 // checks.
3085
3086 // Now, compare the new count to zero. If it is zero skip the vector loop and
3087 // jump to the scalar loop. This check also covers the case where the
3088 // backedge-taken count is uint##_max: adding one to it will overflow leading
3089 // to an incorrect trip count of zero. In this (rare) case we will also jump
3090 // to the scalar loop.
3092
3093 // Generate the code to check any assumptions that we've made for SCEV
3094 // expressions.
3096
3097 // Generate the code that checks at runtime whether arrays overlap. We put the
3098 // checks into a separate block to make the more common case of few elements
3099 // faster.
3101
3102 // Emit phis for the new starting index of the scalar loop.
3103 createInductionResumeValues(ExpandedSCEVs);
3104
3105 return {LoopVectorPreHeader, nullptr};
3106}
3107
3108// Fix up external users of the induction variable. At this point, we are
3109// in LCSSA form, with all external PHIs that use the IV having one input value,
3110// coming from the remainder loop. We need those PHIs to also have a correct
3111// value for the IV when arriving directly from the middle block.
3113 const InductionDescriptor &II,
3114 Value *VectorTripCount, Value *EndValue,
3115 BasicBlock *MiddleBlock,
3116 BasicBlock *VectorHeader, VPlan &Plan,
3117 VPTransformState &State) {
3118 // There are two kinds of external IV usages - those that use the value
3119 // computed in the last iteration (the PHI) and those that use the penultimate
3120 // value (the value that feeds into the phi from the loop latch).
3121 // We allow both, but they, obviously, have different values.
3122
3123 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3124
3125 DenseMap<Value *, Value *> MissingVals;
3126
3127 // An external user of the last iteration's value should see the value that
3128 // the remainder loop uses to initialize its own IV.
3130 for (User *U : PostInc->users()) {
3131 Instruction *UI = cast<Instruction>(U);
3132 if (!OrigLoop->contains(UI)) {
3133 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3134 MissingVals[UI] = EndValue;
3135 }
3136 }
3137
3138 // An external user of the penultimate value needs to see EndValue - Step.
3139 // The simplest way to get this is to recompute it from the constituent SCEVs,
3140 // that is Start + (Step * (CRD - 1)).
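// As a worked instance (values assumed for illustration): with Start = 0,
// Step = 2 and a vector trip count CRD = 8, the escaping value is
// 0 + 2 * (8 - 1) = 14.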
3141 for (User *U : OrigPhi->users()) {
3142 auto *UI = cast<Instruction>(U);
3143 if (!OrigLoop->contains(UI)) {
3144 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3145 IRBuilder<> B(MiddleBlock->getTerminator());
3146
3147 // Fast-math-flags propagate from the original induction instruction.
3148 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3149 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3150
3151 Value *CountMinusOne = B.CreateSub(
3152 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3153 CountMinusOne->setName("cmo");
3154
3155 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3156 assert(StepVPV && "step must have been expanded during VPlan execution");
3157 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3158 : State.get(StepVPV, {0, 0});
3159 Value *Escape =
3160 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3161 II.getKind(), II.getInductionBinOp());
3162 Escape->setName("ind.escape");
3163 MissingVals[UI] = Escape;
3164 }
3165 }
3166
3167 for (auto &I : MissingVals) {
3168 PHINode *PHI = cast<PHINode>(I.first);
3169 // One corner case we have to handle is two IVs "chasing" each other,
3170 // that is %IV2 = phi [...], [ %IV1, %latch ]
3171 // In this case, if IV1 has an external use, we need to avoid adding both
3172 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3173 // don't already have an incoming value for the middle block.
3174 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3175 PHI->addIncoming(I.second, MiddleBlock);
3176 Plan.removeLiveOut(PHI);
3177 }
3178 }
3179}
3180
3181namespace {
3182
3183struct CSEDenseMapInfo {
3184 static bool canHandle(const Instruction *I) {
3185 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3186 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3187 }
3188
3189 static inline Instruction *getEmptyKey() {
3191 }
3192
3193 static inline Instruction *getTombstoneKey() {
3195 }
3196
3197 static unsigned getHashValue(const Instruction *I) {
3198 assert(canHandle(I) && "Unknown instruction!");
3199 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3200 I->value_op_end()));
3201 }
3202
3203 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3204 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3205 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3206 return LHS == RHS;
3207 return LHS->isIdenticalTo(RHS);
3208 }
3209};
3210
3211} // end anonymous namespace
3212
3213 /// Perform CSE of induction-variable instructions.
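/// For illustration, a minimal sketch (hypothetical IR): two identical
/// address computations produced while widening an induction, such as
///   %gep.a = getelementptr i32, ptr %p, i64 %iv
///   %gep.b = getelementptr i32, ptr %p, i64 %iv
/// are merged, replacing all uses of the second with the first.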
3214static void cse(BasicBlock *BB) {
3215 // Perform simple cse.
3217 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3218 if (!CSEDenseMapInfo::canHandle(&In))
3219 continue;
3220
3221 // Check if we can replace this instruction with any of the
3222 // visited instructions.
3223 if (Instruction *V = CSEMap.lookup(&In)) {
3224 In.replaceAllUsesWith(V);
3225 In.eraseFromParent();
3226 continue;
3227 }
3228
3229 CSEMap[&In] = &In;
3230 }
3231}
3232
3235 ElementCount VF) const {
3236 // We only need to calculate a cost if the VF is scalar; for actual vectors
3237 // we should already have a pre-calculated cost at each VF.
3238 if (!VF.isScalar())
3239 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3240
3242 Type *RetTy = CI->getType();
3244 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3245 return *RedCost;
3246
3248 for (auto &ArgOp : CI->args())
3249 Tys.push_back(ArgOp->getType());
3250
3251 InstructionCost ScalarCallCost =
3253
3254 // If this is an intrinsic we may have a lower cost for it.
3256 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3257 return std::min(ScalarCallCost, IntrinsicCost);
3258 }
3259 return ScalarCallCost;
3260}
3261
3263 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3264 return Elt;
3265 return VectorType::get(Elt, VF);
3266}
3267
3270 ElementCount VF) const {
3272 assert(ID && "Expected intrinsic call!");
3273 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3274 FastMathFlags FMF;
3275 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3276 FMF = FPMO->getFastMathFlags();
3277
3280 SmallVector<Type *> ParamTys;
3281 std::transform(FTy->param_begin(), FTy->param_end(),
3282 std::back_inserter(ParamTys),
3283 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3284
3285 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3286 dyn_cast<IntrinsicInst>(CI));
3287 return TTI.getIntrinsicInstrCost(CostAttrs,
3289}
3290
3292 VPlan &Plan) {
3293 // Fix widened non-induction PHIs by setting up the PHI operands.
3295 fixNonInductionPHIs(Plan, State);
3296
3297 // Forget the original basic block.
3300
3301 // After vectorization, the exit blocks of the original loop will have
3302 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3303 // looked through single-entry phis.
3304 SmallVector<BasicBlock *> ExitBlocks;
3305 OrigLoop->getExitBlocks(ExitBlocks);
3306 for (BasicBlock *Exit : ExitBlocks)
3307 for (PHINode &PN : Exit->phis())
3309
3310 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3311 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3312 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3313 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3314 // No edge from the middle block to the unique exit block has been inserted
3315 // and there is nothing to fix from the vector loop; phis should have
3316 // incoming values from the scalar loop only.
3317 } else {
3318 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3319 // the cost model.
3320
3321 // If we inserted an edge from the middle block to the unique exit block,
3322 // update uses outside the loop (phis) to account for the newly inserted
3323 // edge.
3324
3325 // Fix-up external users of the induction variables.
3326 for (const auto &Entry : Legal->getInductionVars())
3327 fixupIVUsers(Entry.first, Entry.second,
3329 IVEndValues[Entry.first], LoopMiddleBlock,
3330 VectorLoop->getHeader(), Plan, State);
3331 }
3332
3333 // Fix live-out phis not already fixed earlier.
3334 for (const auto &KV : Plan.getLiveOuts())
3335 KV.second->fixPhi(Plan, State);
3336
3338 sinkScalarOperands(&*PI);
3339
3340 // Remove redundant induction instructions.
3341 cse(VectorLoop->getHeader());
3342
3343 // Set/update profile weights for the vector and remainder loops as original
3344 // loop iterations are now distributed among them. Note that the original
3345 // loop, i.e. LoopScalarBody, becomes the remainder loop after vectorization.
3346 //
3347 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3348 // end up with a slightly less precise result, but that should be OK since
3349 // profile is not inherently precise anyway. Note also possible bypass of
3350 // vector code caused by legality checks is ignored, assigning all the weight
3351 // to the vector loop, optimistically.
3352 //
3353 // For scalable vectorization we can't know at compile time how many
3354 // iterations of the loop are handled in one vector iteration, so instead
3355 // assume a pessimistic vscale of '1'.
3358 VF.getKnownMinValue() * UF);
3359}
3360
3362 // The basic block and loop containing the predicated instruction.
3363 auto *PredBB = PredInst->getParent();
3364 auto *VectorLoop = LI->getLoopFor(PredBB);
3365
3366 // Initialize a worklist with the operands of the predicated instruction.
3367 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3368
3369 // Holds instructions that we need to analyze again. An instruction may be
3370 // reanalyzed if we don't yet know if we can sink it or not.
3371 SmallVector<Instruction *, 8> InstsToReanalyze;
3372
3373 // Returns true if a given use occurs in the predicated block. Phi nodes use
3374 // their operands in their corresponding predecessor blocks.
3375 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3376 auto *I = cast<Instruction>(U.getUser());
3377 BasicBlock *BB = I->getParent();
3378 if (auto *Phi = dyn_cast<PHINode>(I))
3379 BB = Phi->getIncomingBlock(
3380 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3381 return BB == PredBB;
3382 };
3383
3384 // Iteratively sink the scalarized operands of the predicated instruction
3385 // into the block we created for it. When an instruction is sunk, its
3386 // operands are then added to the worklist. The algorithm ends when one pass
3387 // through the worklist fails to sink a single instruction.
3388 bool Changed;
3389 do {
3390 // Add the instructions that need to be reanalyzed to the worklist, and
3391 // reset the changed indicator.
3392 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3393 InstsToReanalyze.clear();
3394 Changed = false;
3395
3396 while (!Worklist.empty()) {
3397 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3398
3399 // We can't sink an instruction if it is a phi node, is not in the loop,
3400 // may have side effects or may read from memory.
3401 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3402 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3403 I->mayHaveSideEffects() || I->mayReadFromMemory())
3404 continue;
3405
3406 // If the instruction is already in PredBB, check if we can sink its
3407 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3408 // sinking the scalar instruction I, hence it appears in PredBB; but it
3409 // may have failed to sink I's operands (recursively), which we try
3410 // (again) here.
3411 if (I->getParent() == PredBB) {
3412 Worklist.insert(I->op_begin(), I->op_end());
3413 continue;
3414 }
3415
3416 // It's legal to sink the instruction if all its uses occur in the
3417 // predicated block. Otherwise, there's nothing to do yet, and we may
3418 // need to reanalyze the instruction.
3419 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3420 InstsToReanalyze.push_back(I);
3421 continue;
3422 }
3423
3424 // Move the instruction to the beginning of the predicated block, and add
3425 // its operands to the worklist.
3426 I->moveBefore(&*PredBB->getFirstInsertionPt());
3427 Worklist.insert(I->op_begin(), I->op_end());
3428
3429 // The sinking may have enabled other instructions to be sunk, so we will
3430 // need to iterate.
3431 Changed = true;
3432 }
3433 } while (Changed);
3434}
3435
3437 VPTransformState &State) {
3438 auto Iter = vp_depth_first_deep(Plan.getEntry());
3439 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3440 for (VPRecipeBase &P : VPBB->phis()) {
3441 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3442 if (!VPPhi)
3443 continue;
3444 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3445 // Make sure the builder has a valid insert point.
3446 Builder.SetInsertPoint(NewPhi);
3447 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3448 VPValue *Inc = VPPhi->getIncomingValue(i);
3449 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3450 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3451 }
3452 }
3453 }
3454}
3455
3456void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3457 // We should not collect Scalars more than once per VF. Right now, this
3458 // function is called from collectUniformsAndScalars(), which already does
3459 // this check. Collecting Scalars for VF=1 does not make any sense.
3460 assert(VF.isVector() && !Scalars.contains(VF) &&
3461 "This function should not be visited twice for the same VF");
3462
3463 // This avoids any chances of creating a REPLICATE recipe during planning
3464 // since that would result in generation of scalarized code during execution,
3465 // which is not supported for scalable vectors.
3466 if (VF.isScalable()) {
3467 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3468 return;
3469 }
3470
3472
3473 // These sets are used to seed the analysis with pointers used by memory
3474 // accesses that will remain scalar.
3476 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3477 auto *Latch = TheLoop->getLoopLatch();
3478
3479 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3480 // The pointer operands of loads and stores will be scalar as long as the
3481 // memory access is not a gather or scatter operation. The value operand of a
3482 // store will remain scalar if the store is scalarized.
3483 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3484 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3485 assert(WideningDecision != CM_Unknown &&
3486 "Widening decision should be ready at this moment");
3487 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3488 if (Ptr == Store->getValueOperand())
3489 return WideningDecision == CM_Scalarize;
3490 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3491 "Ptr is neither a value or pointer operand");
3492 return WideningDecision != CM_GatherScatter;
3493 };
3494
3495 // A helper that returns true if the given value is a bitcast or
3496 // getelementptr instruction contained in the loop.
3497 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3498 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3499 isa<GetElementPtrInst>(V)) &&
3501 };
3502
3503 // A helper that evaluates a memory access's use of a pointer. If the use will
3504 // be a scalar use and the pointer is only used by memory accesses, we place
3505 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3506 // PossibleNonScalarPtrs.
3507 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3508 // We only care about bitcast and getelementptr instructions contained in
3509 // the loop.
3510 if (!isLoopVaryingBitCastOrGEP(Ptr))
3511 return;
3512
3513 // If the pointer has already been identified as scalar (e.g., if it was
3514 // also identified as uniform), there's nothing to do.
3515 auto *I = cast<Instruction>(Ptr);
3516 if (Worklist.count(I))
3517 return;
3518
3519 // If the use of the pointer will be a scalar use, and all users of the
3520 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3521 // place the pointer in PossibleNonScalarPtrs.
3522 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3523 return isa<LoadInst>(U) || isa<StoreInst>(U);
3524 }))
3525 ScalarPtrs.insert(I);
3526 else
3527 PossibleNonScalarPtrs.insert(I);
3528 };
3529
3530 // We seed the scalars analysis with two classes of instructions: (1)
3531 // instructions marked uniform-after-vectorization and (2) bitcast,
3532 // getelementptr and (pointer) phi instructions used by memory accesses
3533 // requiring a scalar use.
3534 //
3535 // (1) Add to the worklist all instructions that have been identified as
3536 // uniform-after-vectorization.
3537 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3538
3539 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3540 // memory accesses requiring a scalar use. The pointer operands of loads and
3541 // stores will be scalar as long as the memory access is not a gather or
3542 // scatter operation. The value operand of a store will remain scalar if the
3543 // store is scalarized.
3544 for (auto *BB : TheLoop->blocks())
3545 for (auto &I : *BB) {
3546 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3547 evaluatePtrUse(Load, Load->getPointerOperand());
3548 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3549 evaluatePtrUse(Store, Store->getPointerOperand());
3550 evaluatePtrUse(Store, Store->getValueOperand());
3551 }
3552 }
3553 for (auto *I : ScalarPtrs)
3554 if (!PossibleNonScalarPtrs.count(I)) {
3555 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3556 Worklist.insert(I);
3557 }
3558
3559 // Insert the forced scalars.
3560 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3561 // induction variable when the PHI user is scalarized.
3562 auto ForcedScalar = ForcedScalars.find(VF);
3563 if (ForcedScalar != ForcedScalars.end())
3564 for (auto *I : ForcedScalar->second) {
3565 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3566 Worklist.insert(I);
3567 }
3568
3569 // Expand the worklist by looking through any bitcasts and getelementptr
3570 // instructions we've already identified as scalar. This is similar to the
3571 // expansion step in collectLoopUniforms(); however, here we're only
3572 // expanding to include additional bitcasts and getelementptr instructions.
3573 unsigned Idx = 0;
3574 while (Idx != Worklist.size()) {
3575 Instruction *Dst = Worklist[Idx++];
3576 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3577 continue;
3578 auto *Src = cast<Instruction>(Dst->getOperand(0));
3579 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3580 auto *J = cast<Instruction>(U);
3581 return !TheLoop->contains(J) || Worklist.count(J) ||
3582 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3583 isScalarUse(J, Src));
3584 })) {
3585 Worklist.insert(Src);
3586 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3587 }
3588 }
3589
3590 // An induction variable will remain scalar if all users of the induction
3591 // variable and induction variable update remain scalar.
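// For illustration, a minimal sketch (hypothetical loop): in
//   for (int i = 0; i < n; ++i)
//     p[i] = 0;
// the IV 'i' feeds only the address computation of a consecutive access (a
// scalar use of the pointer) and its own increment, so, assuming that address
// computation is itself identified as scalar above, both the IV and its
// update may remain scalar.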
3592 for (const auto &Induction : Legal->getInductionVars()) {
3593 auto *Ind = Induction.first;
3594 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3595
3596 // If tail-folding is applied, the primary induction variable will be used
3597 // to feed a vector compare.
3598 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3599 continue;
3600
3601 // Returns true if \p Indvar is a pointer induction that is used directly by
3602 // load/store instruction \p I.
3603 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3604 Instruction *I) {
3605 return Induction.second.getKind() ==
3607 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3608 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3609 };
3610
3611 // Determine if all users of the induction variable are scalar after
3612 // vectorization.
3613 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3614 auto *I = cast<Instruction>(U);
3615 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3616 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3617 });
3618 if (!ScalarInd)
3619 continue;
3620
3621 // If the induction variable update is a fixed-order recurrence, neither the
3622 // induction variable nor its update should be marked scalar after
3623 // vectorization.
3624 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3625 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3626 continue;
3627
3628 // Determine if all users of the induction variable update instruction are
3629 // scalar after vectorization.
3630 auto ScalarIndUpdate =
3631 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3632 auto *I = cast<Instruction>(U);
3633 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3634 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3635 });
3636 if (!ScalarIndUpdate)
3637 continue;
3638
3639 // The induction variable and its update instruction will remain scalar.
3640 Worklist.insert(Ind);
3641 Worklist.insert(IndUpdate);
3642 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3643 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3644 << "\n");
3645 }
3646
3647 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3648}
3649
3651 Instruction *I, ElementCount VF) const {
3652 if (!isPredicatedInst(I))
3653 return false;
3654
3655 // Do we have a non-scalar lowering for this predicated
3656 // instruction? No - it is scalar with predication.
3657 switch(I->getOpcode()) {
3658 default:
3659 return true;
3660 case Instruction::Call:
3661 if (VF.isScalar())
3662 return true;
3663 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3664 .Kind == CM_Scalarize;
3665 case Instruction::Load:
3666 case Instruction::Store: {
3668 auto *Ty = getLoadStoreType(I);
3669 Type *VTy = Ty;
3670 if (VF.isVector())
3671 VTy = VectorType::get(Ty, VF);
3672 const Align Alignment = getLoadStoreAlignment(I);
3673 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3674 TTI.isLegalMaskedGather(VTy, Alignment))
3675 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3676 TTI.isLegalMaskedScatter(VTy, Alignment));
3677 }
3678 case Instruction::UDiv:
3679 case Instruction::SDiv:
3680 case Instruction::SRem:
3681 case Instruction::URem: {
3682 // We have the option to use the safe-divisor idiom to avoid predication.
3683 // The cost based decision here will always select safe-divisor for
3684 // scalable vectors as scalarization isn't legal.
3685 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3686 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3687 }
3688 }
3689}
3690
3692 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3693 return false;
3694
3695 // Can we prove this instruction is safe to unconditionally execute?
3696 // If not, we must use some form of predication.
3697 switch(I->getOpcode()) {
3698 default:
3699 return false;
3700 case Instruction::Load:
3701 case Instruction::Store: {
3702 if (!Legal->isMaskRequired(I))
3703 return false;
3704 // When we know the load's address is loop invariant and the instruction
3705 // in the original scalar loop was unconditionally executed then we
3706 // don't need to mark it as a predicated instruction. Tail folding may
3707 // introduce additional predication, but we're guaranteed to always have
3708 // at least one active lane. We call Legal->blockNeedsPredication here
3709 // because it doesn't query tail-folding. For stores, we need to prove
3710 // both speculation safety (which follows from the same argument as loads)
3711 // and that the value being stored is correct. The easiest form of the
3712 // latter is to require that all values stored are the same.
3714 (isa<LoadInst>(I) ||
3715 (isa<StoreInst>(I) &&
3716 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3717 !Legal->blockNeedsPredication(I->getParent()))
3718 return false;
3719 return true;
3720 }
3721 case Instruction::UDiv:
3722 case Instruction::SDiv:
3723 case Instruction::SRem:
3724 case Instruction::URem:
3725 // TODO: We can use the loop-preheader as a context point here and get
3726 // context-sensitive reasoning.
3728 case Instruction::Call:
3729 return Legal->isMaskRequired(I);
3730 }
3731}
3732
3733std::pair<InstructionCost, InstructionCost>
3735 ElementCount VF) const {
3736 assert(I->getOpcode() == Instruction::UDiv ||
3737 I->getOpcode() == Instruction::SDiv ||
3738 I->getOpcode() == Instruction::SRem ||
3739 I->getOpcode() == Instruction::URem);
3741
3743
3744 // Scalarization isn't legal for scalable vector types
3745 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3746 if (!VF.isScalable()) {
3747 // Get the scalarization cost and scale this amount by the probability of
3748 // executing the predicated block. If the instruction is not predicated,
3749 // we fall through to the next case.
3750 ScalarizationCost = 0;
3751
3752 // These instructions have a non-void type, so account for the phi nodes
3753 // that we will create. This cost is likely to be zero. The phi node
3754 // cost, if any, should be scaled by the block probability because it
3755 // models a copy at the end of each predicated block.
3756 ScalarizationCost += VF.getKnownMinValue() *
3757 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3758
3759 // The cost of the non-predicated instruction.
3760 ScalarizationCost += VF.getKnownMinValue() *
3761 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3762
3763 // The cost of insertelement and extractelement instructions needed for
3764 // scalarization.
3765 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3766
3767 // Scale the cost by the probability of executing the predicated blocks.
3768 // This assumes the predicated block for each vector lane is equally
3769 // likely.
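// As a worked sketch (numbers assumed for illustration): if the raw
// scalarization cost for VF = 4 comes to 20 and the reciprocal block
// probability is 2 (i.e. the predicated block runs about half the time), the
// scaled cost charged here is 20 / 2 = 10.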
3770 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3771 }
3772 InstructionCost SafeDivisorCost = 0;
3773
3774 auto *VecTy = ToVectorTy(I->getType(), VF);
3775
3776 // The cost of the select guard to ensure all lanes are well defined
3777 // after we speculate above any internal control flow.
3778 SafeDivisorCost += TTI.getCmpSelInstrCost(
3779 Instruction::Select, VecTy,
3780 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
3782
3783 // Certain instructions can be cheaper to vectorize if they have a constant
3784 // second vector operand. One example of this is shifts on x86.
3785 Value *Op2 = I->getOperand(1);
3786 auto Op2Info = TTI.getOperandInfo(Op2);
3787 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3788 Legal->isInvariant(Op2))
3790
3791 SmallVector<const Value *, 4> Operands(I->operand_values());
3792 SafeDivisorCost += TTI.getArithmeticInstrCost(
3793 I->getOpcode(), VecTy, CostKind,
3794 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3795 Op2Info, Operands, I);
3796 return {ScalarizationCost, SafeDivisorCost};
3797}
3798
3800 Instruction *I, ElementCount VF) const {
3801 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3803 "Decision should not be set yet.");
3804 auto *Group = getInterleavedAccessGroup(I);
3805 assert(Group && "Must have a group.");
3806
3807 // If the instruction's allocated size doesn't equal its type size, it
3808 // requires padding and will be scalarized.
3809 auto &DL = I->getDataLayout();
3810 auto *ScalarTy = getLoadStoreType(I);
3811 if (hasIrregularType(ScalarTy, DL))
3812 return false;
3813
3814 // If the group involves a non-integral pointer, we may not be able to
3815 // losslessly cast all values to a common type.
3816 unsigned InterleaveFactor = Group->getFactor();
3817 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3818 for (unsigned i = 0; i < InterleaveFactor; i++) {
3819 Instruction *Member = Group->getMember(i);
3820 if (!Member)
3821 continue;
3822 auto *MemberTy = getLoadStoreType(Member);
3823 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3824 // Don't coerce non-integral pointers to integers or vice versa.
3825 if (MemberNI != ScalarNI) {
3826 // TODO: Consider adding special nullptr value case here
3827 return false;
3828 } else if (MemberNI && ScalarNI &&
3829 ScalarTy->getPointerAddressSpace() !=
3830 MemberTy->getPointerAddressSpace()) {
3831 return false;
3832 }
3833 }
3834
3835 // Check if masking is required.
3836 // A Group may need masking for one of two reasons: it resides in a block that
3837 // needs predication, or it was decided to use masking to deal with gaps
3838 // (either a gap at the end of a load-access that may result in a speculative
3839 // load, or any gaps in a store-access).
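// For illustration, a minimal sketch (hypothetical access pattern): a load
// group reading A[3*i] and A[3*i + 1] but not A[3*i + 2] has a gap, so its
// last wide load may read past the final accessed element unless it is masked
// or a scalar epilogue remains; a store group with such a gap must be masked
// so the missing member is never written.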
3840 bool PredicatedAccessRequiresMasking =
3841 blockNeedsPredicationForAnyReason(I->getParent()) &&
3843 bool LoadAccessWithGapsRequiresEpilogMasking =
3844 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3846 bool StoreAccessWithGapsRequiresMasking =
3847 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3848 if (!PredicatedAccessRequiresMasking &&
3849 !LoadAccessWithGapsRequiresEpilogMasking &&
3850 !StoreAccessWithGapsRequiresMasking)
3851 return true;
3852
3853 // If masked interleaving is required, we expect that the user/target has
3854 // enabled it, because otherwise it either wouldn't have been created or
3855 // it should have been invalidated by the CostModel.
3857 "Masked interleave-groups for predicated accesses are not enabled.");
3858
3859 if (Group->isReverse())
3860 return false;
3861
3862 auto *Ty = getLoadStoreType(I);
3863 const Align Alignment = getLoadStoreAlignment(I);
3864 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3865 : TTI.isLegalMaskedStore(Ty, Alignment);
3866}
3867
3869 Instruction *I, ElementCount VF) {
3870 // Get and ensure we have a valid memory instruction.
3871 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3872
3874 auto *ScalarTy = getLoadStoreType(I);
3875
3876 // In order to be widened, the pointer should be consecutive, first of all.
3877 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3878 return false;
3879
3880 // If the instruction is a store located in a predicated block, it will be
3881 // scalarized.
3882 if (isScalarWithPredication(I, VF))
3883 return false;
3884
3885 // If the instruction's allocated size doesn't equal its type size, it
3886 // requires padding and will be scalarized.
3887 auto &DL = I->getDataLayout();
3888 if (hasIrregularType(ScalarTy, DL))
3889 return false;
3890
3891 return true;
3892}
3893
3894void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3895 // We should not collect Uniforms more than once per VF. Right now,
3896 // this function is called from collectUniformsAndScalars(), which
3897 // already does this check. Collecting Uniforms for VF=1 does not make any
3898 // sense.
3899
3900 assert(VF.isVector() && !Uniforms.contains(VF) &&
3901 "This function should not be visited twice for the same VF");
3902
3903 // Visit the list of Uniforms. If we don't find any uniform value, we won't
3904 // analyze it again; Uniforms.count(VF) will return 1.
3905 Uniforms[VF].clear();
3906
3907 // We now know that the loop is vectorizable!
3908 // Collect instructions inside the loop that will remain uniform after
3909 // vectorization.
3910
3911 // Global values, params and instructions outside of the current loop are out
3912 // of scope.
3913 auto isOutOfScope = [&](Value *V) -> bool {
3914 Instruction *I = dyn_cast<Instruction>(V);
3915 return (!I || !TheLoop->contains(I));
3916 };
3917
3918 // Worklist containing uniform instructions demanding lane 0.
3919 SetVector<Instruction *> Worklist;
3920
3921 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3922 // that require predication must not be considered uniform after
3923 // vectorization, because that would create an erroneous replicating region
3924 // where only a single instance out of VF should be formed.
3925 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3926 if (isOutOfScope(I)) {
3927 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3928 << *I << "\n");
3929 return;
3930 }
3931 if (isPredicatedInst(I)) {
3932 LLVM_DEBUG(
3933 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3934 << "\n");
3935 return;
3936 }
3937 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3938 Worklist.insert(I);
3939 };
3940
3941 // Start with the conditional branches exiting the loop. If the branch
3942 // condition is an instruction contained in the loop that is only used by the
3943 // branch, it is uniform.
3945 TheLoop->getExitingBlocks(Exiting);
3946 for (BasicBlock *E : Exiting) {
3947 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3948 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3949 addToWorklistIfAllowed(Cmp);
3950 }
3951
3952 auto PrevVF = VF.divideCoefficientBy(2);
3953 // Return true if all lanes perform the same memory operation, and we can
3954 // thus choose to execute only one.
3955 auto isUniformMemOpUse = [&](Instruction *I) {
3956 // If the value was already known to not be uniform for the previous
3957 // (smaller VF), it cannot be uniform for the larger VF.
3958 if (PrevVF.isVector()) {
3959 auto Iter = Uniforms.find(PrevVF);
3960 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3961 return false;
3962 }
3963 if (!Legal->isUniformMemOp(*I, VF))
3964 return false;
3965 if (isa<LoadInst>(I))
3966 // Loading the same address always produces the same result - at least
3967 // assuming aliasing and ordering which have already been checked.
3968 return true;
3969 // Storing the same value on every iteration.
3970 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3971 };
3972
3973 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3974 InstWidening WideningDecision = getWideningDecision(I, VF);
3975 assert(WideningDecision != CM_Unknown &&
3976 "Widening decision should be ready at this moment");
3977
3978 if (isUniformMemOpUse(I))
3979 return true;
3980
3981 return (WideningDecision == CM_Widen ||
3982 WideningDecision == CM_Widen_Reverse ||
3983 WideningDecision == CM_Interleave);
3984 };
3985
3986 // Returns true if Ptr is the pointer operand of a memory access instruction
3987 // I, I is known to not require scalarization, and the pointer is not also
3988 // stored.
3989 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3990 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3991 return false;
3992 return getLoadStorePointerOperand(I) == Ptr &&
3993 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3994 };
3995
3996 // Holds a list of values which are known to have at least one uniform use.
3997 // Note that there may be other uses which aren't uniform. A "uniform use"
3998 // here is something which only demands lane 0 of the unrolled iterations;
3999 // it does not imply that all lanes produce the same value (i.e., this is not
4000 // the usual meaning of uniform).
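// For illustration, a minimal sketch (hypothetical loop): in
//   for (int i = 0; i < n; ++i)
//     sum += *p;    // 'p' is loop invariant
// the pointer 'p' has a uniform use: only lane 0 of the unrolled iterations
// demands its value.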
4001 SetVector<Value *> HasUniformUse;
4002
4003 // Scan the loop for instructions which are either a) known to have only
4004 // lane 0 demanded or b) uses which demand only lane 0 of their operand.
4005 for (auto *BB : TheLoop->blocks())
4006 for (auto &I : *BB) {
4007 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4008 switch (II->getIntrinsicID()) {
4009 case Intrinsic::sideeffect:
4010 case Intrinsic::experimental_noalias_scope_decl:
4011 case Intrinsic::assume:
4012 case Intrinsic::lifetime_start:
4013 case Intrinsic::lifetime_end:
4015 addToWorklistIfAllowed(&I);
4016 break;
4017 default:
4018 break;
4019 }
4020 }
4021
4022 // ExtractValue instructions must be uniform, because the operands are
4023 // known to be loop-invariant.
4024 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4025 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4026 "Expected aggregate value to be loop invariant");
4027 addToWorklistIfAllowed(EVI);
4028 continue;
4029 }
4030
4031 // If there's no pointer operand, there's nothing to do.
4033 if (!Ptr)
4034 continue;
4035
4036 if (isUniformMemOpUse(&I))
4037 addToWorklistIfAllowed(&I);
4038
4039 if (isVectorizedMemAccessUse(&I, Ptr))
4040 HasUniformUse.insert(Ptr);
4041 }
4042
4043 // Add to the worklist any operands which have *only* uniform (i.e., lane 0
4044 // demanding) users. Since loops are assumed to be in LCSSA form, this
4045 // disallows uses outside the loop as well.
4046 for (auto *V : HasUniformUse) {
4047 if (isOutOfScope(V))
4048 continue;
4049 auto *I = cast<Instruction>(V);
4050 auto UsersAreMemAccesses =
4051 llvm::all_of(I->users(), [&](User *U) -> bool {
4052 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4053 });
4054 if (UsersAreMemAccesses)
4055 addToWorklistIfAllowed(I);
4056 }
4057
4058 // Expand Worklist in topological order: whenever a new instruction
4059 // is added, its users should already be inside Worklist. This ensures
4060 // a uniform instruction will only be used by uniform instructions.
4061 unsigned idx = 0;
4062 while (idx != Worklist.size()) {
4063 Instruction *I = Worklist[idx++];
4064
4065 for (auto *OV : I->operand_values()) {
4066 // isOutOfScope operands cannot be uniform instructions.
4067 if (isOutOfScope(OV))
4068 continue;
4069 // First-order recurrence phis should typically be considered
4070 // non-uniform.
4071 auto *OP = dyn_cast<PHINode>(OV);
4073 continue;
4074 // If all the users of the operand are uniform, then add the
4075 // operand into the uniform worklist.
4076 auto *OI = cast<Instruction>(OV);
4077 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4078 auto *J = cast<Instruction>(U);
4079 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4080 }))
4081 addToWorklistIfAllowed(OI);
4082 }
4083 }
4084
4085 // For an instruction to be added into Worklist above, all its users inside
4086 // the loop should also be in Worklist. However, this condition cannot be
4087 // true for phi nodes that form a cyclic dependence. We must process phi
4088 // nodes separately. An induction variable will remain uniform if all users
4089 // of the induction variable and induction variable update remain uniform.
4090 // The code below handles both pointer and non-pointer induction variables.
4091 BasicBlock *Latch = TheLoop->getLoopLatch();
4092 for (const auto &Induction : Legal->getInductionVars()) {
4093 auto *Ind = Induction.first;
4094 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4095
4096 // Determine if all users of the induction variable are uniform after
4097 // vectorization.
4098 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4099 auto *I = cast<Instruction>(U);
4100 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4101 isVectorizedMemAccessUse(I, Ind);
4102 });
4103 if (!UniformInd)
4104 continue;
4105
4106 // Determine if all users of the induction variable update instruction are
4107 // uniform after vectorization.
4108 auto UniformIndUpdate =
4109 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4110 auto *I = cast<Instruction>(U);
4111 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4112 isVectorizedMemAccessUse(I, IndUpdate);
4113 });
4114 if (!UniformIndUpdate)
4115 continue;
4116
4117 // The induction variable and its update instruction will remain uniform.
4118 addToWorklistIfAllowed(Ind);
4119 addToWorklistIfAllowed(IndUpdate);
4120 }
4121
4122 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4123}
4124
4126 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4127
4129 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4130 "runtime pointer checks needed. Enable vectorization of this "
4131 "loop with '#pragma clang loop vectorize(enable)' when "
4132 "compiling with -Os/-Oz",
4133 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4134 return true;
4135 }
4136
4137 if (!PSE.getPredicate().isAlwaysTrue()) {
4138 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4139 "runtime SCEV checks needed. Enable vectorization of this "
4140 "loop with '#pragma clang loop vectorize(enable)' when "
4141 "compiling with -Os/-Oz",
4142 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4143 return true;
4144 }
4145
4146 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4147 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4148 reportVectorizationFailure("Runtime stride check for small trip count",
4149 "runtime stride == 1 checks needed. Enable vectorization of "
4150 "this loop without such check by compiling with -Os/-Oz",
4151 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4152 return true;
4153 }
4154
4155 return false;
4156}
4157
4158bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
4159 if (IsScalableVectorizationAllowed)
4160 return *IsScalableVectorizationAllowed;
4161
4162 IsScalableVectorizationAllowed = false;
4164 return false;
4165
4167 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4168 "ScalableVectorizationDisabled", ORE, TheLoop);
4169 return false;
4170 }
4171
4172 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4173
4174 auto MaxScalableVF = ElementCount::getScalable(
4175 std::numeric_limits<ElementCount::ScalarTy>::max());
4176
4177 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4178 // FIXME: While for scalable vectors this is currently sufficient, this should
4179 // be replaced by a more detailed mechanism that filters out specific VFs,
4180 // instead of invalidating vectorization for a whole set of VFs based on the
4181 // MaxVF.
4182
4183 // Disable scalable vectorization if the loop contains unsupported reductions.
4184 if (!canVectorizeReductions(MaxScalableVF)) {
4186 "Scalable vectorization not supported for the reduction "
4187 "operations found in this loop.",
4188 "ScalableVFUnfeasible", ORE, TheLoop);
4189 return false;
4190 }
4191
4192 // Disable scalable vectorization if the loop contains any instructions
4193 // with element types not supported for scalable vectors.
4194 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4195 return !Ty->isVoidTy() &&
4197 })) {
4198 reportVectorizationInfo("Scalable vectorization is not supported "
4199 "for all element types found in this loop.",
4200 "ScalableVFUnfeasible", ORE, TheLoop);
4201 return false;
4202 }
4203
4205 reportVectorizationInfo("The target does not provide maximum vscale value "
4206 "for safe distance analysis.",
4207 "ScalableVFUnfeasible", ORE, TheLoop);
4208 return false;
4209 }
4210
4211 IsScalableVectorizationAllowed = true;
4212 return true;
4213}
4214
4216LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4217 if (!isScalableVectorizationAllowed())
4218 return ElementCount::getScalable(0);
4219
4220 auto MaxScalableVF = ElementCount::getScalable(
4221 std::numeric_limits<ElementCount::ScalarTy>::max());
4223 return MaxScalableVF;
4224
4225 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4226 // Limit MaxScalableVF by the maximum safe dependence distance.
4227 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4228
4229 if (!MaxScalableVF)
4231 "Max legal vector width too small, scalable vectorization "
4232 "unfeasible.",
4233 "ScalableVFUnfeasible", ORE, TheLoop);
4234
4235 return MaxScalableVF;
4236}
4237
4238FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4239 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4241 unsigned SmallestType, WidestType;
4242 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4243
4244 // Get the maximum safe dependence distance in bits computed by LAA.
4245 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4246 // the memory accesses that is most restrictive (involved in the smallest
4247 // dependence distance).
4248 unsigned MaxSafeElements =
4250
4251 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4252 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4253
4254 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4255 << ".\n");
4256 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4257 << ".\n");
4258
4259 // First analyze the UserVF, fall back if the UserVF should be ignored.
4260 if (UserVF) {
4261 auto MaxSafeUserVF =
4262 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4263
4264 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4265 // If `VF=vscale x N` is safe, then so is `VF=N`
4266 if (UserVF.isScalable())
4267 return FixedScalableVFPair(
4268 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4269 else
4270 return UserVF;
4271 }
4272
4273 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4274
4275 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4276 // is better to ignore the hint and let the compiler choose a suitable VF.
4277 if (!UserVF.isScalable()) {
4278 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4279 << " is unsafe, clamping to max safe VF="
4280 << MaxSafeFixedVF << ".\n");
4281 ORE->emit([&]() {
4282 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4284 TheLoop->getHeader())
4285 << "User-specified vectorization factor "
4286 << ore::NV("UserVectorizationFactor", UserVF)
4287 << " is unsafe, clamping to maximum safe vectorization factor "
4288 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4289 });
4290 return MaxSafeFixedVF;
4291 }
4292
4294 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4295 << " is ignored because scalable vectors are not "
4296 "available.\n");
4297 ORE->emit([&]() {
4298 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4300 TheLoop->getHeader())
4301 << "User-specified vectorization factor "
4302 << ore::NV("UserVectorizationFactor", UserVF)
4303 << " is ignored because the target does not support scalable "
4304 "vectors. The compiler will pick a more suitable value.";
4305 });
4306 } else {
4307 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4308 << " is unsafe. Ignoring scalable UserVF.\n");
4309 ORE->emit([&]() {
4310 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4312 TheLoop->getHeader())
4313 << "User-specified vectorization factor "
4314 << ore::NV("UserVectorizationFactor", UserVF)
4315 << " is unsafe. Ignoring the hint to let the compiler pick a "
4316 "more suitable value.";
4317 });
4318 }
4319 }
4320
4321 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4322 << " / " << WidestType << " bits.\n");
4323
4326 if (auto MaxVF =
4327 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4328 MaxSafeFixedVF, FoldTailByMasking))
4329 Result.FixedVF = MaxVF;
4330
4331 if (auto MaxVF =
4332 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4333 MaxSafeScalableVF, FoldTailByMasking))
4334 if (MaxVF.isScalable()) {
4335 Result.ScalableVF = MaxVF;
4336 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4337 << "\n");
4338 }
4339
4340 return Result;
4341}
4342
4346 // TODO: It may be useful to do this, since it's still likely to be
4347 // dynamically uniform if the target can skip.
4349 "Not inserting runtime ptr check for divergent target",
4350 "runtime pointer checks needed. Not enabled for divergent target",
4351 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4353 }
4354
4355 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4356 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4357 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4358 if (TC == 1) {
4359 reportVectorizationFailure("Single iteration (non) loop",
4360 "loop trip count is one, irrelevant for vectorization",
4361 "SingleIterationLoop", ORE, TheLoop);
4363 }
4364
4365 switch (ScalarEpilogueStatus) {
4367 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4369 [[fallthrough]];
4371 LLVM_DEBUG(
4372 dbgs() << "LV: vector predicate hint/switch found.\n"
4373 << "LV: Not allowing scalar epilogue, creating predicated "
4374 << "vector loop.\n");
4375 break;
4377 // fallthrough as a special case of OptForSize
4379 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4380 LLVM_DEBUG(
4381 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4382 else
4383 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4384 << "count.\n");
4385
4386 // Bail if runtime checks are required, which are not good when optimising
4387 // for size.
4390
4391 break;
4392 }
4393
4394 // The only loops we can vectorize without a scalar epilogue are loops with
4395 // a bottom-test and a single exiting block. We'd have to handle the fact
4396 // that not every instruction executes on the last iteration. This will
4397 // require a lane mask which varies through the vector loop body. (TODO)
4399 // If there was a tail-folding hint/switch, but we can't fold the tail by
4400 // masking, fall back to vectorization with a scalar epilogue.
4401 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4402 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4403 "scalar epilogue instead.\n");
4404 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4405 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4406 }
4408 }
4409
4410 // Now try tail folding.
4411
4412 // Invalidate interleave groups that require an epilogue if we can't mask
4413 // the interleave-group.
4415 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4416 "No decisions should have been taken at this point");
4417 // Note: There is no need to invalidate any cost modeling decisions here, as
4418 // none were taken so far.
4420 }
4421
4422 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4423
4424 // Avoid tail folding if the trip count is known to be a multiple of any VF
4425 // we choose.
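// As a worked sketch (numbers assumed for illustration): a trip count known
// to be 64 with a maximum VF-times-IC of 8 leaves a remainder of 64 % 8 == 0,
// so no tail remains and folding is unnecessary.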
4426 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4427 MaxFactors.FixedVF.getFixedValue();
4428 if (MaxFactors.ScalableVF) {
4429 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4430 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4431 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4432 *MaxPowerOf2RuntimeVF,
4433 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4434 } else
4435 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4436 }
4437
4438 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4439 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4440 "MaxFixedVF must be a power of 2");
4441 unsigned MaxVFtimesIC =
4442 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4443 ScalarEvolution *SE = PSE.getSE();
4444 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4445 const SCEV *ExitCount = SE->getAddExpr(
4446 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4447 const SCEV *Rem = SE->getURemExpr(
4448 SE->applyLoopGuards(ExitCount, TheLoop),
4449 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4450 if (Rem->isZero()) {
4451 // Accept MaxFixedVF if we do not have a tail.
4452 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4453 return MaxFactors;
4454 }
4455 }
4456
4457 // If we don't know the precise trip count, or if the trip count that we
4458 // found modulo the vectorization factor is not zero, try to fold the tail
4459 // by masking.
4460 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4461 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4462 if (foldTailByMasking()) {
4464 LLVM_DEBUG(
4465 dbgs()
4466 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4467 "try to generate VP Intrinsics with scalable vector "
4468 "factors only.\n");
4469 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4470 // for now.
4471 // TODO: extend it for fixed vectors, if required.
4472 assert(MaxFactors.ScalableVF.isScalable() &&
4473 "Expected scalable vector factor.");
4474
4475 MaxFactors.FixedVF = ElementCount::getFixed(1);
4476 }
4477 return MaxFactors;
4478 }
4479
4480 // If there was a tail-folding hint/switch, but we can't fold the tail by
4481 // masking, fall back to vectorization with a scalar epilogue.
4482 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4483 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4484 "scalar epilogue instead.\n");
4485 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4486 return MaxFactors;
4487 }
4488
4489 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4490 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4492 }
4493
4494 if (TC == 0) {
4496 "Unable to calculate the loop count due to complex control flow",
4497 "unable to calculate the loop count due to complex control flow",
4498 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4500 }
4501
4503 "Cannot optimize for size and vectorize at the same time.",
4504 "cannot optimize for size and vectorize at the same time. "
4505 "Enable vectorization of this loop with '#pragma clang loop "
4506 "vectorize(enable)' when compiling with -Os/-Oz",
4507 "NoTailLoopWithOptForSize", ORE, TheLoop);
4509}
4510
4511ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4512 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4513 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4514 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4515 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4516 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4518
4519 // Convenience function to return the minimum of two ElementCounts.
4520 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4521 assert((LHS.isScalable() == RHS.isScalable()) &&
4522 "Scalable flags must match");
4523 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4524 };
4525
4526 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4527 // Note that both WidestRegister and WidestType may not be powers of 2.
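// As a worked sketch (numbers assumed for illustration): a 128-bit widest
// register and a widest element type of 32 bits give bit_floor(128 / 32) = 4
// lanes as the starting MaxVF.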
4528 auto MaxVectorElementCount = ElementCount::get(
4529 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4530 ComputeScalableMaxVF);
4531 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4532 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4533 << (MaxVectorElementCount * WidestType) << " bits.\n");
4534
4535 if (!MaxVectorElementCount) {
4536 LLVM_DEBUG(dbgs() << "LV: The target has no "
4537 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4538 << " vector registers.\n");
4539 return ElementCount::getFixed(1);
4540 }
4541
4542 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4543 if (MaxVectorElementCount.isScalable() &&
4544 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4545 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4546 auto Min = Attr.getVScaleRangeMin();
4547 WidestRegisterMinEC *= Min;
4548 }
4549
4550 // When a scalar epilogue is required, at least one iteration of the scalar
4551 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4552 // max VF that results in a dead vector loop.
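// As a worked sketch (numbers assumed for illustration): with a MaxTripCount
// of 8 and a required scalar epilogue, at most 7 iterations are available to
// the vector loop, so the adjustment below prevents picking VF = 8, which
// would leave the vector loop dead.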
4553 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4554 MaxTripCount -= 1;
4555
4556 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4557 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4558 // If the upper-bound loop trip count (TC) is known at compile time, there is
4559 // no point in choosing a VF greater than TC (as done in the loop below).
4560 // Select the maximum power of two which doesn't exceed TC. If
4561 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when the
4562 // TC is less than or equal to the known number of lanes.
4563 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4564 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4565 "exceeding the constant trip count: "
4566 << ClampedUpperTripCount << "\n");
4567 return ElementCount::get(
4568 ClampedUpperTripCount,
4569 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4570 }
4571
4573 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4575 ElementCount MaxVF = MaxVectorElementCount;
4576 if (MaximizeBandwidth ||
4577 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4580 auto MaxVectorElementCountMaxBW = ElementCount::get(
4581 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4582 ComputeScalableMaxVF);
4583 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4584
4585 // Collect all viable vectorization factors larger than the default MaxVF
4586 // (i.e. MaxVectorElementCount).
4588 for (ElementCount VS = MaxVectorElementCount * 2;
4589 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4590 VFs.push_back(VS);
4591
4592 // For each VF calculate its register usage.
4593 auto RUs = calculateRegisterUsage(VFs);
4594
4595 // Select the largest VF which doesn't require more registers than existing
4596 // ones.
4597 for (int I = RUs.size() - 1; I >= 0; --I) {
4598 const auto &MLU = RUs[I].MaxLocalUsers;
4599 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4600 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4601 })) {
4602 MaxVF = VFs[I];
4603 break;
4604 }
4605 }
4606 if (ElementCount MinVF =
4607 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4608 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4609 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4610 << ") with target's minimum: " << MinVF << '\n');
4611 MaxVF = MinVF;
4612 }
4613 }
4614
4615 // Invalidate any widening decisions we might have made, in case the loop
4616 // requires predication (decided later), but we have already made some
4617 // load/store widening decisions.
4618 invalidateCostModelingDecisions();
4619 }
4620 return MaxVF;
4621}
4622
4623/// Convenience function that returns the value of vscale_range iff
4624/// vscale_range.min == vscale_range.max or otherwise returns the value
4625/// returned by the corresponding TTI method.
4626 static std::optional<unsigned>
4627 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4628 const Function *Fn = L->getHeader()->getParent();
4629 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4630 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4631 auto Min = Attr.getVScaleRangeMin();
4632 auto Max = Attr.getVScaleRangeMax();
4633 if (Max && Min == Max)
4634 return Max;
4635 }
4636
4637 return TTI.getVScaleForTuning();
4638}
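// Worked example (illustrative attributes): a function annotated with
// vscale_range(2,2) pins vscale, so this helper returns 2; with
// vscale_range(1,16) the bounds differ and the target's TTI tuning value is
// returned instead.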
4639
4640bool LoopVectorizationPlanner::isMoreProfitable(
4641 const VectorizationFactor &A, const VectorizationFactor &B) const {
4642 InstructionCost CostA = A.Cost;
4643 InstructionCost CostB = B.Cost;
4644
4645 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4646
4647 // Improve estimate for the vector width if it is scalable.
4648 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4649 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4650 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4651 if (A.Width.isScalable())
4652 EstimatedWidthA *= *VScale;
4653 if (B.Width.isScalable())
4654 EstimatedWidthB *= *VScale;
4655 }
4656
4657 // Assume vscale may be larger than 1 (or the value being tuned for),
4658 // so that scalable vectorization is slightly favorable over fixed-width
4659 // vectorization.
4660 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4661 A.Width.isScalable() && !B.Width.isScalable();
4662
4663 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4664 const InstructionCost &RHS) {
4665 return PreferScalable ? LHS <= RHS : LHS < RHS;
4666 };
4667
4668 // To avoid the need for FP division:
4669 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4670 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4671 if (!MaxTripCount)
4672 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4673
4674 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4675 InstructionCost VectorCost,
4676 InstructionCost ScalarCost) {
4677 // If the trip count is a known (possibly small) constant, the trip count
4678 // will be rounded up to an integer number of iterations under
4679 // FoldTailByMasking. The total cost in that case will be
4680 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4681 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4682 // some extra overheads, but for the purpose of comparing the costs of
4683 // different VFs we can use this to compare the total loop-body cost
4684 // expected after vectorization.
4685 if (CM.foldTailByMasking())
4686 return VectorCost * divideCeil(MaxTripCount, VF);
4687 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4688 };
4689
4690 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4691 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4692 return CmpFn(RTCostA, RTCostB);
4693}
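// Worked example (illustrative costs only): comparing A = {Width=4, Cost=40}
// with B = {Width=2, Cost=24} and no known max trip count reduces to checking
// CostA*WidthB = 80 against CostB*WidthA = 96, so A wins without any FP
// division. With MaxTripCount = 10, no tail folding and ScalarCost = 10,
// GetCostForTC yields A: 40*(10/4) + 10*(10%4) = 100 and B: 24*5 + 0 = 120,
// so A is still preferred.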
4694
4695 void LoopVectorizationPlanner::emitInvalidCostRemarks(
4696 SmallVector<InstructionVFPair> InvalidCosts, OptimizationRemarkEmitter *ORE,
4697 Loop *TheLoop) {
4698 if (InvalidCosts.empty())
4699 return;
4700
4701 // Emit a report of VFs with invalid costs in the loop.
4702
4703 // Group the remarks per instruction, keeping the instruction order from
4704 // InvalidCosts.
4705 std::map<Instruction *, unsigned> Numbering;
4706 unsigned I = 0;
4707 for (auto &Pair : InvalidCosts)
4708 if (!Numbering.count(Pair.first))
4709 Numbering[Pair.first] = I++;
4710
4711 // Sort the list, first on instruction(number) then on VF.
4712 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4713 if (Numbering[A.first] != Numbering[B.first])
4714 return Numbering[A.first] < Numbering[B.first];
4715 const auto &LHS = A.second;
4716 const auto &RHS = B.second;
4717 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4718 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4719 });
4720
4721 // For a list of ordered instruction-vf pairs:
4722 // [(load, vf1), (load, vf2), (store, vf1)]
4723 // Group the instructions together to emit separate remarks for:
4724 // load (vf1, vf2)
4725 // store (vf1)
4726 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4727 auto Subset = ArrayRef<InstructionVFPair>();
4728 do {
4729 if (Subset.empty())
4730 Subset = Tail.take_front(1);
4731
4732 Instruction *I = Subset.front().first;
4733
4734 // If the next instruction is different, or if there are no other pairs,
4735 // emit a remark for the collated subset. e.g.
4736 // [(load, vf1), (load, vf2))]
4737 // to emit:
4738 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4739 if (Subset == Tail || Tail[Subset.size()].first != I) {
4740 std::string OutString;
4741 raw_string_ostream OS(OutString);
4742 assert(!Subset.empty() && "Unexpected empty range");
4743 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4744 for (const auto &Pair : Subset)
4745 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4746 OS << "):";
4747 if (auto *CI = dyn_cast<CallInst>(I))
4748 OS << " call to " << CI->getCalledFunction()->getName();
4749 else
4750 OS << " " << I->getOpcodeName();
4751 OS.flush();
4752 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
4753 Tail = Tail.drop_front(Subset.size());
4754 Subset = {};
4755 } else
4756 // Grow the subset by one element
4757 Subset = Tail.take_front(Subset.size() + 1);
4758 } while (!Tail.empty());
4759}
4760
4761/// Check if any recipe of \p Plan will generate a vector value, which will be
4762/// assigned a vector register.
4763 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4764 const TargetTransformInfo &TTI) {
4765 assert(VF.isVector() && "Checking a scalar VF?");
4766 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4768 DenseSet<VPRecipeBase *> EphemeralRecipes;
4769 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4770 // Set of already visited types.
4771 DenseSet<Type *> Visited;
4772 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4774 for (VPRecipeBase &R : *VPBB) {
4775 if (EphemeralRecipes.contains(&R))
4776 continue;
4777 // Continue early if the recipe is considered to not produce a vector
4778 // result. Note that this includes VPInstruction where some opcodes may
4779 // produce a vector, to preserve existing behavior as VPInstructions model
4780 // aspects not directly mapped to existing IR instructions.
4781 switch (R.getVPDefID()) {
4782 case VPDef::VPDerivedIVSC:
4783 case VPDef::VPScalarIVStepsSC:
4784 case VPDef::VPScalarCastSC:
4785 case VPDef::VPReplicateSC:
4786 case VPDef::VPInstructionSC:
4787 case VPDef::VPCanonicalIVPHISC:
4788 case VPDef::VPVectorPointerSC:
4789 case VPDef::VPExpandSCEVSC:
4790 case VPDef::VPEVLBasedIVPHISC:
4791 case VPDef::VPPredInstPHISC:
4792 case VPDef::VPBranchOnMaskSC:
4793 continue;
4794 case VPDef::VPReductionSC:
4795 case VPDef::VPActiveLaneMaskPHISC:
4796 case VPDef::VPWidenCallSC:
4797 case VPDef::VPWidenCanonicalIVSC:
4798 case VPDef::VPWidenCastSC:
4799 case VPDef::VPWidenGEPSC:
4800 case VPDef::VPWidenSC:
4801 case VPDef::VPWidenSelectSC:
4802 case VPDef::VPBlendSC:
4803 case VPDef::VPFirstOrderRecurrencePHISC:
4804 case VPDef::VPWidenPHISC:
4805 case VPDef::VPWidenIntOrFpInductionSC:
4806 case VPDef::VPWidenPointerInductionSC:
4807 case VPDef::VPReductionPHISC:
4808 case VPDef::VPInterleaveSC:
4809 case VPDef::VPWidenLoadEVLSC:
4810 case VPDef::VPWidenLoadSC:
4811 case VPDef::VPWidenStoreEVLSC:
4812 case VPDef::VPWidenStoreSC:
4813 break;
4814 default:
4815 llvm_unreachable("unhandled recipe");
4816 }
4817
4818 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4819 Type *VectorTy = ToVectorTy(ScalarTy, VF);
4820 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4821 if (!NumLegalParts)
4822 return false;
4823 if (VF.isScalable()) {
4824 // <vscale x 1 x iN> is assumed to be profitable over iN because
4825 // scalable registers are a distinct register class from scalar
4826 // ones. If we ever find a target which wants to lower scalable
4827 // vectors back to scalars, we'll need to update this code to
4828 // explicitly ask TTI about the register class uses for each part.
4829 return NumLegalParts <= VF.getKnownMinValue();
4830 }
4831 // Two or more elements sharing a register indicate genuine vectorization.
4832 return NumLegalParts < VF.getKnownMinValue();
4833 };
4834
4835 // If the recipe defines no value and is not a store (e.g., a branch), there is no value to check; continue.
4836 if (R.getNumDefinedValues() == 0 &&
4837 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4838 &R))
4839 continue;
4840 // For multi-def recipes (currently only interleaved loads), it suffices
4841 // to check the first defined value only.
4842 // For stores check their stored value; for interleaved stores it suffices
4843 // to check the first stored value only. In all cases this is the second
4844 // operand.
4845 VPValue *ToCheck =
4846 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4847 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4848 if (!Visited.insert({ScalarTy}).second)
4849 continue;
4850 if (WillWiden(ScalarTy))
4851 return true;
4852 }
4853 }
4854
4855 return false;
4856}
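// Worked example (illustrative target assumptions): for a fixed VF of 4 and
// i64 elements on a target with 128-bit vector registers, <4 x i64> legalizes
// to 2 parts and 2 < 4, so the value counts as genuinely widened; a type that
// legalized to 4 parts at VF=4 would not, and the recipe would be treated as
// effectively scalarized.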
4857
4858VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4859 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4860 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4861 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4862 assert(any_of(VPlans,
4863 [](std::unique_ptr<VPlan> &P) {
4864 return P->hasVF(ElementCount::getFixed(1));
4865 }) &&
4866 "Expected Scalar VF to be a candidate");
4867
4868 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4869 ExpectedCost);
4870 VectorizationFactor ChosenFactor = ScalarCost;
4871
4872 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4873 if (ForceVectorization &&
4874 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4875 // Ignore scalar width, because the user explicitly wants vectorization.
4876 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4877 // evaluation.
4878 ChosenFactor.Cost = InstructionCost::getMax();
4879 }
4880
4881 SmallVector<InstructionVFPair> InvalidCosts;
4882 for (auto &P : VPlans) {
4883 for (ElementCount VF : P->vectorFactors()) {
4884 // The cost for scalar VF=1 is already calculated, so ignore it.
4885 if (VF.isScalar())
4886 continue;
4887
4888 InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
4889 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4890
4891#ifndef NDEBUG
4892 unsigned AssumedMinimumVscale =
4893 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4894 unsigned Width =
4895 Candidate.Width.isScalable()
4896 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4897 : Candidate.Width.getFixedValue();
4898 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4899 << " costs: " << (Candidate.Cost / Width));
4900 if (VF.isScalable())
4901 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4902 << AssumedMinimumVscale << ")");
4903 LLVM_DEBUG(dbgs() << ".\n");
4904#endif
4905
4906 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4907 LLVM_DEBUG(
4908 dbgs()
4909 << "LV: Not considering vector loop of width " << VF
4910 << " because it will not generate any vector instructions.\n");
4911 continue;
4912 }
4913
4914 // If profitable add it to ProfitableVF list.
4915 if (isMoreProfitable(Candidate, ScalarCost))
4916 ProfitableVFs.push_back(Candidate);
4917
4918 if (isMoreProfitable(Candidate, ChosenFactor))
4919 ChosenFactor = Candidate;
4920 }
4921 }
4922
4923 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
4924
4927 "There are conditional stores.",
4928 "store that is conditionally executed prevents vectorization",
4929 "ConditionalStore", ORE, OrigLoop);
4930 ChosenFactor = ScalarCost;
4931 }
4932
4933 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4934 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4935 << "LV: Vectorization seems to be not beneficial, "
4936 << "but was forced by a user.\n");
4937 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4938 return ChosenFactor;
4939}
4940
4941bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4942 ElementCount VF) const {
4943 // Cross iteration phis such as reductions need special handling and are
4944 // currently unsupported.
4945 if (any_of(OrigLoop->getHeader()->phis(),
4946 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4947 return false;
4948
4949 // Phis with uses outside of the loop require special handling and are
4950 // currently unsupported.
4951 for (const auto &Entry : Legal->getInductionVars()) {
4952 // Look for uses of the value of the induction at the last iteration.
4953 Value *PostInc =
4954 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4955 for (User *U : PostInc->users())
4956 if (!OrigLoop->contains(cast<Instruction>(U)))
4957 return false;
4958 // Look for uses of the penultimate value of the induction.
4959 for (User *U : Entry.first->users())
4960 if (!OrigLoop->contains(cast<Instruction>(U)))
4961 return false;
4962 }
4963
4964 // Epilogue vectorization code has not been audited to ensure it handles
4965 // non-latch exits properly. It may be fine, but it needs to be audited and
4966 // tested.
4967 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4968 return false;
4969
4970 return true;
4971}
4972
4973 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4974 const ElementCount VF) const {
4975 // FIXME: We need a much better cost-model to take different parameters such
4976 // as register pressure, code size increase and cost of extra branches into
4977 // account. For now we apply a very crude heuristic and only consider loops
4978 // with vectorization factors larger than a certain value.
4979
4980 // Allow the target to opt out entirely.
4981 if (!TTI.preferEpilogueVectorization())
4982 return false;
4983
4984 // We also consider epilogue vectorization unprofitable for targets that don't
4985 // consider interleaving beneficial (e.g., MVE).
4986 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4987 return false;
4988
4989 unsigned Multiplier = 1;
4990 if (VF.isScalable())
4991 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
4992 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4993 return true;
4994 return false;
4995}
4996
4997 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4998 const ElementCount MainLoopVF, unsigned IC) {
4999 VectorizationFactor Result = VectorizationFactor::Disabled();
5000 if (!EnableEpilogueVectorization) {
5001 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5002 return Result;
5003 }
5004
5005 if (!CM.isScalarEpilogueAllowed()) {
5006 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5007 "epilogue is allowed.\n");
5008 return Result;
5009 }
5010
5011 // Not really a cost consideration, but check for unsupported cases here to
5012 // simplify the logic.
5013 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5014 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5015 "is not a supported candidate.\n");
5016 return Result;
5017 }
5018
5019 if (EpilogueVectorizationForceVF > 1) {
5020 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5021 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5022 if (hasPlanWithVF(ForcedEC))
5023 return {ForcedEC, 0, 0};
5024 else {
5025 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5026 "viable.\n");
5027 return Result;
5028 }
5029 }
5030
5031 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5032 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5033 LLVM_DEBUG(
5034 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5035 return Result;
5036 }
5037
5038 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5039 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5040 "this loop\n");
5041 return Result;
5042 }
5043
5044 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5045 // the main loop handles 8 lanes per iteration. We could still benefit from
5046 // vectorizing the epilogue loop with VF=4.
5047 ElementCount EstimatedRuntimeVF = MainLoopVF;
5048 if (MainLoopVF.isScalable()) {
5049 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5050 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5051 EstimatedRuntimeVF *= *VScale;
5052 }
5053
5054 ScalarEvolution &SE = *PSE.getSE();
5055 Type *TCType = Legal->getWidestInductionType();
5056 const SCEV *RemainingIterations = nullptr;
5057 for (auto &NextVF : ProfitableVFs) {
5058 // Skip candidate VFs without a corresponding VPlan.
5059 if (!hasPlanWithVF(NextVF.Width))
5060 continue;
5061
5062 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5063 // vectors) or the VF of the main loop (fixed vectors).
5064 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5065 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5066 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5067 continue;
5068
5069 // If NextVF is greater than the number of remaining iterations, the
5070 // epilogue loop would be dead. Skip such factors.
5071 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5072 // TODO: extend to support scalable VFs.
5073 if (!RemainingIterations) {
5074 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5075 RemainingIterations = SE.getURemExpr(
5076 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5077 }
5078 if (SE.isKnownPredicate(
5079 CmpInst::ICMP_UGT,
5080 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5081 RemainingIterations))
5082 continue;
5083 }
5084
5085 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5086 Result = NextVF;
5087 }
5088
5089 if (Result != VectorizationFactor::Disabled())
5090 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5091 << Result.Width << "\n");
5092 return Result;
5093}
5094
5095 std::pair<unsigned, unsigned>
5096 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5097 unsigned MinWidth = -1U;
5098 unsigned MaxWidth = 8;
5099 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5100 // For in-loop reductions, no element types are added to ElementTypesInLoop
5101 // if there are no loads/stores in the loop. In this case, check through the
5102 // reduction variables to determine the maximum width.
5103 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5104 // Reset MaxWidth so that we can find the smallest type used by recurrences
5105 // in the loop.
5106 MaxWidth = -1U;
5107 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5108 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5109 // When finding the min width used by the recurrence we need to account
5110 // for casts on the input operands of the recurrence.
5111 MaxWidth = std::min<unsigned>(
5112 MaxWidth, std::min<unsigned>(
5115 }
5116 } else {
5117 for (Type *T : ElementTypesInLoop) {
5118 MinWidth = std::min<unsigned>(
5119 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5120 MaxWidth = std::max<unsigned>(
5121 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5122 }
5123 }
5124 return {MinWidth, MaxWidth};
5125}
5126
5127 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5128 ElementTypesInLoop.clear();
5129 // For each block.
5130 for (BasicBlock *BB : TheLoop->blocks()) {
5131 // For each instruction in the loop.
5132 for (Instruction &I : BB->instructionsWithoutDebug()) {
5133 Type *T = I.getType();
5134
5135 // Skip ignored values.
5136 if (ValuesToIgnore.count(&I))
5137 continue;
5138
5139 // Only examine Loads, Stores and PHINodes.
5140 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5141 continue;
5142
5143 // Examine PHI nodes that are reduction variables. Update the type to
5144 // account for the recurrence type.
5145 if (auto *PN = dyn_cast<PHINode>(&I)) {
5146 if (!Legal->isReductionVariable(PN))
5147 continue;
5148 const RecurrenceDescriptor &RdxDesc =
5149 Legal->getReductionVars().find(PN)->second;
5152 RdxDesc.getRecurrenceType(),
5154 continue;
5155 T = RdxDesc.getRecurrenceType();
5156 }
5157
5158 // Examine the stored values.
5159 if (auto *ST = dyn_cast<StoreInst>(&I))
5160 T = ST->getValueOperand()->getType();
5161
5162 assert(T->isSized() &&
5163 "Expected the load/store/recurrence type to be sized");
5164
5165 ElementTypesInLoop.insert(T);
5166 }
5167 }
5168}
5169
5170unsigned
5171 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5172 InstructionCost LoopCost) {
5173 // -- The interleave heuristics --
5174 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5175 // There are many micro-architectural considerations that we can't predict
5176 // at this level. For example, frontend pressure (on decode or fetch) due to
5177 // code size, or the number and capabilities of the execution ports.
5178 //
5179 // We use the following heuristics to select the interleave count:
5180 // 1. If the code has reductions, then we interleave to break the cross
5181 // iteration dependency.
5182 // 2. If the loop is really small, then we interleave to reduce the loop
5183 // overhead.
5184 // 3. We don't interleave if we think that we will spill registers to memory
5185 // due to the increased register pressure.
5186
5187 if (!isScalarEpilogueAllowed())
5188 return 1;
5189
5190 // Do not interleave if EVL is preferred and no User IC is specified.
5191 if (foldTailWithEVL()) {
5192 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5193 "Unroll factor forced to be 1.\n");
5194 return 1;
5195 }
5196
5197 // We used the distance for the interleave count.
5198 if (!Legal->isSafeForAnyVectorWidth())
5199 return 1;
5200
5201 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5202 const bool HasReductions = !Legal->getReductionVars().empty();
5203
5204 // If we did not calculate the cost for VF (because the user selected the VF)
5205 // then we calculate the cost of VF here.
5206 if (LoopCost == 0) {
5207 LoopCost = expectedCost(VF);
5208 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5209
5210 // Loop body is free and there is no need for interleaving.
5211 if (LoopCost == 0)
5212 return 1;
5213 }
5214
5215 RegisterUsage R = calculateRegisterUsage({VF})[0];
5216 // We divide by these constants so assume that we have at least one
5217 // instruction that uses at least one register.
5218 for (auto& pair : R.MaxLocalUsers) {
5219 pair.second = std::max(pair.second, 1U);
5220 }
5221
5222 // We calculate the interleave count using the following formula.
5223 // Subtract the number of loop invariants from the number of available
5224 // registers. These registers are used by all of the interleaved instances.
5225 // Next, divide the remaining registers by the number of registers that is
5226 // required by the loop, in order to estimate how many parallel instances
5227 // fit without causing spills. All of this is rounded down if necessary to be
5228 // a power of two. We want power of two interleave count to simplify any
5229 // addressing operations or alignment considerations.
5230 // We also want power of two interleave counts to ensure that the induction
5231 // variable of the vector loop wraps to zero, when tail is folded by masking;
5232 // this currently happens when OptForSize, in which case IC is set to 1 above.
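// Worked example (illustrative register counts): with 32 vector registers, 2
// loop-invariant values and a peak of 10 registers live at once, the loop
// below computes bit_floor((32 - 2) / 10) = 2; with the induction-variable
// adjustment it becomes bit_floor((32 - 2 - 1) / 9) = 2 as well.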
5233 unsigned IC = UINT_MAX;
5234
5235 for (auto& pair : R.MaxLocalUsers) {
5236 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5237 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5238 << " registers of "
5239 << TTI.getRegisterClassName(pair.first) << " register class\n");
5240 if (VF.isScalar()) {
5241 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5242 TargetNumRegisters = ForceTargetNumScalarRegs;
5243 } else {
5244 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5245 TargetNumRegisters = ForceTargetNumVectorRegs;
5246 }
5247 unsigned MaxLocalUsers = pair.second;
5248 unsigned LoopInvariantRegs = 0;
5249 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5250 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5251
5252 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5253 MaxLocalUsers);
5254 // Don't count the induction variable as interleaved.
5255 if (EnableIndVarRegisterHeur) {
5256 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5257 std::max(1U, (MaxLocalUsers - 1)));
5258 }
5259
5260 IC = std::min(IC, TmpIC);
5261 }
5262
5263 // Clamp the interleave ranges to reasonable counts.
5264 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5265
5266 // Check if the user has overridden the max.
5267 if (VF.isScalar()) {
5268 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5269 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5270 } else {
5271 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5272 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5273 }
5274
5275 unsigned EstimatedVF = VF.getKnownMinValue();
5276 if (VF.isScalable()) {
5277 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5278 EstimatedVF *= *VScale;
5279 }
5280 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5281
5282 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5283 if (KnownTC > 0) {
5284 // At least one iteration must be scalar when this constraint holds. So the
5285 // maximum available iterations for interleaving is one less.
5286 unsigned AvailableTC =
5287 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5288
5289 // If trip count is known we select between two prospective ICs, where
5290 // 1) the aggressive IC is capped by the trip count divided by VF
5291 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5292 // The final IC is selected in a way that the epilogue loop trip count is
5293 // minimized while maximizing the IC itself, so that we either run the
5294 // vector loop at least once if it generates a small epilogue loop, or else
5295 // we run the vector loop at least twice.
5296
5297 unsigned InterleaveCountUB = bit_floor(
5298 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5299 unsigned InterleaveCountLB = bit_floor(std::max(
5300 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5301 MaxInterleaveCount = InterleaveCountLB;
5302
5303 if (InterleaveCountUB != InterleaveCountLB) {
5304 unsigned TailTripCountUB =
5305 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5306 unsigned TailTripCountLB =
5307 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5308 // If both produce same scalar tail, maximize the IC to do the same work
5309 // in fewer vector loop iterations
5310 if (TailTripCountUB == TailTripCountLB)
5311 MaxInterleaveCount = InterleaveCountUB;
5312 }
5313 } else if (BestKnownTC && *BestKnownTC > 0) {
5314 // At least one iteration must be scalar when this constraint holds. So the
5315 // maximum available iterations for interleaving is one less.
5316 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5317 ? (*BestKnownTC) - 1
5318 : *BestKnownTC;
5319
5320 // If trip count is an estimated compile time constant, limit the
5321 // IC to be capped by the trip count divided by VF * 2, such that the vector
5322 // loop runs at least twice to make interleaving seem profitable when there
5323 // is an epilogue loop present. Since the exact trip count is not known, we
5324 // choose to be conservative in our IC estimate.
5325 MaxInterleaveCount = bit_floor(std::max(
5326 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5327 }
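// Worked example (illustrative values): with AvailableTC = 100, an estimated
// VF of 8 and a target maximum of 8, InterleaveCountUB = bit_floor(min(12, 8))
// = 8 and InterleaveCountLB = bit_floor(min(6, 8)) = 4. The scalar tails
// differ (100 % 64 = 36 vs. 100 % 32 = 4), so the conservative LB of 4 is
// kept; had the tails matched, the UB of 8 would be used instead.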
5328
5329 assert(MaxInterleaveCount > 0 &&
5330 "Maximum interleave count must be greater than 0");
5331
5332 // Clamp the calculated IC to be between the 1 and the max interleave count
5333 // that the target and trip count allows.
5334 if (IC > MaxInterleaveCount)
5335 IC = MaxInterleaveCount;
5336 else
5337 // Make sure IC is greater than 0.
5338 IC = std::max(1u, IC);
5339
5340 assert(IC > 0 && "Interleave count must be greater than 0.");
5341
5342 // Interleave if we vectorized this loop and there is a reduction that could
5343 // benefit from interleaving.
5344 if (VF.isVector() && HasReductions) {
5345 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5346 return IC;
5347 }
5348
5349 // For any scalar loop that either requires runtime checks or predication we
5350 // are better off leaving this to the unroller. Note that if we've already
5351 // vectorized the loop we will have done the runtime check and so interleaving
5352 // won't require further checks.
5353 bool ScalarInterleavingRequiresPredication =
5354 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5355 return Legal->blockNeedsPredication(BB);
5356 }));
5357 bool ScalarInterleavingRequiresRuntimePointerCheck =
5358 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5359
5360 // We want to interleave small loops in order to reduce the loop overhead and
5361 // potentially expose ILP opportunities.
5362 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5363 << "LV: IC is " << IC << '\n'
5364 << "LV: VF is " << VF << '\n');
5365 const bool AggressivelyInterleaveReductions =
5366 TTI.enableAggressiveInterleaving(HasReductions);
5367 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5368 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5369 // We assume that the cost overhead is 1 and we use the cost model
5370 // to estimate the cost of the loop and interleave until the cost of the
5371 // loop overhead is about 5% of the cost of the loop.
5372 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5373 SmallLoopCost / *LoopCost.getValue()));
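// Worked example (illustrative costs): if the loop body costs 5 and
// SmallLoopCost is 20, the roughly-5% overhead target allows
// bit_floor(20 / 5) = 4 copies, so SmallIC = min(IC, 4).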
5374
5375 // Interleave until store/load ports (estimated by max interleave count) are
5376 // saturated.
5377 unsigned NumStores = Legal->getNumStores();
5378 unsigned NumLoads = Legal->getNumLoads();
5379 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5380 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5381
5382 // There is little point in interleaving for reductions containing selects
5383 // and compares when VF=1 since it may just create more overhead than it's
5384 // worth for loops with small trip counts. This is because we still have to
5385 // do the final reduction after the loop.
5386 bool HasSelectCmpReductions =
5387 HasReductions &&
5388 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5389 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5390 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5391 RdxDesc.getRecurrenceKind());
5392 });
5393 if (HasSelectCmpReductions) {
5394 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5395 return 1;
5396 }
5397
5398 // If we have a scalar reduction (vector reductions are already dealt with
5399 // by this point), we can increase the critical path length if the loop
5400 // we're interleaving is inside another loop. For tree-wise reductions
5401 // set the limit to 2, and for ordered reductions it's best to disable
5402 // interleaving entirely.
5403 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5404 bool HasOrderedReductions =
5405 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5406 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5407 return RdxDesc.isOrdered();
5408 });
5409 if (HasOrderedReductions) {
5410 LLVM_DEBUG(
5411 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5412 return 1;
5413 }
5414
5415 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5416 SmallIC = std::min(SmallIC, F);
5417 StoresIC = std::min(StoresIC, F);
5418 LoadsIC = std::min(LoadsIC, F);
5419 }
5420
5421 if (EnableLoadStoreRuntimeInterleave &&
5422 std::max(StoresIC, LoadsIC) > SmallIC) {
5423 LLVM_DEBUG(
5424 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5425 return std::max(StoresIC, LoadsIC);
5426 }
5427
5428 // If there are scalar reductions and TTI has enabled aggressive
5429 // interleaving for reductions, we will interleave to expose ILP.
5430 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5431 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5432 // Interleave no less than SmallIC but not as aggressive as the normal IC
5433 // to satisfy the rare situation when resources are too limited.
5434 return std::max(IC / 2, SmallIC);
5435 } else {
5436 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5437 return SmallIC;
5438 }
5439 }
5440
5441 // Interleave if this is a large loop (small loops are already dealt with by
5442 // this point) that could benefit from interleaving.
5443 if (AggressivelyInterleaveReductions) {
5444 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5445 return IC;
5446 }
5447
5448 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5449 return 1;
5450}
5451
5452 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5453 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5454 // This function calculates the register usage by measuring the highest number
5455 // of values that are alive at a single location. Obviously, this is a very
5456 // rough estimate. We scan the loop in topological order and
5457 // assign a number to each instruction. We use RPO to ensure that defs are
5458 // met before their users. We assume that each instruction that has in-loop
5459 // users starts an interval. We record every time that an in-loop value is
5460 // used, so we have a list of the first and last occurrences of each
5461 // instruction. Next, we transpose this data structure into a multi map that
5462 // holds the list of intervals that *end* at a specific location. This multi
5463 // map allows us to perform a linear search. We scan the instructions linearly
5464 // and record each time that a new interval starts, by placing it in a set.
5465 // If we find this value in the multi-map then we remove it from the set.
5466 // The max register usage is the maximum size of the set.
5467 // We also search for instructions that are defined outside the loop, but are
5468 // used inside the loop. We need this number separately from the max-interval
5469 // usage number because when we unroll, loop-invariant values do not take
5470 // more registers.
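// Worked example (illustrative snippet): for a straight-line body
//   %a = load ...        ; interval for %a opens here
//   %b = add %a, 1       ; interval for %b opens, %a still live
//   %c = mul %a, %b      ; last use of %a and %b
//   store %c, ...        ; last use of %c
// at most two values (%a and %b) are open at the same point, so the per-class
// estimate for this snippet is two values, each scaled by the number of
// registers its type needs at the VF under consideration.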
5471 LoopBlocksDFS DFS(TheLoop);
5472 DFS.perform(LI);
5473
5474 RegisterUsage RU;
5475
5476 // Each 'key' in the map opens a new interval. The values
5477 // of the map are the index of the 'last seen' usage of the
5478 // instruction that is the key.
5480
5481 // Maps instruction to its index.
5482 SmallVector<Instruction *, 64> IdxToInstr;
5483 // Marks the end of each interval.
5484 IntervalMap EndPoint;
5485 // Saves the list of instruction indices that are used in the loop.
5486 SmallPtrSet<Instruction *, 8> Ends;
5487 // Saves the list of values that are used in the loop but are defined outside
5488 // the loop (not including non-instruction values such as arguments and
5489 // constants).
5490 SmallSetVector<Instruction *, 8> LoopInvariants;
5491
5492 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5493 for (Instruction &I : BB->instructionsWithoutDebug()) {
5494 IdxToInstr.push_back(&I);
5495
5496 // Save the end location of each USE.
5497 for (Value *U : I.operands()) {
5498 auto *Instr = dyn_cast<Instruction>(U);
5499
5500 // Ignore non-instruction values such as arguments, constants, etc.
5501 // FIXME: Might need some motivation why these values are ignored. If
5502 // for example an argument is used inside the loop it will increase the
5503 // register pressure (so shouldn't we add it to LoopInvariants).
5504 if (!Instr)
5505 continue;
5506
5507 // If this instruction is outside the loop then record it and continue.
5508 if (!TheLoop->contains(Instr)) {
5509 LoopInvariants.insert(Instr);
5510 continue;
5511 }
5512
5513 // Overwrite previous end points.
5514 EndPoint[Instr] = IdxToInstr.size();
5515 Ends.insert(Instr);
5516 }
5517 }
5518 }
5519
5520 // Saves the list of intervals that end with the index in 'key'.
5521 using InstrList = SmallVector<Instruction *, 2>;
5522 DenseMap<unsigned, InstrList> TransposeEnds;
5523
5524 // Transpose the EndPoints to a list of values that end at each index.
5525 for (auto &Interval : EndPoint)
5526 TransposeEnds[Interval.second].push_back(Interval.first);
5527
5528 SmallPtrSet<Instruction *, 8> OpenIntervals;
5531
5532 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5533
5534 const auto &TTICapture = TTI;
5535 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5536 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5537 return 0;
5538 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5539 };
5540
5541 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5542 Instruction *I = IdxToInstr[i];
5543
5544 // Remove all of the instructions that end at this location.
5545 InstrList &List = TransposeEnds[i];
5546 for (Instruction *ToRemove : List)
5547 OpenIntervals.erase(ToRemove);
5548
5549 // Ignore instructions that are never used within the loop.
5550 if (!Ends.count(I))
5551 continue;
5552
5553 // Skip ignored values.
5554 if (ValuesToIgnore.count(I))
5555 continue;
5556
5558
5559 // For each VF find the maximum usage of registers.
5560 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5561 // Count the number of registers used, per register class, given all open
5562 // intervals.
5563 // Note that elements in this SmallMapVector will be default constructed
5564 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5565 // there is no previous entry for ClassID.
5566 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5567
5568 if (VFs[j].isScalar()) {
5569 for (auto *Inst : OpenIntervals) {
5570 unsigned ClassID =
5571 TTI.getRegisterClassForType(false, Inst->getType());
5572 // FIXME: The target might use more than one register for the type
5573 // even in the scalar case.
5574 RegUsage[ClassID] += 1;
5575 }
5576 } else {
5577 collectUniformsAndScalars(VFs[j]);
5578 for (auto *Inst : OpenIntervals) {
5579 // Skip ignored values for VF > 1.
5580 if (VecValuesToIgnore.count(Inst))
5581 continue;
5582 if (isScalarAfterVectorization(Inst, VFs[j])) {
5583 unsigned ClassID =
5584 TTI.getRegisterClassForType(false, Inst->getType());
5585 // FIXME: The target might use more than one register for the type
5586 // even in the scalar case.
5587 RegUsage[ClassID] += 1;
5588 } else {
5589 unsigned ClassID =
5590 TTI.getRegisterClassForType(true, Inst->getType());
5591 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5592 }
5593 }
5594 }
5595
5596 for (auto& pair : RegUsage) {
5597 auto &Entry = MaxUsages[j][pair.first];
5598 Entry = std::max(Entry, pair.second);
5599 }
5600 }
5601
5602 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5603 << OpenIntervals.size() << '\n');
5604
5605 // Add the current instruction to the list of open intervals.
5606 OpenIntervals.insert(I);
5607 }
5608
5609 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5610 // Note that elements in this SmallMapVector will be default constructed
5611 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5612 // there is no previous entry for ClassID.
5613 SmallMapVector<unsigned, unsigned, 4> Invariant;
5614
5615 for (auto *Inst : LoopInvariants) {
5616 // FIXME: The target might use more than one register for the type
5617 // even in the scalar case.
5618 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5619 auto *I = cast<Instruction>(U);
5620 return TheLoop != LI->getLoopFor(I->getParent()) ||
5621 isScalarAfterVectorization(I, VFs[i]);
5622 });
5623
5624 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5625 unsigned ClassID =
5626 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5627 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5628 }
5629
5630 LLVM_DEBUG({
5631 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5632 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5633 << " item\n";
5634 for (const auto &pair : MaxUsages[i]) {
5635 dbgs() << "LV(REG): RegisterClass: "
5636 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5637 << " registers\n";
5638 }
5639 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5640 << " item\n";
5641 for (const auto &pair : Invariant) {
5642 dbgs() << "LV(REG): RegisterClass: "
5643 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5644 << " registers\n";
5645 }
5646 });
5647
5648 RU.LoopInvariantRegs = Invariant;
5649 RU.MaxLocalUsers = MaxUsages[i];
5650 RUs[i] = RU;
5651 }
5652
5653 return RUs;
5654}
5655
5656bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5657 ElementCount VF) {
5658 // TODO: Cost model for emulated masked load/store is completely
5659 // broken. This hack guides the cost model to use an artificially
5660 // high enough value to practically disable vectorization with such
5661 // operations, except where previously deployed legality hack allowed
5662 // using very low cost values. This is to avoid regressions coming simply
5663 // from moving "masked load/store" check from legality to cost model.
5664 // Masked Load/Gather emulation was previously never allowed.
5665 // Limited number of Masked Store/Scatter emulation was allowed.
5667 "Expecting a scalar emulated instruction");
5668 return isa<LoadInst>(I) ||
5669 (isa<StoreInst>(I) &&
5670 NumPredStores > NumberOfStoresToPredicate);
5671}
5672
5673 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5674 // If we aren't vectorizing the loop, or if we've already collected the
5675 // instructions to scalarize, there's nothing to do. Collection may already
5676 // have occurred if we have a user-selected VF and are now computing the
5677 // expected cost for interleaving.
5678 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5679 return;
5680
5681 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5682 // not profitable to scalarize any instructions, the presence of VF in the
5683 // map will indicate that we've analyzed it already.
5684 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5685
5686 PredicatedBBsAfterVectorization[VF].clear();
5687
5688 // Find all the instructions that are scalar with predication in the loop and
5689 // determine if it would be better to not if-convert the blocks they are in.
5690 // If so, we also record the instructions to scalarize.
5691 for (BasicBlock *BB : TheLoop->blocks()) {
5692 if (!blockNeedsPredicationForAnyReason(BB))
5693 continue;
5694 for (Instruction &I : *BB)
5695 if (isScalarWithPredication(&I, VF)) {
5696 ScalarCostsTy ScalarCosts;
5697 // Do not apply discount logic for:
5698 // 1. Scalars after vectorization, as there will only be a single copy
5699 // of the instruction.
5700 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5701 // 3. Emulated masked memrefs, if a hacked cost is needed.
5702 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5703 !useEmulatedMaskMemRefHack(&I, VF) &&
5704 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5705 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5706 // Remember that BB will remain after vectorization.
5707 PredicatedBBsAfterVectorization[VF].insert(BB);
5708 for (auto *Pred : predecessors(BB)) {
5709 if (Pred->getSingleSuccessor() == BB)
5710 PredicatedBBsAfterVectorization[VF].insert(Pred);
5711 }
5712 }
5713 }
5714}
5715
5716InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5717 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5718 assert(!isUniformAfterVectorization(PredInst, VF) &&
5719 "Instruction marked uniform-after-vectorization will be predicated");
5720
5721 // Initialize the discount to zero, meaning that the scalar version and the
5722 // vector version cost the same.
5723 InstructionCost Discount = 0;
5724
5725 // Holds instructions to analyze. The instructions we visit are mapped in
5726 // ScalarCosts. Those instructions are the ones that would be scalarized if
5727 // we find that the scalar version costs less.
5728 SmallVector<Instruction *, 8> Worklist;
5729
5730 // Returns true if the given instruction can be scalarized.
5731 auto canBeScalarized = [&](Instruction *I) -> bool {
5732 // We only attempt to scalarize instructions forming a single-use chain
5733 // from the original predicated block that would otherwise be vectorized.
5734 // Although not strictly necessary, we give up on instructions we know will
5735 // already be scalar to avoid traversing chains that are unlikely to be
5736 // beneficial.
5737 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5738 isScalarAfterVectorization(I, VF))
5739 return false;
5740
5741 // If the instruction is scalar with predication, it will be analyzed
5742 // separately. We ignore it within the context of PredInst.
5743 if (isScalarWithPredication(I, VF))
5744 return false;
5745
5746 // If any of the instruction's operands are uniform after vectorization,
5747 // the instruction cannot be scalarized. This prevents, for example, a
5748 // masked load from being scalarized.
5749 //
5750 // We assume we will only emit a value for lane zero of an instruction
5751 // marked uniform after vectorization, rather than VF identical values.
5752 // Thus, if we scalarize an instruction that uses a uniform, we would
5753 // create uses of values corresponding to the lanes we aren't emitting code
5754 // for. This behavior can be changed by allowing getScalarValue to clone
5755 // the lane zero values for uniforms rather than asserting.
5756 for (Use &U : I->operands())
5757 if (auto *J = dyn_cast<Instruction>(U.get()))
5758 if (isUniformAfterVectorization(J, VF))
5759 return false;
5760
5761 // Otherwise, we can scalarize the instruction.
5762 return true;
5763 };
5764
5765 // Compute the expected cost discount from scalarizing the entire expression
5766 // feeding the predicated instruction. We currently only consider expressions
5767 // that are single-use instruction chains.
5768 Worklist.push_back(PredInst);
5769 while (!Worklist.empty()) {
5770 Instruction *I = Worklist.pop_back_val();
5771
5772 // If we've already analyzed the instruction, there's nothing to do.
5773 if (ScalarCosts.contains(I))
5774 continue;
5775
5776 // Compute the cost of the vector instruction. Note that this cost already
5777 // includes the scalarization overhead of the predicated instruction.
5778 InstructionCost VectorCost = getInstructionCost(I, VF);
5779
5780 // Compute the cost of the scalarized instruction. This cost is the cost of
5781 // the instruction as if it wasn't if-converted and instead remained in the
5782 // predicated block. We will scale this cost by block probability after
5783 // computing the scalarization overhead.
5784 InstructionCost ScalarCost =
5785 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5786
5787 // Compute the scalarization overhead of needed insertelement instructions
5788 // and phi nodes.
5789 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5790 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5791 ScalarCost += TTI.getScalarizationOverhead(
5792 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5793 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5794 /*Extract*/ false, CostKind);
5795 ScalarCost +=
5796 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5797 }
5798
5799 // Compute the scalarization overhead of needed extractelement
5800 // instructions. For each of the instruction's operands, if the operand can
5801 // be scalarized, add it to the worklist; otherwise, account for the
5802 // overhead.
5803 for (Use &U : I->operands())
5804 if (auto *J = dyn_cast<Instruction>(U.get())) {
5805 assert(VectorType::isValidElementType(J->getType()) &&
5806 "Instruction has non-scalar type");
5807 if (canBeScalarized(J))
5808 Worklist.push_back(J);
5809 else if (needsExtract(J, VF)) {
5810 ScalarCost += TTI.getScalarizationOverhead(
5811 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5812 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5813 /*Extract*/ true, CostKind);
5814 }
5815 }
5816
5817 // Scale the total scalar cost by block probability.
5818 ScalarCost /= getReciprocalPredBlockProb();
5819
5820 // Compute the discount. A non-negative discount means the vector version
5821 // of the instruction costs more, and scalarizing would be beneficial.
5822 Discount += VectorCost - ScalarCost;
5823 ScalarCosts[I] = ScalarCost;
5824 }
5825
5826 return Discount;
5827}
5828
5829 InstructionCost LoopVectorizationCostModel::expectedCost(
5830 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5831 InstructionCost Cost;
5832
5833 // For each block.
5834 for (BasicBlock *BB : TheLoop->blocks()) {
5835 InstructionCost BlockCost;
5836
5837 // For each instruction in the old loop.
5838 for (Instruction &I : BB->instructionsWithoutDebug()) {
5839 // Skip ignored values.
5840 if (ValuesToIgnore.count(&I) ||
5841 (VF.isVector() && VecValuesToIgnore.count(&I)))
5842 continue;
5843
5844 InstructionCost C = getInstructionCost(&I, VF);
5845
5846 // Check if we should override the cost.
5847 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5848 C = InstructionCost(ForceTargetInstructionCost);
5849
5850 // Keep a list of instructions with invalid costs.
5851 if (Invalid && !C.isValid())
5852 Invalid->emplace_back(&I, VF);
5853
5854 BlockCost += C;
5855 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5856 << VF << " For instruction: " << I << '\n');
5857 }
5858
5859 // If we are vectorizing a predicated block, it will have been
5860 // if-converted. This means that the block's instructions (aside from
5861 // stores and instructions that may divide by zero) will now be
5862 // unconditionally executed. For the scalar case, we may not always execute
5863 // the predicated block, if it is an if-else block. Thus, scale the block's
5864 // cost by the probability of executing it. blockNeedsPredication from
5865 // Legal is used so as to not include all blocks in tail folded loops.
5866 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5867 BlockCost /= getReciprocalPredBlockProb();
5868
5869 Cost += BlockCost;
5870 }
5871
5872 return Cost;
5873}
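// Worked example (illustrative): in the scalar (VF=1) cost, a predicated
// block whose instructions sum to 20 contributes
// 20 / getReciprocalPredBlockProb(), i.e. 10 under the usual 50% block
// probability assumption, reflecting that it only runs on a fraction of the
// iterations; blocks in the vectorized, if-converted loop are not scaled.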
5874
5875/// Gets Address Access SCEV after verifying that the access pattern
5876/// is loop invariant except the induction variable dependence.
5877///
5878/// This SCEV can be sent to the Target in order to estimate the address
5879/// calculation cost.
5880 static const SCEV *getAddressAccessSCEV(
5881 Value *Ptr,
5882 LoopVectorizationLegality *Legal,
5883 PredicatedScalarEvolution &PSE,
5884 const Loop *TheLoop) {
5885
5886 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5887 if (!Gep)
5888 return nullptr;
5889
5890 // We are looking for a gep with all loop invariant indices except for one
5891 // which should be an induction variable.
5892 auto SE = PSE.getSE();
5893 unsigned NumOperands = Gep->getNumOperands();
5894 for (unsigned i = 1; i < NumOperands; ++i) {
5895 Value *Opd = Gep->getOperand(i);
5896 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5897 !Legal->isInductionVariable(Opd))
5898 return nullptr;
5899 }
5900
5901 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5902 return PSE.getSCEV(Ptr);
5903}
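// Worked example (illustrative IR): a pointer such as
//   %p = getelementptr inbounds [1024 x float], ptr %A, i64 0, i64 %iv
// qualifies, since %A and the constant index are loop invariant and %iv is an
// induction variable; a GEP whose extra index varies in the loop without
// being an induction makes the helper above return nullptr.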
5904
5905 InstructionCost
5906 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5907 ElementCount VF) {
5908 assert(VF.isVector() &&
5909 "Scalarization cost of instruction implies vectorization.");
5910 if (VF.isScalable())
5911 return InstructionCost::getInvalid();
5912
5913 Type *ValTy = getLoadStoreType(I);
5914 auto SE = PSE.getSE();
5915
5916 unsigned AS = getLoadStoreAddressSpace(I);
5917 Value *Ptr = getLoadStorePointerOperand(I);
5918 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5919 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5920 // that it is being called from this specific place.
5921
5922 // Figure out whether the access is strided and get the stride value
5923 // if it's known at compile time.
5924 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5925
5926 // Get the cost of the scalar memory instruction and address computation.
5927 InstructionCost Cost =
5928 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5929
5930 // Don't pass *I here, since it is scalar but will actually be part of a
5931 // vectorized loop where the user of it is a vectorized instruction.
5933 const Align Alignment = getLoadStoreAlignment(I);
5934 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5935 ValTy->getScalarType(),
5936 Alignment, AS, CostKind);
5937
5938 // Get the overhead of the extractelement and insertelement instructions
5939 // we might create due to scalarization.
5940 Cost += getScalarizationOverhead(I, VF, CostKind);
5941
5942 // If we have a predicated load/store, it will need extra i1 extracts and
5943 // conditional branches, but may not be executed for each vector lane. Scale
5944 // the cost by the probability of executing the predicated block.
5945 if (isPredicatedInst(I)) {
5947
5948 // Add the cost of an i1 extract and a branch
5949 auto *Vec_i1Ty =
5952 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5953 /*Insert=*/false, /*Extract=*/true, CostKind);
5954 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5955
5956 if (useEmulatedMaskMemRefHack(I, VF))
5957 // Artificially setting to a high enough value to practically disable
5958 // vectorization with such operations.
5959 Cost = 3000000;
5960 }
5961
5962 return Cost;
5963}
5964
5965 InstructionCost
5966 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5967 ElementCount VF) {
5968 Type *ValTy = getLoadStoreType(I);
5969 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5971 unsigned AS = getLoadStoreAddressSpace(I);
5972 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5974
5975 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5976 "Stride should be 1 or -1 for consecutive memory access");
5977 const Align Alignment = getLoadStoreAlignment(I);
5979 if (Legal->isMaskRequired(I)) {
5980 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5981 CostKind);
5982 } else {
5983 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5984 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5985 CostKind, OpInfo, I);
5986 }
5987
5988 bool Reverse = ConsecutiveStride < 0;
5989 if (Reverse)
5991 std::nullopt, CostKind, 0);
5992 return Cost;
5993}
5994
5995 InstructionCost
5996 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5997 ElementCount VF) {
5998 assert(Legal->isUniformMemOp(*I, VF));
5999
6000 Type *ValTy = getLoadStoreType(I);
6001 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6002 const Align Alignment = getLoadStoreAlignment(I);
6003 unsigned AS = getLoadStoreAddressSpace(I);
6005 if (isa<LoadInst>(I)) {
6006 return TTI.getAddressComputationCost(ValTy) +
6007 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6008 CostKind) +
6010 }
6011 StoreInst *SI = cast<StoreInst>(I);
6012
6013 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6014 return TTI.getAddressComputationCost(ValTy) +
6015 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6016 CostKind) +
6017 (isLoopInvariantStoreValue
6018 ? 0
6019 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6020 CostKind, VF.getKnownMinValue() - 1));
6021}
6022
6023 InstructionCost
6024 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6025 ElementCount VF) {
6026 Type *ValTy = getLoadStoreType(I);
6027 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6028 const Align Alignment = getLoadStoreAlignment(I);
6030
6031 return TTI.getAddressComputationCost(VectorTy) +
6033 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6035}
6036
6037 InstructionCost
6038 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6039 ElementCount VF) {
6040 Type *ValTy = getLoadStoreType(I);
6041 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6042 unsigned AS = getLoadStoreAddressSpace(I);
6044
6045 auto Group = getInterleavedAccessGroup(I);
6046 assert(Group && "Fail to get an interleaved access group.");
6047
6048 unsigned InterleaveFactor = Group->getFactor();
6049 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6050
6051 // Holds the indices of existing members in the interleaved group.
6053 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6054 if (Group->getMember(IF))
6055 Indices.push_back(IF);
6056
6057 // Calculate the cost of the whole interleaved group.
6058 bool UseMaskForGaps =
6059 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6060 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6062 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6063 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6064
6065 if (Group->isReverse()) {
6066 // TODO: Add support for reversed masked interleaved access.
6068 "Reverse masked interleaved access not supported.");
6069 Cost += Group->getNumMembers() *
6071 std::nullopt, CostKind, 0);
6072 }
6073 return Cost;
6074}
6075
6076std::optional<InstructionCost>
6078 Instruction *I, ElementCount VF, Type *Ty,
6080 using namespace llvm::PatternMatch;
6081 // Early exit for no inloop reductions
6082 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6083 return std::nullopt;
6084 auto *VectorTy = cast<VectorType>(Ty);
6085
6086 // We are looking for a pattern of, and finding the minimal acceptable cost:
6087 // reduce(mul(ext(A), ext(B))) or
6088 // reduce(mul(A, B)) or
6089 // reduce(ext(A)) or
6090 // reduce(A).
6091 // The basic idea is that we walk down the tree to do that, finding the root
6092 // reduction instruction in InLoopReductionImmediateChains. From there we find
6093 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
 6094 // of the components. If the reduction cost is lower, we return it for the
 6095 // reduction instruction and 0 for the other instructions in the pattern. If
 6096 // it is not, we return an invalid cost specifying that the original cost
 6097 // method should be used.
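  // Illustrative IR sketch of the widest pattern considered here,
  // reduce.add(mul(ext(A), ext(B))), assuming VF = 4 and i8 inputs:
  //   %ea  = sext <4 x i8> %A to <4 x i32>
  //   %eb  = sext <4 x i8> %B to <4 x i32>
  //   %m   = mul <4 x i32> %ea, %eb
  //   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  // Targets with dot-product style instructions can often cost this whole
  // chain more cheaply than the sum of its parts, which is what the
  // comparisons against the component costs plus BaseCost below check.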
6098 Instruction *RetI = I;
6099 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6100 if (!RetI->hasOneUser())
6101 return std::nullopt;
6102 RetI = RetI->user_back();
6103 }
6104
6105 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6106 RetI->user_back()->getOpcode() == Instruction::Add) {
6107 RetI = RetI->user_back();
6108 }
6109
 6110 // Test if the found instruction is a reduction; if not, return an invalid
 6111 // cost, telling the parent to use the original cost modelling.
6112 if (!InLoopReductionImmediateChains.count(RetI))
6113 return std::nullopt;
6114
6115 // Find the reduction this chain is a part of and calculate the basic cost of
6116 // the reduction on its own.
6117 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6118 Instruction *ReductionPhi = LastChain;
6119 while (!isa<PHINode>(ReductionPhi))
6120 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6121
6122 const RecurrenceDescriptor &RdxDesc =
6123 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6124
6125 InstructionCost BaseCost;
6126 RecurKind RK = RdxDesc.getRecurrenceKind();
6129 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
6130 RdxDesc.getFastMathFlags(), CostKind);
6131 } else {
6133 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6134 }
6135
6136 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6137 // normal fmul instruction to the cost of the fadd reduction.
6138 if (RK == RecurKind::FMulAdd)
6139 BaseCost +=
6140 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6141
6142 // If we're using ordered reductions then we can just return the base cost
6143 // here, since getArithmeticReductionCost calculates the full ordered
6144 // reduction cost when FP reassociation is not allowed.
6145 if (useOrderedReductions(RdxDesc))
6146 return BaseCost;
6147
6148 // Get the operand that was not the reduction chain and match it to one of the
6149 // patterns, returning the better cost if it is found.
6150 Instruction *RedOp = RetI->getOperand(1) == LastChain
6151 ? dyn_cast<Instruction>(RetI->getOperand(0))
6152 : dyn_cast<Instruction>(RetI->getOperand(1));
6153
6154 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6155
6156 Instruction *Op0, *Op1;
6157 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6158 match(RedOp,
6160 match(Op0, m_ZExtOrSExt(m_Value())) &&
6161 Op0->getOpcode() == Op1->getOpcode() &&
6162 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6164 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6165
 6166 // Matched reduce.add(ext(mul(ext(A), ext(B))))
6167 // Note that the extend opcodes need to all match, or if A==B they will have
6168 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6169 // which is equally fine.
6170 bool IsUnsigned = isa<ZExtInst>(Op0);
6171 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6172 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6173
6174 InstructionCost ExtCost =
6175 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6177 InstructionCost MulCost =
6178 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6179 InstructionCost Ext2Cost =
6180 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6182
6184 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6185
6186 if (RedCost.isValid() &&
6187 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6188 return I == RetI ? RedCost : 0;
6189 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6190 !TheLoop->isLoopInvariant(RedOp)) {
6191 // Matched reduce(ext(A))
6192 bool IsUnsigned = isa<ZExtInst>(RedOp);
6193 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6195 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6196 RdxDesc.getFastMathFlags(), CostKind);
6197
6198 InstructionCost ExtCost =
6199 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6201 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6202 return I == RetI ? RedCost : 0;
6203 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6204 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6205 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6206 Op0->getOpcode() == Op1->getOpcode() &&
6208 bool IsUnsigned = isa<ZExtInst>(Op0);
6209 Type *Op0Ty = Op0->getOperand(0)->getType();
6210 Type *Op1Ty = Op1->getOperand(0)->getType();
6211 Type *LargestOpTy =
6212 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6213 : Op0Ty;
6214 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6215
6216 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
 6217 // different sizes. We take the largest type as the ext for the reduction, and
 6218 // add the cost of the remaining extend, as in reduce(mul(ext(ext(A)), ext(B))).
6220 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6223 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6225 InstructionCost MulCost =
6226 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6227
6229 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6230 InstructionCost ExtraExtCost = 0;
6231 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6232 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6233 ExtraExtCost = TTI.getCastInstrCost(
6234 ExtraExtOp->getOpcode(), ExtType,
6235 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6237 }
6238
6239 if (RedCost.isValid() &&
6240 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6241 return I == RetI ? RedCost : 0;
6242 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6243 // Matched reduce.add(mul())
6244 InstructionCost MulCost =
6245 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6246
6248 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6249
6250 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6251 return I == RetI ? RedCost : 0;
6252 }
6253 }
6254
6255 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6256}
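// Usage note: callers compare the returned value against their generic
// costing. A returned 0 for a non-root member of a matched pattern means the
// cost was already accounted for at the pattern root, while std::nullopt
// means no pattern matched and the generic cost model should be used.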
6257
6259LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6260 ElementCount VF) {
6261 // Calculate scalar cost only. Vectorization cost should be ready at this
6262 // moment.
6263 if (VF.isScalar()) {
6264 Type *ValTy = getLoadStoreType(I);
6265 const Align Alignment = getLoadStoreAlignment(I);
6266 unsigned AS = getLoadStoreAddressSpace(I);
6267
6268 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6269 return TTI.getAddressComputationCost(ValTy) +
6270 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6271 TTI::TCK_RecipThroughput, OpInfo, I);
6272 }
6273 return getWideningCost(I, VF);
6274}
6275
6276InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6278
6279 // There is no mechanism yet to create a scalable scalarization loop,
6280 // so this is currently Invalid.
6281 if (VF.isScalable())
6283
6284 if (VF.isScalar())
6285 return 0;
6286
6288 Type *RetTy = ToVectorTy(I->getType(), VF);
6289 if (!RetTy->isVoidTy() &&
6290 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6292 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6293 /*Insert*/ true,
6294 /*Extract*/ false, CostKind);
6295
6296 // Some targets keep addresses scalar.
6297 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6298 return Cost;
6299
6300 // Some targets support efficient element stores.
6301 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6302 return Cost;
6303
6304 // Collect operands to consider.
6305 CallInst *CI = dyn_cast<CallInst>(I);
6306 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6307
6308 // Skip operands that do not require extraction/scalarization and do not incur
6309 // any overhead.
6311 for (auto *V : filterExtractingOperands(Ops, VF))
6312 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6314 filterExtractingOperands(Ops, VF), Tys, CostKind);
6315}
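// Illustrative sketch, assuming VF = 4 and a call such as
//   %r = call float @foo(float %x)     ; @foo is a hypothetical scalar callee
// that has to be scalarized: the overhead modelled here is the 4 inserts
// needed to rebuild the <4 x float> result plus the extracts of the vector
// operand's lanes; the 4 scalar calls themselves are costed by the caller.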
6316
6318 if (VF.isScalar())
6319 return;
6320 NumPredStores = 0;
6321 for (BasicBlock *BB : TheLoop->blocks()) {
6322 // For each instruction in the old loop.
6323 for (Instruction &I : *BB) {
6325 if (!Ptr)
6326 continue;
6327
6328 // TODO: We should generate better code and update the cost model for
6329 // predicated uniform stores. Today they are treated as any other
6330 // predicated store (see added test cases in
6331 // invariant-store-vectorization.ll).
6332 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6333 NumPredStores++;
6334
6335 if (Legal->isUniformMemOp(I, VF)) {
6336 auto isLegalToScalarize = [&]() {
6337 if (!VF.isScalable())
6338 // Scalarization of fixed length vectors "just works".
6339 return true;
6340
6341 // We have dedicated lowering for unpredicated uniform loads and
6342 // stores. Note that even with tail folding we know that at least
6343 // one lane is active (i.e. generalized predication is not possible
6344 // here), and the logic below depends on this fact.
6345 if (!foldTailByMasking())
6346 return true;
6347
6348 // For scalable vectors, a uniform memop load is always
6349 // uniform-by-parts and we know how to scalarize that.
6350 if (isa<LoadInst>(I))
6351 return true;
6352
 6353 // A uniform store isn't necessarily uniform-by-parts
6354 // and we can't assume scalarization.
6355 auto &SI = cast<StoreInst>(I);
6356 return TheLoop->isLoopInvariant(SI.getValueOperand());
6357 };
6358
6359 const InstructionCost GatherScatterCost =
6361 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6362
6363 // Load: Scalar load + broadcast
6364 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6365 // FIXME: This cost is a significant under-estimate for tail folded
6366 // memory ops.
6367 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6368 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6369
 6370 // Choose the better solution for the current VF. Note that Invalid
 6371 // costs compare as maximally large. If both are invalid, we get a
 6372 // scalable invalid cost, which signals a failure and a vectorization abort.
6373 if (GatherScatterCost < ScalarizationCost)
6374 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6375 else
6376 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6377 continue;
6378 }
6379
6380 // We assume that widening is the best solution when possible.
6381 if (memoryInstructionCanBeWidened(&I, VF)) {
6382 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6383 int ConsecutiveStride = Legal->isConsecutivePtr(
6385 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6386 "Expected consecutive stride.");
6387 InstWidening Decision =
6388 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6389 setWideningDecision(&I, VF, Decision, Cost);
6390 continue;
6391 }
6392
6393 // Choose between Interleaving, Gather/Scatter or Scalarization.
6395 unsigned NumAccesses = 1;
6396 if (isAccessInterleaved(&I)) {
6397 auto Group = getInterleavedAccessGroup(&I);
6398 assert(Group && "Fail to get an interleaved access group.");
6399
6400 // Make one decision for the whole group.
6401 if (getWideningDecision(&I, VF) != CM_Unknown)
6402 continue;
6403
6404 NumAccesses = Group->getNumMembers();
6406 InterleaveCost = getInterleaveGroupCost(&I, VF);
6407 }
6408
6409 InstructionCost GatherScatterCost =
6411 ? getGatherScatterCost(&I, VF) * NumAccesses
6413
6414 InstructionCost ScalarizationCost =
6415 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6416
 6417 // Choose the better solution for the current VF,
6418 // write down this decision and use it during vectorization.
6420 InstWidening Decision;
6421 if (InterleaveCost <= GatherScatterCost &&
6422 InterleaveCost < ScalarizationCost) {
6423 Decision = CM_Interleave;
6424 Cost = InterleaveCost;
6425 } else if (GatherScatterCost < ScalarizationCost) {
6426 Decision = CM_GatherScatter;
6427 Cost = GatherScatterCost;
6428 } else {
6429 Decision = CM_Scalarize;
6430 Cost = ScalarizationCost;
6431 }
 6432 // If the instruction belongs to an interleave group, the whole group
6433 // receives the same decision. The whole group receives the cost, but
6434 // the cost will actually be assigned to one instruction.
6435 if (auto Group = getInterleavedAccessGroup(&I))
6436 setWideningDecision(Group, VF, Decision, Cost);
6437 else
6438 setWideningDecision(&I, VF, Decision, Cost);
6439 }
6440 }
6441
 6442 // Make sure that any load of an address and any other address computation
6443 // remains scalar unless there is gather/scatter support. This avoids
6444 // inevitable extracts into address registers, and also has the benefit of
6445 // activating LSR more, since that pass can't optimize vectorized
6446 // addresses.
6448 return;
6449
6450 // Start with all scalar pointer uses.
6452 for (BasicBlock *BB : TheLoop->blocks())
6453 for (Instruction &I : *BB) {
6454 Instruction *PtrDef =
6455 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6456 if (PtrDef && TheLoop->contains(PtrDef) &&
6458 AddrDefs.insert(PtrDef);
6459 }
6460
6461 // Add all instructions used to generate the addresses.
6463 append_range(Worklist, AddrDefs);
6464 while (!Worklist.empty()) {
6465 Instruction *I = Worklist.pop_back_val();
6466 for (auto &Op : I->operands())
6467 if (auto *InstOp = dyn_cast<Instruction>(Op))
6468 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6469 AddrDefs.insert(InstOp).second)
6470 Worklist.push_back(InstOp);
6471 }
6472
6473 for (auto *I : AddrDefs) {
6474 if (isa<LoadInst>(I)) {
 6475 // Setting the desired widening decision should ideally be handled
6476 // by cost functions, but since this involves the task of finding out
6477 // if the loaded register is involved in an address computation, it is
6478 // instead changed here when we know this is the case.
6479 InstWidening Decision = getWideningDecision(I, VF);
6480 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6481 // Scalarize a widened load of address.
6483 I, VF, CM_Scalarize,
6484 (VF.getKnownMinValue() *
6485 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6486 else if (auto Group = getInterleavedAccessGroup(I)) {
6487 // Scalarize an interleave group of address loads.
6488 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6489 if (Instruction *Member = Group->getMember(I))
6491 Member, VF, CM_Scalarize,
6492 (VF.getKnownMinValue() *
6493 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6494 }
6495 }
6496 } else
 6497 // Make sure I gets scalarized and is given a cost estimate without
6498 // scalarization overhead.
6499 ForcedScalars[VF].insert(I);
6500 }
6501}
6502
6504 assert(!VF.isScalar() &&
6505 "Trying to set a vectorization decision for a scalar VF");
6506
6507 for (BasicBlock *BB : TheLoop->blocks()) {
6508 // For each instruction in the old loop.
6509 for (Instruction &I : *BB) {
6510 CallInst *CI = dyn_cast<CallInst>(&I);
6511
6512 if (!CI)
6513 continue;
6514
6519
6520 Function *ScalarFunc = CI->getCalledFunction();
6521 Type *ScalarRetTy = CI->getType();
6522 SmallVector<Type *, 4> Tys, ScalarTys;
6523 bool MaskRequired = Legal->isMaskRequired(CI);
6524 for (auto &ArgOp : CI->args())
6525 ScalarTys.push_back(ArgOp->getType());
6526
6527 // Compute corresponding vector type for return value and arguments.
6528 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6529 for (Type *ScalarTy : ScalarTys)
6530 Tys.push_back(ToVectorTy(ScalarTy, VF));
6531
6532 // An in-loop reduction using an fmuladd intrinsic is a special case;
6533 // we don't want the normal cost for that intrinsic.
6535 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6538 std::nullopt, *RedCost);
6539 continue;
6540 }
6541
6542 // Estimate cost of scalarized vector call. The source operands are
6543 // assumed to be vectors, so we need to extract individual elements from
6544 // there, execute VF scalar calls, and then gather the result into the
6545 // vector return value.
6546 InstructionCost ScalarCallCost =
6547 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6548
6549 // Compute costs of unpacking argument values for the scalar calls and
6550 // packing the return values to a vector.
6551 InstructionCost ScalarizationCost =
6552 getScalarizationOverhead(CI, VF, CostKind);
6553
6554 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6555
6556 // Find the cost of vectorizing the call, if we can find a suitable
6557 // vector variant of the function.
6558 bool UsesMask = false;
6559 VFInfo FuncInfo;
6560 Function *VecFunc = nullptr;
6561 // Search through any available variants for one we can use at this VF.
6562 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6563 // Must match requested VF.
6564 if (Info.Shape.VF != VF)
6565 continue;
6566
6567 // Must take a mask argument if one is required
6568 if (MaskRequired && !Info.isMasked())
6569 continue;
6570
6571 // Check that all parameter kinds are supported
6572 bool ParamsOk = true;
6573 for (VFParameter Param : Info.Shape.Parameters) {
6574 switch (Param.ParamKind) {
6576 break;
6578 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6579 // Make sure the scalar parameter in the loop is invariant.
6580 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6581 TheLoop))
6582 ParamsOk = false;
6583 break;
6584 }
6586 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6587 // Find the stride for the scalar parameter in this loop and see if
6588 // it matches the stride for the variant.
6589 // TODO: do we need to figure out the cost of an extract to get the
6590 // first lane? Or do we hope that it will be folded away?
6591 ScalarEvolution *SE = PSE.getSE();
6592 const auto *SAR =
6593 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6594
6595 if (!SAR || SAR->getLoop() != TheLoop) {
6596 ParamsOk = false;
6597 break;
6598 }
6599
6600 const SCEVConstant *Step =
6601 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6602
6603 if (!Step ||
6604 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6605 ParamsOk = false;
6606
6607 break;
6608 }
6610 UsesMask = true;
6611 break;
6612 default:
6613 ParamsOk = false;
6614 break;
6615 }
6616 }
6617
6618 if (!ParamsOk)
6619 continue;
6620
6621 // Found a suitable candidate, stop here.
6622 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6623 FuncInfo = Info;
6624 break;
6625 }
6626
6627 // Add in the cost of synthesizing a mask if one wasn't required.
6628 InstructionCost MaskCost = 0;
6629 if (VecFunc && UsesMask && !MaskRequired)
6630 MaskCost = TTI.getShuffleCost(
6633 VecFunc->getFunctionType()->getContext()),
6634 VF));
6635
6636 if (TLI && VecFunc && !CI->isNoBuiltin())
6637 VectorCost =
6638 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6639
6640 // Find the cost of an intrinsic; some targets may have instructions that
6641 // perform the operation without needing an actual call.
6643 if (IID != Intrinsic::not_intrinsic)
6644 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6645
6646 InstructionCost Cost = ScalarCost;
6647 InstWidening Decision = CM_Scalarize;
6648
6649 if (VectorCost <= Cost) {
6650 Cost = VectorCost;
6651 Decision = CM_VectorCall;
6652 }
6653
6654 if (IntrinsicCost <= Cost) {
6655 Cost = IntrinsicCost;
6656 Decision = CM_IntrinsicCall;
6657 }
6658
6659 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6661 }
6662 }
6663}
6664
6667 ElementCount VF) {
6668 // If we know that this instruction will remain uniform, check the cost of
6669 // the scalar version.
6671 VF = ElementCount::getFixed(1);
6672
6673 if (VF.isVector() && isProfitableToScalarize(I, VF))
6674 return InstsToScalarize[VF][I];
6675
6676 // Forced scalars do not have any scalarization overhead.
6677 auto ForcedScalar = ForcedScalars.find(VF);
6678 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6679 auto InstSet = ForcedScalar->second;
6680 if (InstSet.count(I))
6682 VF.getKnownMinValue();
6683 }
6684
6685 Type *RetTy = I->getType();
6687 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6688 auto SE = PSE.getSE();
6690
6691 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6692 ElementCount VF) -> bool {
6693 if (VF.isScalar())
6694 return true;
6695
6696 auto Scalarized = InstsToScalarize.find(VF);
6697 assert(Scalarized != InstsToScalarize.end() &&
6698 "VF not yet analyzed for scalarization profitability");
6699 return !Scalarized->second.count(I) &&
6700 llvm::all_of(I->users(), [&](User *U) {
6701 auto *UI = cast<Instruction>(U);
6702 return !Scalarized->second.count(UI);
6703 });
6704 };
6705 (void) hasSingleCopyAfterVectorization;
6706
6707 Type *VectorTy;
6708 if (isScalarAfterVectorization(I, VF)) {
6709 // With the exception of GEPs and PHIs, after scalarization there should
6710 // only be one copy of the instruction generated in the loop. This is
6711 // because the VF is either 1, or any instructions that need scalarizing
6712 // have already been dealt with by the time we get here. As a result,
 6713 // we don't have to multiply the instruction cost by VF.
6714 assert(I->getOpcode() == Instruction::GetElementPtr ||
6715 I->getOpcode() == Instruction::PHI ||
6716 (I->getOpcode() == Instruction::BitCast &&
6717 I->getType()->isPointerTy()) ||
6718 hasSingleCopyAfterVectorization(I, VF));
6719 VectorTy = RetTy;
6720 } else
6721 VectorTy = ToVectorTy(RetTy, VF);
6722
6723 if (VF.isVector() && VectorTy->isVectorTy() &&
6724 !TTI.getNumberOfParts(VectorTy))
6726
6727 // TODO: We need to estimate the cost of intrinsic calls.
6728 switch (I->getOpcode()) {
6729 case Instruction::GetElementPtr:
6730 // We mark this instruction as zero-cost because the cost of GEPs in
6731 // vectorized code depends on whether the corresponding memory instruction
6732 // is scalarized or not. Therefore, we handle GEPs with the memory
6733 // instruction cost.
6734 return 0;
6735 case Instruction::Br: {
6736 // In cases of scalarized and predicated instructions, there will be VF
6737 // predicated blocks in the vectorized loop. Each branch around these
 6738 // blocks also requires an extract of its vector compare i1 element.
6739 // Note that the conditional branch from the loop latch will be replaced by
6740 // a single branch controlling the loop, so there is no extra overhead from
6741 // scalarization.
6742 bool ScalarPredicatedBB = false;
6743 BranchInst *BI = cast<BranchInst>(I);
6744 if (VF.isVector() && BI->isConditional() &&
6745 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6746 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6747 BI->getParent() != TheLoop->getLoopLatch())
6748 ScalarPredicatedBB = true;
6749
6750 if (ScalarPredicatedBB) {
 6751 // Not possible to scalarize a scalable vector with predicated instructions.
6752 if (VF.isScalable())
6754 // Return cost for branches around scalarized and predicated blocks.
6755 auto *Vec_i1Ty =
6756 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6757 return (
6759 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6760 /*Insert*/ false, /*Extract*/ true, CostKind) +
6761 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6762 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6763 // The back-edge branch will remain, as will all scalar branches.
6764 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6765 else
6766 // This branch will be eliminated by if-conversion.
6767 return 0;
6768 // Note: We currently assume zero cost for an unconditional branch inside
6769 // a predicated block since it will become a fall-through, although we
6770 // may decide in the future to call TTI for all branches.
6771 }
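  // Illustrative sketch: with VF = 4 and a block that is both predicated and
  // scalarized, the branch cost above covers 4 extracts from the <4 x i1>
  // mask plus 4 scalar branches, one per lane guarding its predicated copy.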
6772 case Instruction::PHI: {
6773 auto *Phi = cast<PHINode>(I);
6774
6775 // First-order recurrences are replaced by vector shuffles inside the loop.
6776 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6777 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6778 // penultimate value of the recurrence.
6779 // TODO: Consider vscale_range info.
6780 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6783 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6785 cast<VectorType>(VectorTy), Mask, CostKind,
6786 VF.getKnownMinValue() - 1);
6787 }
6788
6789 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6790 // converted into select instructions. We require N - 1 selects per phi
6791 // node, where N is the number of incoming values.
6792 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6793 return (Phi->getNumIncomingValues() - 1) *
6795 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6796 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6798
6799 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6800 }
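  // Illustrative sketch: a non-header phi such as
  //   %x = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
  // is if-converted into N - 1 = 2 vector selects, which is exactly what the
  // cost above charges.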
6801 case Instruction::UDiv:
6802 case Instruction::SDiv:
6803 case Instruction::URem:
6804 case Instruction::SRem:
6805 if (VF.isVector() && isPredicatedInst(I)) {
6806 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6807 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6808 ScalarCost : SafeDivisorCost;
6809 }
6810 // We've proven all lanes safe to speculate, fall through.
6811 [[fallthrough]];
6812 case Instruction::Add:
6813 case Instruction::FAdd:
6814 case Instruction::Sub:
6815 case Instruction::FSub:
6816 case Instruction::Mul:
6817 case Instruction::FMul:
6818 case Instruction::FDiv:
6819 case Instruction::FRem:
6820 case Instruction::Shl:
6821 case Instruction::LShr:
6822 case Instruction::AShr:
6823 case Instruction::And:
6824 case Instruction::Or:
6825 case Instruction::Xor: {
6826 // If we're speculating on the stride being 1, the multiplication may
6827 // fold away. We can generalize this for all operations using the notion
6828 // of neutral elements. (TODO)
6829 if (I->getOpcode() == Instruction::Mul &&
6830 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6831 PSE.getSCEV(I->getOperand(1))->isOne()))
6832 return 0;
6833
6834 // Detect reduction patterns
6835 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6836 return *RedCost;
6837
6838 // Certain instructions can be cheaper to vectorize if they have a constant
6839 // second vector operand. One example of this are shifts on x86.
6840 Value *Op2 = I->getOperand(1);
6841 auto Op2Info = TTI.getOperandInfo(Op2);
6842 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6843 Legal->isInvariant(Op2))
6845
6846 SmallVector<const Value *, 4> Operands(I->operand_values());
6848 I->getOpcode(), VectorTy, CostKind,
6849 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6850 Op2Info, Operands, I, TLI);
6851 }
6852 case Instruction::FNeg: {
6854 I->getOpcode(), VectorTy, CostKind,
6855 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6856 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6857 I->getOperand(0), I);
6858 }
6859 case Instruction::Select: {
6860 SelectInst *SI = cast<SelectInst>(I);
6861 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6862 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6863
6864 const Value *Op0, *Op1;
6865 using namespace llvm::PatternMatch;
6866 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6867 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6868 // select x, y, false --> x & y
6869 // select x, true, y --> x | y
6870 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6871 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6872 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6873 Op1->getType()->getScalarSizeInBits() == 1);
6874
6877 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6878 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6879 }
6880
6881 Type *CondTy = SI->getCondition()->getType();
6882 if (!ScalarCond)
6883 CondTy = VectorType::get(CondTy, VF);
6884
6886 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6887 Pred = Cmp->getPredicate();
6888 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6889 CostKind, I);
6890 }
6891 case Instruction::ICmp:
6892 case Instruction::FCmp: {
6893 Type *ValTy = I->getOperand(0)->getType();
6894 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6895 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6896 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6897 VectorTy = ToVectorTy(ValTy, VF);
6898 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6899 cast<CmpInst>(I)->getPredicate(), CostKind,
6900 I);
6901 }
6902 case Instruction::Store:
6903 case Instruction::Load: {
6904 ElementCount Width = VF;
6905 if (Width.isVector()) {
6906 InstWidening Decision = getWideningDecision(I, Width);
6907 assert(Decision != CM_Unknown &&
6908 "CM decision should be taken at this point");
6911 if (Decision == CM_Scalarize)
6912 Width = ElementCount::getFixed(1);
6913 }
6914 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
6915 return getMemoryInstructionCost(I, VF);
6916 }
6917 case Instruction::BitCast:
6918 if (I->getType()->isPointerTy())
6919 return 0;
6920 [[fallthrough]];
6921 case Instruction::ZExt:
6922 case Instruction::SExt:
6923 case Instruction::FPToUI:
6924 case Instruction::FPToSI:
6925 case Instruction::FPExt:
6926 case Instruction::PtrToInt:
6927 case Instruction::IntToPtr:
6928 case Instruction::SIToFP:
6929 case Instruction::UIToFP:
6930 case Instruction::Trunc:
6931 case Instruction::FPTrunc: {
6932 // Computes the CastContextHint from a Load/Store instruction.
6933 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6934 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6935 "Expected a load or a store!");
6936
6937 if (VF.isScalar() || !TheLoop->contains(I))
6939
6940 switch (getWideningDecision(I, VF)) {
6952 llvm_unreachable("Instr did not go through cost modelling?");
6955 llvm_unreachable_internal("Instr has invalid widening decision");
6956 }
6957
6958 llvm_unreachable("Unhandled case!");
6959 };
6960
6961 unsigned Opcode = I->getOpcode();
6963 // For Trunc, the context is the only user, which must be a StoreInst.
6964 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6965 if (I->hasOneUse())
6966 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6967 CCH = ComputeCCH(Store);
6968 }
6969 // For Z/Sext, the context is the operand, which must be a LoadInst.
6970 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6971 Opcode == Instruction::FPExt) {
6972 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6973 CCH = ComputeCCH(Load);
6974 }
6975
6976 // We optimize the truncation of induction variables having constant
6977 // integer steps. The cost of these truncations is the same as the scalar
6978 // operation.
6979 if (isOptimizableIVTruncate(I, VF)) {
6980 auto *Trunc = cast<TruncInst>(I);
6981 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6982 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6983 }
6984
6985 // Detect reduction patterns
6986 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6987 return *RedCost;
6988
6989 Type *SrcScalarTy = I->getOperand(0)->getType();
6990 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6991 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6992 SrcScalarTy =
6993 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6994 Type *SrcVecTy =
6995 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6996
6998 // If the result type is <= the source type, there will be no extend
6999 // after truncating the users to the minimal required bitwidth.
7000 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
7001 (I->getOpcode() == Instruction::ZExt ||
7002 I->getOpcode() == Instruction::SExt))
7003 return 0;
7004 }
7005
7006 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7007 }
7008 case Instruction::Call:
7009 return getVectorCallCost(cast<CallInst>(I), VF);
7010 case Instruction::ExtractValue:
7012 case Instruction::Alloca:
7013 // We cannot easily widen alloca to a scalable alloca, as
7014 // the result would need to be a vector of pointers.
7015 if (VF.isScalable())
7017 [[fallthrough]];
7018 default:
7019 // This opcode is unknown. Assume that it is the same as 'mul'.
7020 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7021 } // end of switch.
7022}
7023
7025 // Ignore ephemeral values.
7027
7028 SmallVector<Value *, 4> DeadInterleavePointerOps;
7029 for (BasicBlock *BB : TheLoop->blocks())
7030 for (Instruction &I : *BB) {
7031 // Find all stores to invariant variables. Since they are going to sink
 7032 // outside the loop, we do not need to calculate a cost for them.
7033 StoreInst *SI;
7034 if ((SI = dyn_cast<StoreInst>(&I)) &&
7035 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7036 ValuesToIgnore.insert(&I);
7037
7038 // For interleave groups, we only create a pointer for the start of the
7039 // interleave group. Queue up addresses of group members except the insert
7040 // position for further processing.
7041 if (isAccessInterleaved(&I)) {
7042 auto *Group = getInterleavedAccessGroup(&I);
7043 if (Group->getInsertPos() == &I)
7044 continue;
7045 Value *PointerOp = getLoadStorePointerOperand(&I);
7046 DeadInterleavePointerOps.push_back(PointerOp);
7047 }
7048 }
7049
7050 // Mark ops feeding interleave group members as free, if they are only used
7051 // by other dead computations.
7052 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7053 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7054 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7055 Instruction *UI = cast<Instruction>(U);
7056 return !VecValuesToIgnore.contains(U) &&
7057 (!isAccessInterleaved(UI) ||
7058 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7059 }))
7060 continue;
7061 VecValuesToIgnore.insert(Op);
7062 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7063 }
7064
7065 // Ignore type-promoting instructions we identified during reduction
7066 // detection.
7067 for (const auto &Reduction : Legal->getReductionVars()) {
7068 const RecurrenceDescriptor &RedDes = Reduction.second;
7069 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7070 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7071 }
7072 // Ignore type-casting instructions we identified during induction
7073 // detection.
7074 for (const auto &Induction : Legal->getInductionVars()) {
7075 const InductionDescriptor &IndDes = Induction.second;
7076 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7077 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7078 }
7079}
7080
7082 for (const auto &Reduction : Legal->getReductionVars()) {
7083 PHINode *Phi = Reduction.first;
7084 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7085
7086 // We don't collect reductions that are type promoted (yet).
7087 if (RdxDesc.getRecurrenceType() != Phi->getType())
7088 continue;
7089
7090 // If the target would prefer this reduction to happen "in-loop", then we
7091 // want to record it as such.
7092 unsigned Opcode = RdxDesc.getOpcode();
7093 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7094 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7096 continue;
7097
7098 // Check that we can correctly put the reductions into the loop, by
7099 // finding the chain of operations that leads from the phi to the loop
7100 // exit value.
7101 SmallVector<Instruction *, 4> ReductionOperations =
7102 RdxDesc.getReductionOpChain(Phi, TheLoop);
7103 bool InLoop = !ReductionOperations.empty();
7104
7105 if (InLoop) {
7106 InLoopReductions.insert(Phi);
7107 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7108 Instruction *LastChain = Phi;
7109 for (auto *I : ReductionOperations) {
7110 InLoopReductionImmediateChains[I] = LastChain;
7111 LastChain = I;
7112 }
7113 }
7114 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7115 << " reduction for phi: " << *Phi << "\n");
7116 }
7117}
7118
7120 DebugLoc DL, const Twine &Name) {
7122 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7123 return tryInsertInstruction(
7124 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7125}
7126
7127// This function will select a scalable VF if the target supports scalable
7128// vectors and a fixed one otherwise.
7129// TODO: we could return a pair of values that specify the max VF and
7130// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
 7131 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7132// doesn't have a cost model that can choose which plan to execute if
7133// more than one is generated.
7136 unsigned WidestType;
7137 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7138
7143
7145 unsigned N = RegSize.getKnownMinValue() / WidestType;
7146 return ElementCount::get(N, RegSize.isScalable());
7147}
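// Worked example, assuming a 128-bit vector register and a widest scalar type
// of 32 bits: N = 128 / 32 = 4, so the returned VF is 4 for fixed-width
// registers, or vscale x 4 when the register size is scalable.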
7148
7151 ElementCount VF = UserVF;
7152 // Outer loop handling: They may require CFG and instruction level
7153 // transformations before even evaluating whether vectorization is profitable.
7154 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7155 // the vectorization pipeline.
7156 if (!OrigLoop->isInnermost()) {
7157 // If the user doesn't provide a vectorization factor, determine a
7158 // reasonable one.
7159 if (UserVF.isZero()) {
7160 VF = determineVPlanVF(TTI, CM);
7161 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7162
7163 // Make sure we have a VF > 1 for stress testing.
7164 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7165 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7166 << "overriding computed VF.\n");
7167 VF = ElementCount::getFixed(4);
7168 }
7169 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7172 << "not supported by the target.\n");
7174 "Scalable vectorization requested but not supported by the target",
7175 "the scalable user-specified vectorization width for outer-loop "
7176 "vectorization cannot be used because the target does not support "
7177 "scalable vectors.",
7178 "ScalableVFUnfeasible", ORE, OrigLoop);
7180 }
7181 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7183 "VF needs to be a power of two");
7184 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7185 << "VF " << VF << " to build VPlans.\n");
7186 buildVPlans(VF, VF);
7187
7188 // For VPlan build stress testing, we bail out after VPlan construction.
7191
7192 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7193 }
7194
7195 LLVM_DEBUG(
7196 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7197 "VPlan-native path.\n");
7199}
7200
7201std::optional<VectorizationFactor>
7203 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7206
7207 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
 7208 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7209 return std::nullopt;
7210
7211 // Invalidate interleave groups if all blocks of loop will be predicated.
7212 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7214 LLVM_DEBUG(
7215 dbgs()
7216 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7217 "which requires masked-interleaved support.\n");
7219 // Invalidating interleave groups also requires invalidating all decisions
7220 // based on them, which includes widening decisions and uniform and scalar
7221 // values.
7223 }
7224
7225 if (CM.foldTailByMasking())
7227
7228 ElementCount MaxUserVF =
7229 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7230 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7231 if (!UserVF.isZero() && UserVFIsLegal) {
7233 "VF needs to be a power of two");
7234 // Collect the instructions (and their associated costs) that will be more
7235 // profitable to scalarize.
7237 if (CM.selectUserVectorizationFactor(UserVF)) {
7238 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7239 buildVPlansWithVPRecipes(UserVF, UserVF);
7240 if (!hasPlanWithVF(UserVF)) {
7241 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7242 << ".\n");
7243 return std::nullopt;
7244 }
7245
7247 return {{UserVF, 0, 0}};
7248 } else
7249 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7250 "InvalidCost", ORE, OrigLoop);
7251 }
7252
7253 // Collect the Vectorization Factor Candidates.
7254 SmallVector<ElementCount> VFCandidates;
7255 for (auto VF = ElementCount::getFixed(1);
7256 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7257 VFCandidates.push_back(VF);
7258 for (auto VF = ElementCount::getScalable(1);
7259 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7260 VFCandidates.push_back(VF);
7261
7263 for (const auto &VF : VFCandidates) {
7264 // Collect Uniform and Scalar instructions after vectorization with VF.
7266
7267 // Collect the instructions (and their associated costs) that will be more
7268 // profitable to scalarize.
7269 if (VF.isVector())
7271 }
7272
7273 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7274 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7275
7277 if (VPlans.empty())
7278 return std::nullopt;
7279 if (all_of(VPlans,
7280 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
7282
7283 // Select the optimal vectorization factor according to the legacy cost-model.
7284 // This is now only used to verify the decisions by the new VPlan-based
7285 // cost-model and will be retired once the VPlan-based cost-model is
7286 // stabilized.
7287 VectorizationFactor VF = selectVectorizationFactor();
7288 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7289 if (!hasPlanWithVF(VF.Width)) {
7290 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7291 << ".\n");
7292 return std::nullopt;
7293 }
7294 return VF;
7295}
7296
7298 ElementCount VF) const {
7299 return CM.getInstructionCost(UI, VF);
7300}
7301
7302bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7303 return CM.ValuesToIgnore.contains(UI) ||
7304 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7305 SkipCostComputation.contains(UI);
7306}
7307
7308InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7309 ElementCount VF) const {
7311 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7312 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7313
7314 // Cost modeling for inductions is inaccurate in the legacy cost model
7315 // compared to the recipes that are generated. To match here initially during
7316 // VPlan cost model bring up directly use the induction costs from the legacy
7317 // cost model. Note that we do this as pre-processing; the VPlan may not have
7318 // any recipes associated with the original induction increment instruction
7319 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7320 // the cost of induction phis and increments (both that are represented by
7321 // recipes and those that are not), to avoid distinguishing between them here,
7322 // and skip all recipes that represent induction phis and increments (the
7323 // former case) later on, if they exist, to avoid counting them twice.
7324 // Similarly we pre-compute the cost of any optimized truncates.
7325 // TODO: Switch to more accurate costing based on VPlan.
7326 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7327 Instruction *IVInc = cast<Instruction>(
7328 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7329 SmallVector<Instruction *> IVInsts = {IV, IVInc};
7330 for (User *U : IV->users()) {
7331 auto *CI = cast<Instruction>(U);
7332 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7333 continue;
7334 IVInsts.push_back(CI);
7335 }
7336 for (Instruction *IVInst : IVInsts) {
7337 if (!CostCtx.SkipCostComputation.insert(IVInst).second)
7338 continue;
7339 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7340 LLVM_DEBUG({
7341 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7342 << ": induction instruction " << *IVInst << "\n";
7343 });
7344 Cost += InductionCost;
7345 }
7346 }
7347
7348 /// Compute the cost of all exiting conditions of the loop using the legacy
7349 /// cost model. This is to match the legacy behavior, which adds the cost of
7350 /// all exit conditions. Note that this over-estimates the cost, as there will
7351 /// be a single condition to control the vector loop.
7353 CM.TheLoop->getExitingBlocks(Exiting);
7354 SetVector<Instruction *> ExitInstrs;
7355 // Collect all exit conditions.
7356 for (BasicBlock *EB : Exiting) {
7357 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7358 if (!Term)
7359 continue;
7360 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7361 ExitInstrs.insert(CondI);
7362 }
7363 }
7364 // Compute the cost of all instructions only feeding the exit conditions.
7365 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7366 Instruction *CondI = ExitInstrs[I];
7367 if (!OrigLoop->contains(CondI) ||
7368 !CostCtx.SkipCostComputation.insert(CondI).second)
7369 continue;
7370 Cost += CostCtx.getLegacyCost(CondI, VF);
7371 for (Value *Op : CondI->operands()) {
7372 auto *OpI = dyn_cast<Instruction>(Op);
7373 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7374 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7375 !ExitInstrs.contains(cast<Instruction>(U));
7376 }))
7377 continue;
7378 ExitInstrs.insert(OpI);
7379 }
7380 }
7381
7382 // The legacy cost model has special logic to compute the cost of in-loop
7383 // reductions, which may be smaller than the sum of all instructions involved
7384 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7385 // which the legacy cost model uses to assign cost. Pre-compute their costs
7386 // for now.
7387 // TODO: Switch to costing based on VPlan once the logic has been ported.
7388 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7389 if (!CM.isInLoopReduction(RedPhi) &&
7391 RdxDesc.getRecurrenceKind()))
7392 continue;
7393
7394 // AnyOf reduction codegen may remove the select. To match the legacy cost
7395 // model, pre-compute the cost for AnyOf reductions here.
7397 RdxDesc.getRecurrenceKind())) {
7398 auto *Select = cast<SelectInst>(*find_if(
7399 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7400 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7401 "reduction op visited multiple times");
7402 CostCtx.SkipCostComputation.insert(Select);
7403 auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7404 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7405 << ":\n any-of reduction " << *Select << "\n");
7406 Cost += ReductionCost;
7407 continue;
7408 }
7409
7410 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7411 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7412 ChainOps.end());
7413 // Also include the operands of instructions in the chain, as the cost-model
7414 // may mark extends as free.
7415 for (auto *ChainOp : ChainOps) {
7416 for (Value *Op : ChainOp->operands()) {
7417 if (auto *I = dyn_cast<Instruction>(Op))
7418 ChainOpsAndOperands.insert(I);
7419 }
7420 }
7421
7422 // Pre-compute the cost for I, if it has a reduction pattern cost.
7423 for (Instruction *I : ChainOpsAndOperands) {
7424 auto ReductionCost = CM.getReductionPatternCost(
7425 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7426 if (!ReductionCost)
7427 continue;
7428
7429 assert(!CostCtx.SkipCostComputation.contains(I) &&
7430 "reduction op visited multiple times");
7431 CostCtx.SkipCostComputation.insert(I);
7432 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7433 << ":\n in-loop reduction " << *I << "\n");
7434 Cost += *ReductionCost;
7435 }
7436 }
7437
7438 // Pre-compute the costs for branches except for the backedge, as the number
7439 // of replicate regions in a VPlan may not directly match the number of
7440 // branches, which would lead to different decisions.
7441 // TODO: Compute cost of branches for each replicate region in the VPlan,
7442 // which is more accurate than the legacy cost model.
7443 for (BasicBlock *BB : OrigLoop->blocks()) {
7444 if (BB == OrigLoop->getLoopLatch())
7445 continue;
7446 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7447 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7448 Cost += BranchCost;
7449 }
7450 // Now compute and add the VPlan-based cost.
7451 Cost += Plan.cost(VF, CostCtx);
7452 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7453 return Cost;
7454}
7455
7457 // If there is a single VPlan with a single VF, return it directly.
7458 VPlan &FirstPlan = *VPlans[0];
7459 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7460 return FirstPlan;
7461
7462 VPlan *BestPlan = &FirstPlan;
7464 assert(hasPlanWithVF(ScalarVF) &&
7465 "More than a single plan/VF w/o any plan having scalar VF");
7466
7467 // TODO: Compute scalar cost using VPlan-based cost model.
7468 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7469 VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7470
7471 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7472 if (ForceVectorization) {
7473 // Ignore scalar width, because the user explicitly wants vectorization.
7474 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7475 // evaluation.
7476 BestFactor.Cost = InstructionCost::getMax();
7477 }
7478
7479 for (auto &P : VPlans) {
7480 for (ElementCount VF : P->vectorFactors()) {
7481 if (VF.isScalar())
7482 continue;
7483 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7484 LLVM_DEBUG(
7485 dbgs()
7486 << "LV: Not considering vector loop of width " << VF
7487 << " because it will not generate any vector instructions.\n");
7488 continue;
7489 }
7490
7491 InstructionCost Cost = cost(*P, VF);
7492 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7493 if (isMoreProfitable(CurrentFactor, BestFactor)) {
7494 BestFactor = CurrentFactor;
7495 BestPlan = &*P;
7496 }
7497 }
7498 }
7499 BestPlan->setVF(BestFactor.Width);
7500 return *BestPlan;
7501}
7502
7504 assert(count_if(VPlans,
7505 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7506 1 &&
7507 "Best VF has not a single VPlan.");
7508
7509 for (const VPlanPtr &Plan : VPlans) {
7510 if (Plan->hasVF(VF))
7511 return *Plan.get();
7512 }
7513 llvm_unreachable("No plan found!");
7514}
7515
7518 // Reserve first location for self reference to the LoopID metadata node.
7519 MDs.push_back(nullptr);
7520 bool IsUnrollMetadata = false;
7521 MDNode *LoopID = L->getLoopID();
7522 if (LoopID) {
7523 // First find existing loop unrolling disable metadata.
7524 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7525 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7526 if (MD) {
7527 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7528 IsUnrollMetadata =
7529 S && S->getString().starts_with("llvm.loop.unroll.disable");
7530 }
7531 MDs.push_back(LoopID->getOperand(i));
7532 }
7533 }
7534
7535 if (!IsUnrollMetadata) {
7536 // Add runtime unroll disable metadata.
7537 LLVMContext &Context = L->getHeader()->getContext();
7538 SmallVector<Metadata *, 1> DisableOperands;
7539 DisableOperands.push_back(
7540 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7541 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7542 MDs.push_back(DisableNode);
7543 MDNode *NewLoopID = MDNode::get(Context, MDs);
7544 // Set operand 0 to refer to the loop id itself.
7545 NewLoopID->replaceOperandWith(0, NewLoopID);
7546 L->setLoopID(NewLoopID);
7547 }
7548}
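// Illustrative result, assuming the loop had no prior metadata: the loop ID
// afterwards looks like
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// with operand 0 referring back to the node itself, as set above.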
7549
7550// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7551// create a merge phi node for it and add it to \p ReductionResumeValues.
7553 VPInstruction *RedResult,
7555 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7556 bool VectorizingEpilogue) {
7557 if (!RedResult ||
7559 return;
7560
7561 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7562 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7563
7564 Value *FinalValue =
7565 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7566 auto *ResumePhi =
7567 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7568 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7569 RdxDesc.getRecurrenceKind())) {
7570 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
7571 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7572 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7573 ResumePhi = cast<PHINode>(Cmp->getOperand(0));
7574 }
7575 assert((!VectorizingEpilogue || ResumePhi) &&
7576 "when vectorizing the epilogue loop, we need a resume phi from main "
7577 "vector loop");
7578
7579 // TODO: bc.merge.rdx should not be created here, instead it should be
7580 // modeled in VPlan.
7581 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7582 // Create a phi node that merges control-flow from the backedge-taken check
7583 // block and the middle block.
7584 auto *BCBlockPhi =
7585 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7586 LoopScalarPreHeader->getTerminator()->getIterator());
7587
7588 // If we are fixing reductions in the epilogue loop then we should already
7589 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7590 // we carry over the incoming values correctly.
7591 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7592 if (Incoming == LoopMiddleBlock)
7593 BCBlockPhi->addIncoming(FinalValue, Incoming);
7594 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7595 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7596 Incoming);
7597 else
7598 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
7599 }
7600
7601 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7602 // TODO: This fixup should instead be modeled in VPlan.
7603 // Fix the scalar loop reduction variable with the incoming reduction sum
7604 // from the vector body and from the backedge value.
7605 int IncomingEdgeBlockIdx =
7606 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7607 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7608 // Pick the other block.
7609 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7610 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7611 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7612 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7613
7614 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7615}
7616
7617std::pair<DenseMap<const SCEV *, Value *>,
7620 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7621 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7622 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7623 assert(BestVPlan.hasVF(BestVF) &&
7624 "Trying to execute plan with unsupported VF");
7625 assert(BestVPlan.hasUF(BestUF) &&
7626 "Trying to execute plan with unsupported UF");
7627 assert(
7628 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7629 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7630 (void)IsEpilogueVectorization;
7631
7632 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7633
7634 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7635 << ", UF=" << BestUF << '\n');
7636 BestVPlan.setName("Final VPlan");
7637 LLVM_DEBUG(BestVPlan.dump());
7638
7639 // Perform the actual loop transformation.
7640 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7641 OrigLoop->getHeader()->getContext());
7642
7643 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7644 // before making any changes to the CFG.
7645 if (!BestVPlan.getPreheader()->empty()) {
7646 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7648 BestVPlan.getPreheader()->execute(&State);
7649 }
7650 if (!ILV.getTripCount())
7651 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7652 else
7653 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7654 "count during epilogue vectorization");
7655
7656 // 1. Set up the skeleton for vectorization, including vector pre-header and
7657 // middle block. The vector loop is created during VPlan execution.
7658 Value *CanonicalIVStartValue;
7659 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7660 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7661 : State.ExpandedSCEVs);
7662#ifdef EXPENSIVE_CHECKS
7663 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7664#endif
7665
7666 // Only use noalias metadata when using memory checks guaranteeing no overlap
7667 // across all iterations.
7668 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7669 std::unique_ptr<LoopVersioning> LVer = nullptr;
7670 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7671       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7672 
7673 // We currently don't use LoopVersioning for the actual loop cloning but we
7674 // still use it to add the noalias metadata.
7675 // TODO: Find a better way to re-use LoopVersioning functionality to add
7676 // metadata.
7677 LVer = std::make_unique<LoopVersioning>(
7678 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7679 PSE.getSE());
7680 State.LVer = &*LVer;
7681     State.LVer->prepareNoAliasMetadata();
7682   }
7683
7684   ILV.printDebugTracesAtStart();
7685 
7686 //===------------------------------------------------===//
7687 //
7688 // Notice: any optimization or new instruction that goes
7689 // into the code below should also be implemented in
7690 // the cost-model.
7691 //
7692 //===------------------------------------------------===//
7693
7694 // 2. Copy and widen instructions from the old loop into the new loop.
7695 BestVPlan.prepareToExecute(ILV.getTripCount(),
7696 ILV.getOrCreateVectorTripCount(nullptr),
7697 CanonicalIVStartValue, State);
7698
7699 BestVPlan.execute(&State);
7700
7701 // 2.5 Collect reduction resume values.
7702   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7703   auto *ExitVPBB =
7704 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7705 for (VPRecipeBase &R : *ExitVPBB) {
7706     createAndCollectMergePhiForReduction(
7707         dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7708 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7709 }
7710
7711 // 2.6. Maintain Loop Hints
7712 // Keep all loop hints from the original loop on the vector loop (we'll
7713 // replace the vectorizer-specific hints below).
7714 MDNode *OrigLoopID = OrigLoop->getLoopID();
7715
7716   std::optional<MDNode *> VectorizedLoopID =
7717       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7718                                       LLVMLoopVectorizeFollowupVectorized});
7719 
7720   VPBasicBlock *HeaderVPBB =
7721       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7722   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7723 if (VectorizedLoopID)
7724 L->setLoopID(*VectorizedLoopID);
7725 else {
7726 // Keep all loop hints from the original loop on the vector loop (we'll
7727 // replace the vectorizer-specific hints below).
7728 if (MDNode *LID = OrigLoop->getLoopID())
7729 L->setLoopID(LID);
7730
7731 LoopVectorizeHints Hints(L, true, *ORE);
7732 Hints.setAlreadyVectorized();
7733 }
7734   TargetTransformInfo::UnrollingPreferences UP;
7735   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7736   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7737     addRuntimeUnrollDisableMetaData(L);
7738 
7739 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7740 // predication, updating analyses.
7741 ILV.fixVectorizedLoop(State, BestVPlan);
7742
7743   ILV.printDebugTracesAtEnd();
7744 
7745 // 4. Adjust branch weight of the branch in the middle block.
7746 auto *MiddleTerm =
7747 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7748 if (MiddleTerm->isConditional() &&
7749 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7750 // Assume that `Count % VectorTripCount` is equally distributed.
7751 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7752 assert(TripCount > 0 && "trip count should not be zero");
7753 const uint32_t Weights[] = {1, TripCount - 1};
7754 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7755 }
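  // Hedged worked example of the heuristic above: with VF=4 and UF=2 the
  // assumed step is 8, giving weights {1, 7}; the remainder Count % 8 is
  // treated as uniformly distributed, so roughly 1 in 8 executions is expected
  // to skip the scalar remainder loop. This only restates the assumption
  // encoded above, it is not a measured profile.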
7756
7757 return {State.ExpandedSCEVs, ReductionResumeValues};
7758}
7759
7760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7761 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7762   for (const auto &Plan : VPlans)
7763     if (PrintVPlansInDotFormat)
7764       Plan->printDOT(O);
7765 else
7766 Plan->print(O);
7767}
7768#endif
7769
7770//===--------------------------------------------------------------------===//
7771// EpilogueVectorizerMainLoop
7772//===--------------------------------------------------------------------===//
7773
7774/// This function is partially responsible for generating the control flow
7775/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7776 std::pair<BasicBlock *, Value *>
7777 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7778     const SCEV2ValueTy &ExpandedSCEVs) {
7779   createVectorLoopSkeleton("");
7780 
7781 // Generate the code to check the minimum iteration count of the vector
7782 // epilogue (see below).
7783   EPI.EpilogueIterationCountCheck =
7784       emitIterationCountCheck(LoopScalarPreHeader, true);
7785   EPI.EpilogueIterationCountCheck->setName("iter.check");
7786 
7787 // Generate the code to check any assumptions that we've made for SCEV
7788 // expressions.
7789   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7790 
7791 // Generate the code that checks at runtime if arrays overlap. We put the
7792 // checks into a separate block to make the more common case of few elements
7793 // faster.
7794   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7795 
7796 // Generate the iteration count check for the main loop, *after* the check
7797 // for the epilogue loop, so that the path-length is shorter for the case
7798 // that goes directly through the vector epilogue. The longer-path length for
7799 // the main loop is compensated for, by the gain from vectorizing the larger
7800 // trip count. Note: the branch will get updated later on when we vectorize
7801 // the epilogue.
7802   EPI.MainLoopIterationCountCheck =
7803       emitIterationCountCheck(LoopScalarPreHeader, false);
7804 
7805 // Generate the induction variable.
7806   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7807 
7808 // Skip induction resume value creation here because they will be created in
7809 // the second pass for the scalar loop. The induction resume values for the
7810 // inductions in the epilogue loop are created before executing the plan for
7811 // the epilogue loop.
7812
7813 return {LoopVectorPreHeader, nullptr};
7814}
7815
7816 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7817   LLVM_DEBUG({
7818 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7819 << "Main Loop VF:" << EPI.MainLoopVF
7820 << ", Main Loop UF:" << EPI.MainLoopUF
7821 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7822 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7823 });
7824}
7825
7826 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7827   LLVM_DEBUG({
7828     dbgs() << "intermediate fn:\n"
7829 << *OrigLoop->getHeader()->getParent() << "\n";
7830 });
7831}
7832
7833BasicBlock *
7834 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7835                                                     bool ForEpilogue) {
7836 assert(Bypass && "Expected valid bypass basic block.");
7837 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7838 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7839 Value *Count = getTripCount();
7840 // Reuse existing vector loop preheader for TC checks.
7841 // Note that new preheader block is generated for vector loop.
7842 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7843 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7844
7845 // Generate code to check if the loop's trip count is less than VF * UF of the
7846 // main vector loop.
7847 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7848 : VF.isVector())
7849                ? ICmpInst::ICMP_ULE
7850                : ICmpInst::ICMP_ULT;
7851 
7852 Value *CheckMinIters = Builder.CreateICmp(
7853 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7854 "min.iters.check");
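  // For instance (hedged, constants illustrative): with fixed VF=4 and UF=2
  // the check above is morally
  //   %min.iters.check = icmp ult i64 %count, 8
  // while for scalable VFs createStepForVF materialises a vscale-based step.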
7855
7856 if (!ForEpilogue)
7857 TCCheckBlock->setName("vector.main.loop.iter.check");
7858
7859 // Create new preheader for vector loop.
7860 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7861 DT, LI, nullptr, "vector.ph");
7862
7863 if (ForEpilogue) {
7864 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7865 DT->getNode(Bypass)->getIDom()) &&
7866 "TC check is expected to dominate Bypass");
7867
7868 // Update dominator for Bypass.
7869 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7870 LoopBypassBlocks.push_back(TCCheckBlock);
7871
7872 // Save the trip count so we don't have to regenerate it in the
7873 // vec.epilog.iter.check. This is safe to do because the trip count
7874 // generated here dominates the vector epilog iter check.
7875 EPI.TripCount = Count;
7876 }
7877
7878 BranchInst &BI =
7879 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7880   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7881     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7882 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7883
7884 return TCCheckBlock;
7885}
7886
7887//===--------------------------------------------------------------------===//
7888// EpilogueVectorizerEpilogueLoop
7889//===--------------------------------------------------------------------===//
7890
7891/// This function is partially responsible for generating the control flow
7892/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7893std::pair<BasicBlock *, Value *>
7894 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7895     const SCEV2ValueTy &ExpandedSCEVs) {
7896 createVectorLoopSkeleton("vec.epilog.");
7897
7898 // Now, compare the remaining count and if there aren't enough iterations to
7899 // execute the vectorized epilogue skip to the scalar part.
7900 LoopVectorPreHeader->setName("vec.epilog.ph");
7901 BasicBlock *VecEpilogueIterationCountCheck =
7902       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
7903                  nullptr, "vec.epilog.iter.check", true);
7904   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7905                                           VecEpilogueIterationCountCheck);
7906
7907 // Adjust the control flow taking the state info from the main loop
7908 // vectorization into account.
7909   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7910          "expected this to be saved from the previous pass.");
7911   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7912       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7913 
7914   DT->changeImmediateDominator(LoopVectorPreHeader,
7915                                EPI.MainLoopIterationCountCheck);
7916 
7917   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7918       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7919
7920 if (EPI.SCEVSafetyCheck)
7921     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7922         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7923 if (EPI.MemSafetyCheck)
7924     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7925         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7926
7927   DT->changeImmediateDominator(
7928       VecEpilogueIterationCountCheck,
7929 VecEpilogueIterationCountCheck->getSinglePredecessor());
7930
7931   DT->changeImmediateDominator(LoopScalarPreHeader,
7932                                EPI.EpilogueIterationCountCheck);
7933   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7934     // If there is an epilogue which must run, there's no edge from the
7935     // middle block to exit blocks and thus no need to update the immediate
7936     // dominator of the exit blocks.
7937     DT->changeImmediateDominator(LoopExitBlock,
7938                                  EPI.EpilogueIterationCountCheck);
7939 
7940 // Keep track of bypass blocks, as they feed start values to the induction and
7941 // reduction phis in the scalar loop preheader.
7942   if (EPI.SCEVSafetyCheck)
7943     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7944   if (EPI.MemSafetyCheck)
7945     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7946   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7947 
7948 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7949 // reductions which merge control-flow from the latch block and the middle
7950 // block. Update the incoming values here and move the Phi into the preheader.
7951 SmallVector<PHINode *, 4> PhisInBlock;
7952 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7953 PhisInBlock.push_back(&Phi);
7954
7955 for (PHINode *Phi : PhisInBlock) {
7956 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7957 Phi->replaceIncomingBlockWith(
7958 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7959 VecEpilogueIterationCountCheck);
7960
7961 // If the phi doesn't have an incoming value from the
7962 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7963 // value and also those from other check blocks. This is needed for
7964 // reduction phis only.
7965 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7966 return EPI.EpilogueIterationCountCheck == IncB;
7967 }))
7968 continue;
7969 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7970 if (EPI.SCEVSafetyCheck)
7971 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7972 if (EPI.MemSafetyCheck)
7973 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7974 }
7975
7976 // Generate a resume induction for the vector epilogue and put it in the
7977 // vector epilogue preheader
7978 Type *IdxTy = Legal->getWidestInductionType();
7979 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7981 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7982 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7983                            EPI.MainLoopIterationCountCheck);
7984 
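  // Sketch of the resulting IR (illustrative only):
  //   %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ],
  //                                    [ 0, %vector.main.loop.iter.check ]
  // i.e. the epilogue loop resumes after the main vector loop's trip count, or
  // from 0 when the main vector loop was skipped.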
7985 // Generate induction resume values. These variables save the new starting
7986 // indexes for the scalar loop. They are used to test if there are any tail
7987 // iterations left once the vector loop has completed.
7988 // Note that when the vectorized epilogue is skipped due to iteration count
7989 // check, then the resume value for the induction variable comes from
7990 // the trip count of the main vector loop, hence passing the AdditionalBypass
7991 // argument.
7992 createInductionResumeValues(ExpandedSCEVs,
7993 {VecEpilogueIterationCountCheck,
7994 EPI.VectorTripCount} /* AdditionalBypass */);
7995
7996 return {LoopVectorPreHeader, EPResumeVal};
7997}
7998
7999 BasicBlock *
8000 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8001     BasicBlock *Bypass, BasicBlock *Insert) {
8002 
8003   assert(EPI.TripCount &&
8004          "Expected trip count to have been saved in the first pass.");
8005 assert(
8006 (!isa<Instruction>(EPI.TripCount) ||
8007 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8008 "saved trip count does not dominate insertion point.");
8009 Value *TC = EPI.TripCount;
8010 IRBuilder<> Builder(Insert->getTerminator());
8011 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8012
8013 // Generate code to check if the loop's trip count is less than VF * UF of the
8014 // vector epilogue loop.
8015   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8016                ? ICmpInst::ICMP_ULE
8017                : ICmpInst::ICMP_ULT;
8018 
8019   Value *CheckMinIters =
8020       Builder.CreateICmp(P, Count,
8021                          createStepForVF(Builder, Count->getType(),
8022                                          EPI.EpilogueVF, EPI.EpilogueUF),
8023                          "min.epilog.iters.check");
8024
8025 BranchInst &BI =
8026 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8027   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
8028     unsigned MainLoopStep = UF * VF.getKnownMinValue();
8029     unsigned EpilogueLoopStep =
8030         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
8031 // We assume the remaining `Count` is equally distributed in
8032 // [0, MainLoopStep)
8033 // So the probability for `Count < EpilogueLoopStep` should be
8034 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8035 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8036 const uint32_t Weights[] = {EstimatedSkipCount,
8037 MainLoopStep - EstimatedSkipCount};
8038 setBranchWeights(BI, Weights, /*IsExpected=*/false);
8039 }
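  // Hedged example of the weight computation above: with a main-loop step of 8
  // (VF=4, UF=2) and an epilogue step of 4 (VF=4, UF=1), EstimatedSkipCount is
  // min(8, 4) = 4 and the weights become {4, 4}, i.e. the epilogue is assumed
  // to be skipped about half of the time.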
8040 ReplaceInstWithInst(Insert->getTerminator(), &BI);
8041 LoopBypassBlocks.push_back(Insert);
8042 return Insert;
8043}
8044
8045 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8046   LLVM_DEBUG({
8047 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8048 << "Epilogue Loop VF:" << EPI.EpilogueVF
8049 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8050 });
8051}
8052
8053 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8054   LLVM_DEBUG({
8055     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8056 });
8057}
8058
8059 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8060     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8061 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8062 bool PredicateAtRangeStart = Predicate(Range.Start);
8063
8064 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
8065 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8066 Range.End = TmpVF;
8067 break;
8068 }
8069
8070 return PredicateAtRangeStart;
8071}
8072
8073/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8074/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8075/// of VF's starting at a given VF and extending it as much as possible. Each
8076/// vectorization decision can potentially shorten this sub-range during
8077/// buildVPlan().
8078 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8079                                            ElementCount MaxVF) {
8080 auto MaxVFTimes2 = MaxVF * 2;
8081 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8082 VFRange SubRange = {VF, MaxVFTimes2};
8083 VPlans.push_back(buildVPlan(SubRange));
8084 VF = SubRange.End;
8085 }
8086}
8087
8088iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8089 VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
8090   std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8091 if (auto *I = dyn_cast<Instruction>(Op)) {
8092 if (auto *R = Ingredient2Recipe.lookup(I))
8093 return R->getVPSingleValue();
8094 }
8095 return Plan.getOrAddLiveIn(Op);
8096 };
8097 return map_range(Operands, Fn);
8098}
8099
8100 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
8101   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8102
8103 // Look for cached value.
8104 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8105 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8106 if (ECEntryIt != EdgeMaskCache.end())
8107 return ECEntryIt->second;
8108
8109 VPValue *SrcMask = getBlockInMask(Src);
8110
8111 // The terminator has to be a branch inst!
8112 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8113 assert(BI && "Unexpected terminator found");
8114
8115 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8116 return EdgeMaskCache[Edge] = SrcMask;
8117
8118 // If source is an exiting block, we know the exit edge is dynamically dead
8119 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8120 // adding uses of an otherwise potentially dead instruction.
8121 if (OrigLoop->isLoopExiting(Src))
8122 return EdgeMaskCache[Edge] = SrcMask;
8123
8124 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan);
8125 assert(EdgeMask && "No Edge Mask found for condition");
8126
8127 if (BI->getSuccessor(0) != Dst)
8128 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8129
8130 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8131 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8132 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8133 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8134 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8135 }
8136
8137 return EdgeMaskCache[Edge] = EdgeMask;
8138}
8139
8140 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8141   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8142
8143 // Look for cached value.
8144 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8145 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8146 assert(ECEntryIt != EdgeMaskCache.end() &&
8147 "looking up mask for edge which has not been created");
8148 return ECEntryIt->second;
8149}
8150
8151 void VPRecipeBuilder::createHeaderMask() {
8152   BasicBlock *Header = OrigLoop->getHeader();
8153
8154 // When not folding the tail, use nullptr to model all-true mask.
8155 if (!CM.foldTailByMasking()) {
8156 BlockMaskCache[Header] = nullptr;
8157 return;
8158 }
8159
8160 // Introduce the early-exit compare IV <= BTC to form header block mask.
8161 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8162 // constructing the desired canonical IV in the header block as its first
8163 // non-phi instructions.
8164
8165 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8166 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8167 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8168 HeaderVPBB->insert(IV, NewInsertionPoint);
8169
8170 VPBuilder::InsertPointGuard Guard(Builder);
8171 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8172 VPValue *BlockMask = nullptr;
8173   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8174   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8175 BlockMaskCache[Header] = BlockMask;
8176}
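// Sketch of the header mask when folding the tail (illustrative only):
//   %wide.iv     = widen-canonical-iv %canonical.iv
//   %header.mask = icmp ule %wide.iv, %backedge.taken.count
// so lanes whose IV exceeds BTC are masked off in the final iteration.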
8177
8178 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8179   // Return the cached value.
8180 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8181 assert(BCEntryIt != BlockMaskCache.end() &&
8182 "Trying to access mask for block without one.");
8183 return BCEntryIt->second;
8184}
8185
8186 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8187   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8188 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8189 assert(OrigLoop->getHeader() != BB &&
8190 "Loop header must have cached block mask");
8191
8192 // All-one mask is modelled as no-mask following the convention for masked
8193 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8194 VPValue *BlockMask = nullptr;
8195 // This is the block mask. We OR all incoming edges.
8196 for (auto *Predecessor : predecessors(BB)) {
8197 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8198 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8199 BlockMaskCache[BB] = EdgeMask;
8200 return;
8201 }
8202
8203 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8204 BlockMask = EdgeMask;
8205 continue;
8206 }
8207
8208 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8209 }
8210
8211 BlockMaskCache[BB] = BlockMask;
8212}
8213
8214 VPWidenMemoryRecipe *
8215 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8216 VFRange &Range) {
8217 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8218 "Must be called with either a load or store");
8219
8220   auto willWiden = [&](ElementCount VF) -> bool {
8221     LoopVectorizationCostModel::InstWidening Decision =
8222         CM.getWideningDecision(I, VF);
8223     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8224            "CM decision should be taken at this point.");
8225     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8226       return true;
8227     if (CM.isScalarAfterVectorization(I, VF) ||
8228         CM.isProfitableToScalarize(I, VF))
8229       return false;
8230     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8231   };
8232
8233   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8234     return nullptr;
8235
8236 VPValue *Mask = nullptr;
8237 if (Legal->isMaskRequired(I))
8238 Mask = getBlockInMask(I->getParent());
8239
8240 // Determine if the pointer operand of the access is either consecutive or
8241 // reverse consecutive.
8242   LoopVectorizationCostModel::InstWidening Decision =
8243       CM.getWideningDecision(I, Range.Start);
8244   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8245   bool Consecutive =
8246       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8247 
8248 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8249 if (Consecutive) {
8250 auto *GEP = dyn_cast<GetElementPtrInst>(
8251 Ptr->getUnderlyingValue()->stripPointerCasts());
8252 auto *VectorPtr = new VPVectorPointerRecipe(
8253 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8254 I->getDebugLoc());
8255 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8256 Ptr = VectorPtr;
8257 }
8258 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8259 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8260 I->getDebugLoc());
8261
8262 StoreInst *Store = cast<StoreInst>(I);
8263 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8264 Reverse, I->getDebugLoc());
8265}
8266
8267 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8268/// insert a recipe to expand the step for the induction recipe.
8269 static VPWidenIntOrFpInductionRecipe *
8270 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8271                             VPValue *Start, const InductionDescriptor &IndDesc,
8272 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8273 assert(IndDesc.getStartValue() ==
8274 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8275 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8276 "step must be loop invariant");
8277
8278   VPValue *Step =
8279       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8280 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8281 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8282 }
8283 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8284 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8285}
8286
8287VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8288     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8289 
8290 // Check if this is an integer or fp induction. If so, build the recipe that
8291 // produces its scalar and vector values.
8292 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8293 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8294 *PSE.getSE(), *OrigLoop);
8295
8296 // Check if this is pointer induction. If so, build the recipe for it.
8297 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8298 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8299 *PSE.getSE());
8300     return new VPWidenPointerInductionRecipe(
8301         Phi, Operands[0], Step, *II,
8302         LoopVectorizationPlanner::getDecisionAndClampRange(
8303             [&](ElementCount VF) {
8304 return CM.isScalarAfterVectorization(Phi, VF);
8305 },
8306 Range));
8307 }
8308 return nullptr;
8309}
8310
8311VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8312     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8313   // Optimize the special case where the source is a constant integer
8314 // induction variable. Notice that we can only optimize the 'trunc' case
8315 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8316 // (c) other casts depend on pointer size.
8317
8318 // Determine whether \p K is a truncation based on an induction variable that
8319 // can be optimized.
8320 auto isOptimizableIVTruncate =
8321 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8322 return [=](ElementCount VF) -> bool {
8323 return CM.isOptimizableIVTruncate(K, VF);
8324 };
8325 };
8326
8327   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8328           isOptimizableIVTruncate(I), Range)) {
8329
8330 auto *Phi = cast<PHINode>(I->getOperand(0));
8331     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8332     VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8333 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8334 *OrigLoop);
8335 }
8336 return nullptr;
8337}
8338
8339VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8340                                            ArrayRef<VPValue *> Operands) {
8341   unsigned NumIncoming = Phi->getNumIncomingValues();
8342
8343 // We know that all PHIs in non-header blocks are converted into selects, so
8344 // we don't have to worry about the insertion order and we can just use the
8345 // builder. At this point we generate the predication tree. There may be
8346 // duplications since this is a simple recursive scan, but future
8347 // optimizations will clean it up.
8348 // TODO: At the moment the first mask is always skipped, but it would be
8349 // better to skip the most expensive mask.
8350 SmallVector<VPValue *, 2> OperandsWithMask;
8351
8352 for (unsigned In = 0; In < NumIncoming; In++) {
8353 OperandsWithMask.push_back(Operands[In]);
8354 VPValue *EdgeMask =
8355 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8356 if (!EdgeMask) {
8357 assert(In == 0 && "Both null and non-null edge masks found");
8358       assert(all_equal(Operands) &&
8359              "Distinct incoming values with one having a full mask");
8360 break;
8361 }
8362 if (In == 0)
8363 continue;
8364 OperandsWithMask.push_back(EdgeMask);
8365 }
8366 return new VPBlendRecipe(Phi, OperandsWithMask);
8367}
8368
8369VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8371 VFRange &Range) {
8372   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8373       [this, CI](ElementCount VF) {
8374 return CM.isScalarWithPredication(CI, VF);
8375 },
8376 Range);
8377
8378 if (IsPredicated)
8379 return nullptr;
8380
8381   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8382   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8383 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8384 ID == Intrinsic::pseudoprobe ||
8385 ID == Intrinsic::experimental_noalias_scope_decl))
8386 return nullptr;
8387
8388 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8389 Ops.push_back(Operands.back());
8390
8391 // Is it beneficial to perform intrinsic call compared to lib call?
8392   bool ShouldUseVectorIntrinsic =
8393       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8394       [&](ElementCount VF) -> bool {
8395         return CM.getCallWideningDecision(CI, VF).Kind ==
8396                LoopVectorizationCostModel::CM_IntrinsicCall;
8397       },
8398 Range);
8399 if (ShouldUseVectorIntrinsic)
8400 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
8401 CI->getDebugLoc());
8402
8403 Function *Variant = nullptr;
8404 std::optional<unsigned> MaskPos;
8405   // Is it better to call a vectorized version of the function than to scalarize
8406 // the call?
8407 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8408 [&](ElementCount VF) -> bool {
8409 // The following case may be scalarized depending on the VF.
8410 // The flag shows whether we can use a usual Call for vectorized
8411 // version of the instruction.
8412
8413 // If we've found a variant at a previous VF, then stop looking. A
8414 // vectorized variant of a function expects input in a certain shape
8415 // -- basically the number of input registers, the number of lanes
8416 // per register, and whether there's a mask required.
8417 // We store a pointer to the variant in the VPWidenCallRecipe, so
8418 // once we have an appropriate variant it's only valid for that VF.
8419 // This will force a different vplan to be generated for each VF that
8420 // finds a valid variant.
8421 if (Variant)
8422 return false;
8423         LoopVectorizationCostModel::CallWideningDecision Decision =
8424             CM.getCallWideningDecision(CI, VF);
8425         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8426           Variant = Decision.Variant;
8427 MaskPos = Decision.MaskPos;
8428 return true;
8429 }
8430
8431 return false;
8432 },
8433 Range);
8434 if (ShouldUseVectorCall) {
8435 if (MaskPos.has_value()) {
8436 // We have 2 cases that would require a mask:
8437 // 1) The block needs to be predicated, either due to a conditional
8438 // in the scalar loop or use of an active lane mask with
8439 // tail-folding, and we use the appropriate mask for the block.
8440 // 2) No mask is required for the block, but the only available
8441 // vector variant at this VF requires a mask, so we synthesize an
8442 // all-true mask.
8443 VPValue *Mask = nullptr;
8444 if (Legal->isMaskRequired(CI))
8445 Mask = getBlockInMask(CI->getParent());
8446 else
8447         Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
8448             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8449
8450 Ops.insert(Ops.begin() + *MaskPos, Mask);
8451 }
8452
8453 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
8454                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
8455                                  Variant);
8456 }
8457
8458 return nullptr;
8459}
8460
8461bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8462 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8463 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8464 // Instruction should be widened, unless it is scalar after vectorization,
8465 // scalarization is profitable or it is predicated.
8466 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8467 return CM.isScalarAfterVectorization(I, VF) ||
8468 CM.isProfitableToScalarize(I, VF) ||
8469 CM.isScalarWithPredication(I, VF);
8470 };
8471   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8472                                                              Range);
8473}
8474
8475VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8476                                            ArrayRef<VPValue *> Operands,
8477                                            VPBasicBlock *VPBB) {
8478 switch (I->getOpcode()) {
8479 default:
8480 return nullptr;
8481 case Instruction::SDiv:
8482 case Instruction::UDiv:
8483 case Instruction::SRem:
8484 case Instruction::URem: {
8485 // If not provably safe, use a select to form a safe divisor before widening the
8486 // div/rem operation itself. Otherwise fall through to general handling below.
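    // For example (sketch only, not generated verbatim): a predicated
    //   %r = udiv i32 %x, %d
    // is widened as
    //   %safe.d = select <block mask>, %d, i32 1
    //   %r      = udiv %x, %safe.d
    // so masked-off lanes divide by 1 instead of a possibly-zero divisor.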
8487 if (CM.isPredicatedInst(I)) {
8488 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8489 VPValue *Mask = getBlockInMask(I->getParent());
8490 VPValue *One =
8491 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8492 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8493 Ops[1] = SafeRHS;
8494 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8495 }
8496 [[fallthrough]];
8497 }
8498 case Instruction::Add:
8499 case Instruction::And:
8500 case Instruction::AShr:
8501 case Instruction::FAdd:
8502 case Instruction::FCmp:
8503 case Instruction::FDiv:
8504 case Instruction::FMul:
8505 case Instruction::FNeg:
8506 case Instruction::FRem:
8507 case Instruction::FSub:
8508 case Instruction::ICmp:
8509 case Instruction::LShr:
8510 case Instruction::Mul:
8511 case Instruction::Or:
8512 case Instruction::Select:
8513 case Instruction::Shl:
8514 case Instruction::Sub:
8515 case Instruction::Xor:
8516 case Instruction::Freeze:
8517 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8518 };
8519}
8520
8521 void VPRecipeBuilder::fixHeaderPhis() {
8522   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8523 for (VPHeaderPHIRecipe *R : PhisToFix) {
8524 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8525 VPRecipeBase *IncR =
8526 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8527 R->addOperand(IncR->getVPSingleValue());
8528 }
8529}
8530
8531 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8532                                                        VFRange &Range) {
8533   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8534       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8535 Range);
8536
8537 bool IsPredicated = CM.isPredicatedInst(I);
8538
8539 // Even if the instruction is not marked as uniform, there are certain
8540 // intrinsic calls that can be effectively treated as such, so we check for
8541 // them here. Conservatively, we only do this for scalable vectors, since
8542 // for fixed-width VFs we can always fall back on full scalarization.
8543 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8544 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8545 case Intrinsic::assume:
8546 case Intrinsic::lifetime_start:
8547 case Intrinsic::lifetime_end:
8548 // For scalable vectors if one of the operands is variant then we still
8549 // want to mark as uniform, which will generate one instruction for just
8550 // the first lane of the vector. We can't scalarize the call in the same
8551 // way as for fixed-width vectors because we don't know how many lanes
8552 // there are.
8553 //
8554 // The reasons for doing it this way for scalable vectors are:
8555 // 1. For the assume intrinsic generating the instruction for the first
8556   //    lane is still better than not generating any at all. For
8557 // example, the input may be a splat across all lanes.
8558 // 2. For the lifetime start/end intrinsics the pointer operand only
8559 // does anything useful when the input comes from a stack object,
8560 // which suggests it should always be uniform. For non-stack objects
8561 // the effect is to poison the object, which still allows us to
8562 // remove the call.
8563 IsUniform = true;
8564 break;
8565 default:
8566 break;
8567 }
8568 }
8569 VPValue *BlockInMask = nullptr;
8570 if (!IsPredicated) {
8571 // Finalize the recipe for Instr, first if it is not predicated.
8572 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8573 } else {
8574 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8575 // Instructions marked for predication are replicated and a mask operand is
8576 // added initially. Masked replicate recipes will later be placed under an
8577 // if-then construct to prevent side-effects. Generate recipes to compute
8578 // the block mask for this region.
8579 BlockInMask = getBlockInMask(I->getParent());
8580 }
8581
8582 // Note that there is some custom logic to mark some intrinsics as uniform
8583 // manually above for scalable vectors, which this assert needs to account for
8584 // as well.
8585 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8586 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8587 "Should not predicate a uniform recipe");
8588 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8589 IsUniform, BlockInMask);
8590 return Recipe;
8591}
8592
8593 VPRecipeBase *
8594 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8595                                         ArrayRef<VPValue *> Operands,
8596                                         VFRange &Range, VPBasicBlock *VPBB) {
8597 // First, check for specific widening recipes that deal with inductions, Phi
8598 // nodes, calls and memory operations.
8599 VPRecipeBase *Recipe;
8600 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8601 if (Phi->getParent() != OrigLoop->getHeader())
8602 return tryToBlend(Phi, Operands);
8603
8604 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8605 return Recipe;
8606
8607 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8608 assert((Legal->isReductionVariable(Phi) ||
8609 Legal->isFixedOrderRecurrence(Phi)) &&
8610 "can only widen reductions and fixed-order recurrences here");
8611 VPValue *StartV = Operands[0];
8612 if (Legal->isReductionVariable(Phi)) {
8613 const RecurrenceDescriptor &RdxDesc =
8614 Legal->getReductionVars().find(Phi)->second;
8615 assert(RdxDesc.getRecurrenceStartValue() ==
8616 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8617 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8618 CM.isInLoopReduction(Phi),
8619 CM.useOrderedReductions(RdxDesc));
8620 } else {
8621 // TODO: Currently fixed-order recurrences are modeled as chains of
8622 // first-order recurrences. If there are no users of the intermediate
8623 // recurrences in the chain, the fixed order recurrence should be modeled
8624 // directly, enabling more efficient codegen.
8625 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8626 }
8627
8628 PhisToFix.push_back(PhiRecipe);
8629 return PhiRecipe;
8630 }
8631
8632 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8633 cast<TruncInst>(Instr), Operands, Range)))
8634 return Recipe;
8635
8636 // All widen recipes below deal only with VF > 1.
8637   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8638           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8639 return nullptr;
8640
8641 if (auto *CI = dyn_cast<CallInst>(Instr))
8642 return tryToWidenCall(CI, Operands, Range);
8643
8644 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8645 return tryToWidenMemory(Instr, Operands, Range);
8646
8647 if (!shouldWiden(Instr, Range))
8648 return nullptr;
8649
8650 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8651 return new VPWidenGEPRecipe(GEP,
8652 make_range(Operands.begin(), Operands.end()));
8653
8654 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8655 return new VPWidenSelectRecipe(
8656 *SI, make_range(Operands.begin(), Operands.end()));
8657 }
8658
8659 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8660 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8661 *CI);
8662 }
8663
8664 return tryToWiden(Instr, Operands, VPBB);
8665}
8666
8667void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8668 ElementCount MaxVF) {
8669 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8670
8671 auto MaxVFTimes2 = MaxVF * 2;
8672 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8673 VFRange SubRange = {VF, MaxVFTimes2};
8674 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8675 // Now optimize the initial VPlan.
8676 if (!Plan->hasVF(ElementCount::getFixed(1)))
8677         VPlanTransforms::truncateToMinimalBitwidths(
8678             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8679 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8680 // TODO: try to put it close to addActiveLaneMask().
8681 // Discard the plan if it is not EVL-compatible
8682 if (CM.foldTailWithEVL() &&
8683           !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
8684         break;
8685 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8686 VPlans.push_back(std::move(Plan));
8687 }
8688 VF = SubRange.End;
8689 }
8690}
8691
8692// Add the necessary canonical IV and branch recipes required to control the
8693// loop.
8694static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8695 DebugLoc DL) {
8696 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8697 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8698
8699 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8700 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8701 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8702 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8703 Header->insert(CanonicalIVPHI, Header->begin());
8704
8705 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8706 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8707 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8708 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8709 "index.next");
8710 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8711
8712 // Add the BranchOnCount VPInstruction to the latch.
8713   Builder.createNaryOp(VPInstruction::BranchOnCount,
8714                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8715}
8716
8717// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8718// original exit block.
8719static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8720 VPRecipeBuilder &Builder, VPlan &Plan) {
8721 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8722 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8723 // Only handle single-exit loops with unique exit blocks for now.
8724 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8725 return;
8726
8727 // Introduce VPUsers modeling the exit values.
8728 for (PHINode &ExitPhi : ExitBB->phis()) {
8729 Value *IncomingValue =
8730 ExitPhi.getIncomingValueForBlock(ExitingBB);
8731 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
8732 // Exit values for inductions are computed and updated outside of VPlan and
8733 // independent of induction recipes.
8734 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8735 // live-outs.
8736 if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8737 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8738 isa<VPWidenPointerInductionRecipe>(V))
8739 continue;
8740 Plan.addLiveOut(&ExitPhi, V);
8741 }
8742}
8743
8744/// Feed a resume value for every FOR from the vector loop to the scalar loop,
8745/// if middle block branches to scalar preheader, by introducing ExtractFromEnd
8746/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8747/// latter and corresponds to the scalar header.
8748 static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8749   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8750
8751 // Start by finding out if middle block branches to scalar preheader, which is
8752 // not a VPIRBasicBlock, unlike Exit block - the other possible successor of
8753 // middle block.
8754 // TODO: Should be replaced by
8755 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8756 // scalar region is modeled as well.
8757 VPBasicBlock *ScalarPHVPBB = nullptr;
8758 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
8759 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8760 if (isa<VPIRBasicBlock>(Succ))
8761 continue;
8762 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8763 ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8764 }
8765 if (!ScalarPHVPBB)
8766 return;
8767
8768 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8769 VPBuilder MiddleBuilder(MiddleVPBB);
8770 // Reset insert point so new recipes are inserted before terminator and
8771 // condition, if there is either the former or both.
8772 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8773 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
8774 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8775 "Condition expected in MiddleVPBB");
8776 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8777 }
8778 VPValue *OneVPV = Plan.getOrAddLiveIn(
8779 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8780
8781 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8782 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8783 if (!FOR)
8784 continue;
8785
8786 // Extract the resume value and create a new VPLiveOut for it.
8787 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
8788 {FOR->getBackedgeValue(), OneVPV},
8789 {}, "vector.recur.extract");
8790 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8791 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8792 "scalar.recur.init");
8793 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
8794 }
8795}
8796
8797 VPlanPtr
8798 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8799
8801
8802 // ---------------------------------------------------------------------------
8803 // Build initial VPlan: Scan the body of the loop in a topological order to
8804 // visit each basic block after having visited its predecessor basic blocks.
8805 // ---------------------------------------------------------------------------
8806
8807 // Create initial VPlan skeleton, having a basic block for the pre-header
8808 // which contains SCEV expansions that need to happen before the CFG is
8809 // modified; a basic block for the vector pre-header, followed by a region for
8810 // the vector loop, followed by the middle basic block. The skeleton vector
8811 // loop region contains a header and latch basic blocks.
8812
8813 bool RequiresScalarEpilogueCheck =
8814       LoopVectorizationPlanner::getDecisionAndClampRange(
8815       [this](ElementCount VF) {
8816 return !CM.requiresScalarEpilogue(VF.isVector());
8817 },
8818 Range);
8819   VPlanPtr Plan = VPlan::createInitialVPlan(
8820       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8821 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8822 OrigLoop);
8823
8824 // Don't use getDecisionAndClampRange here, because we don't know the UF
8825 // so this function is better to be conservative, rather than to split
8826 // it up into different VPlans.
8827 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8828 bool IVUpdateMayOverflow = false;
8829 for (ElementCount VF : Range)
8830 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8831
8833 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8834 // When not folding the tail, we know that the induction increment will not
8835 // overflow.
8836 bool HasNUW = Style == TailFoldingStyle::None;
8837 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8838
8839 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8840
8841 // ---------------------------------------------------------------------------
8842 // Pre-construction: record ingredients whose recipes we'll need to further
8843 // process after constructing the initial VPlan.
8844 // ---------------------------------------------------------------------------
8845
8846 // For each interleave group which is relevant for this (possibly trimmed)
8847 // Range, add it to the set of groups to be later applied to the VPlan and add
8848 // placeholders for its members' Recipes which we'll be replacing with a
8849 // single VPInterleaveRecipe.
8851 auto applyIG = [IG, this](ElementCount VF) -> bool {
8852 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8853 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8854                        LoopVectorizationCostModel::CM_Interleave);
8855     // For scalable vectors, the only interleave factor currently supported
8856 // is 2 since we require the (de)interleave2 intrinsics instead of
8857 // shufflevectors.
8858 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8859 "Unsupported interleave factor for scalable vectors");
8860 return Result;
8861 };
8862 if (!getDecisionAndClampRange(applyIG, Range))
8863 continue;
8864 InterleaveGroups.insert(IG);
8865 };
8866
8867 // ---------------------------------------------------------------------------
8868 // Construct recipes for the instructions in the loop
8869 // ---------------------------------------------------------------------------
8870
8871 // Scan the body of the loop in a topological order to visit each basic block
8872 // after having visited its predecessor basic blocks.
8873 LoopBlocksDFS DFS(OrigLoop);
8874 DFS.perform(LI);
8875
8876 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8877 VPBasicBlock *VPBB = HeaderVPBB;
8878 BasicBlock *HeaderBB = OrigLoop->getHeader();
8879 bool NeedsMasks =
8880 CM.foldTailByMasking() ||
8881 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
8882 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8883 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8884 });
8885 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8886 // Relevant instructions from basic block BB will be grouped into VPRecipe
8887 // ingredients and fill a new VPBasicBlock.
8888 if (VPBB != HeaderVPBB)
8889 VPBB->setName(BB->getName());
8890 Builder.setInsertPoint(VPBB);
8891
8892 if (VPBB == HeaderVPBB)
8893 RecipeBuilder.createHeaderMask();
8894 else if (NeedsMasks)
8895 RecipeBuilder.createBlockInMask(BB);
8896
8897 // Introduce each ingredient into VPlan.
8898 // TODO: Model and preserve debug intrinsics in VPlan.
8899 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8900 Instruction *Instr = &I;
8901       SmallVector<VPValue *, 4> Operands;
8902       auto *Phi = dyn_cast<PHINode>(Instr);
8903 if (Phi && Phi->getParent() == HeaderBB) {
8904 Operands.push_back(Plan->getOrAddLiveIn(
8905 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8906 } else {
8907 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
8908 Operands = {OpRange.begin(), OpRange.end()};
8909 }
8910
8911 // Invariant stores inside loop will be deleted and a single store
8912 // with the final reduction value will be added to the exit block
8913 StoreInst *SI;
8914 if ((SI = dyn_cast<StoreInst>(&I)) &&
8915 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8916 continue;
8917
8918 VPRecipeBase *Recipe =
8919 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8920 if (!Recipe)
8921 Recipe = RecipeBuilder.handleReplication(Instr, Range);
8922
8923 RecipeBuilder.setRecipe(Instr, Recipe);
8924 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8925 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8926 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8927 // recipes and need to be moved to the phi section of HeaderVPBB:
8928 // * tail-folding (non-phi recipes computing the header mask are
8929 // introduced earlier than regular header phi recipes, and should appear
8930 // after them)
8931 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8932
8933 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8934 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8935 "unexpected recipe needs moving");
8936 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8937 } else
8938 VPBB->appendRecipe(Recipe);
8939 }
8940
8941     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8942     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8943 }
8944
8945 // After here, VPBB should not be used.
8946 VPBB = nullptr;
8947
8948 if (CM.requiresScalarEpilogue(Range)) {
8949 // No edge from the middle block to the unique exit block has been inserted
8950 // and there is nothing to fix from vector loop; phis should have incoming
8951 // from scalar loop only.
8952 } else
8953 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
8954
8955 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8956 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8957 "entry block must be set to a VPRegionBlock having a non-empty entry "
8958 "VPBasicBlock");
8959 RecipeBuilder.fixHeaderPhis();
8960
8962
8963 // ---------------------------------------------------------------------------
8964 // Transform initial VPlan: Apply previously taken decisions, in order, to
8965 // bring the VPlan to its final state.
8966 // ---------------------------------------------------------------------------
8967
8968 // Adjust the recipes for any inloop reductions.
8969 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8970
8971 // Interleave memory: for each Interleave Group we marked earlier as relevant
8972 // for this VPlan, replace the Recipes widening its memory instructions with a
8973 // single VPInterleaveRecipe at its insertion point.
8974 for (const auto *IG : InterleaveGroups) {
8975 auto *Recipe =
8976 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
8977 SmallVector<VPValue *, 4> StoredValues;
8978 for (unsigned i = 0; i < IG->getFactor(); ++i)
8979 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8980 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
8981 StoredValues.push_back(StoreR->getStoredValue());
8982 }
8983
8984 bool NeedsMaskForGaps =
8985 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8986 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8987 Recipe->getMask(), NeedsMaskForGaps);
8988 VPIG->insertBefore(Recipe);
8989 unsigned J = 0;
8990 for (unsigned i = 0; i < IG->getFactor(); ++i)
8991 if (Instruction *Member = IG->getMember(i)) {
8992 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8993 if (!Member->getType()->isVoidTy()) {
8994 VPValue *OriginalV = MemberR->getVPSingleValue();
8995 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8996 J++;
8997 }
8998 MemberR->eraseFromParent();
8999 }
9000 }
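  // Hedged example: for an interleave group { A[2*i], A[2*i+1] } of two loads,
  // the two widened load recipes are replaced by one VPInterleaveRecipe that
  // performs a single wide load followed by a de-interleave (a shufflevector,
  // or llvm.vector.deinterleave2 for scalable VFs), and each member's users
  // are rewired to the corresponding extracted value.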
9001
9002 for (ElementCount VF : Range)
9003 Plan->addVF(VF);
9004 Plan->setName("Initial VPlan");
9005
9006 // Replace VPValues for known constant strides guaranteed by predicate scalar
9007 // evolution.
9008 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9009 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9010 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9011 // Only handle constant strides for now.
9012 if (!ScevStride)
9013 continue;
9014
9015 auto *CI = Plan->getOrAddLiveIn(
9016 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9017 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9018 StrideVPV->replaceAllUsesWith(CI);
9019
9020 // The versioned value may not be used in the loop directly but through a
9021 // sext/zext. Add new live-ins in those cases.
9022 for (Value *U : StrideV->users()) {
9023 if (!isa<SExtInst, ZExtInst>(U))
9024 continue;
9025 VPValue *StrideVPV = Plan->getLiveIn(U);
9026 if (!StrideVPV)
9027 continue;
9028 unsigned BW = U->getType()->getScalarSizeInBits();
9029 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9030 : ScevStride->getAPInt().zext(BW);
9031 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9032 StrideVPV->replaceAllUsesWith(CI);
9033 }
9034 }
9035
9036   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9037     return Legal->blockNeedsPredication(BB);
9038 });
9039
9040 // Sink users of fixed-order recurrence past the recipe defining the previous
9041 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9042   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9043     return nullptr;
9044
9045 if (useActiveLaneMask(Style)) {
9046 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9047 // TailFoldingStyle is visible there.
9048 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9049 bool WithoutRuntimeCheck =
9050         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9051     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9052 WithoutRuntimeCheck);
9053 }
9054 return Plan;
9055}
9056
9057VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9058 // Outer loop handling: They may require CFG and instruction level
9059 // transformations before even evaluating whether vectorization is profitable.
9060 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9061 // the vectorization pipeline.
9062 assert(!OrigLoop->isInnermost());
9063 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9064
9065 // Create new empty VPlan
9066 auto Plan = VPlan::createInitialVPlan(
9067 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
9068 *PSE.getSE(), true, false, OrigLoop);
9069
9070 // Build hierarchical CFG
9071 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9072 HCFGBuilder.buildHierarchicalCFG();
9073
9074 for (ElementCount VF : Range)
9075 Plan->addVF(VF);
9076
9077   VPlanTransforms::VPInstructionsToVPRecipes(
9078       Plan,
9079 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9080 *PSE.getSE(), *TLI);
9081
9082 // Remove the existing terminator of the exiting block of the top-most region.
9083 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9084 auto *Term =
9085 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9086 Term->eraseFromParent();
9087
9088 // Tail folding is not supported for outer loops, so the induction increment
9089 // is guaranteed to not wrap.
9090 bool HasNUW = true;
9091 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9092 DebugLoc());
9093 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9094 return Plan;
9095}
9096
9097// Adjust the recipes for reductions. For in-loop reductions the chain of
9098// instructions leading from the loop exit instr to the phi need to be converted
9099// to reductions, with one operand being vector and the other being the scalar
9100// reduction chain. For other reductions, a select is introduced between the phi
9101// and live-out recipes when folding the tail.
9102//
9103// A ComputeReductionResult recipe is added to the middle block, also for
9104// in-loop reductions which compute their result in-loop, because generating
9105// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9106//
9107// Adjust AnyOf reductions; replace the reduction phi for the selected value
9108// with a boolean reduction phi node to check if the condition is true in any
9109// iteration. The final value is selected by the final ComputeReductionResult.
9110void LoopVectorizationPlanner::adjustRecipesForReductions(
9111 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9112 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9113 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9114   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
9115   // sunk outside of the loop keep the same order as they had in the
9116 // original loop.
9117 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
9118 for (VPRecipeBase &R : Header->phis()) {
9119 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
9120 ReductionPHIList.emplace_back(ReductionPhi);
9121 }
9122 bool HasIntermediateStore = false;
9123 stable_sort(ReductionPHIList,
9124 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
9125 const VPReductionPHIRecipe *R2) {
9126 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
9127 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
9128 HasIntermediateStore |= IS1 || IS2;
9129
9130 // If neither of the recipes has an intermediate store, keep the
9131 // order the same.
9132 if (!IS1 && !IS2)
9133 return false;
9134
9135 // If only one of the recipes has an intermediate store, then
9136 // move it towards the beginning of the list.
9137 if (IS1 && !IS2)
9138 return true;
9139
9140 if (!IS1 && IS2)
9141 return false;
9142
9143 // If both recipes have an intermediate store, then the recipe
9144 // with the later store should be processed earlier. So it
9145 // should go to the beginning of the list.
9146 return DT->dominates(IS2, IS1);
9147 });
9148
9149 if (HasIntermediateStore && ReductionPHIList.size() > 1)
9150 for (VPRecipeBase *R : ReductionPHIList)
9151 R->moveBefore(*Header, Header->getFirstNonPhi());
9152
9153 for (VPRecipeBase &R : Header->phis()) {
9154 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9155 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9156 continue;
9157
9158 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9159 RecurKind Kind = RdxDesc.getRecurrenceKind();
9160 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9161 "AnyOf reductions are not allowed for in-loop reductions");
9162
9163 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9164 SetVector<VPSingleDefRecipe *> Worklist;
9165 Worklist.insert(PhiR);
9166 for (unsigned I = 0; I != Worklist.size(); ++I) {
9167 VPSingleDefRecipe *Cur = Worklist[I];
9168 for (VPUser *U : Cur->users()) {
9169 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9170 if (!UserRecipe) {
9171 assert(isa<VPLiveOut>(U) &&
9172 "U must either be a VPSingleDef or VPLiveOut");
9173 continue;
9174 }
9175 Worklist.insert(UserRecipe);
9176 }
9177 }
9178
9179 // Visit operation "Links" along the reduction chain top-down starting from
9180 // the phi until LoopExitValue. We keep track of the previous item
9181 // (PreviousLink) to tell which of the two operands of a Link will remain
9182 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9183 // the select instructions. Blend recipes of in-loop reduction phi's will
9184 // get folded to their non-phi operand, as the reduction recipe handles the
9185 // condition directly.
9186 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9187 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9188 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9189
9190 // Index of the first operand which holds a non-mask vector operand.
9191 unsigned IndexOfFirstOperand;
9192 // Recognize a call to the llvm.fmuladd intrinsic.
9193 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9194 VPValue *VecOp;
9195 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9196 if (IsFMulAdd) {
9197 assert(
9198 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9199 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9200 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9201 isa<VPWidenCallRecipe>(CurrentLink)) &&
9202 CurrentLink->getOperand(2) == PreviousLink &&
9203 "expected a call where the previous link is the added operand");
9204
9205 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9206 // need to create an fmul recipe (multiplying the first two operands of
9207 // the fmuladd together) to use as the vector operand for the fadd
9208 // reduction.
9209 VPInstruction *FMulRecipe = new VPInstruction(
9210 Instruction::FMul,
9211 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9212 CurrentLinkI->getFastMathFlags());
9213 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9214 VecOp = FMulRecipe;
9215 } else {
9216 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9217 if (PhiR->isInLoop() && Blend) {
9218 assert(Blend->getNumIncomingValues() == 2 &&
9219 "Blend must have 2 incoming values");
9220 if (Blend->getIncomingValue(0) == PhiR)
9221 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9222 else {
9223 assert(Blend->getIncomingValue(1) == PhiR &&
9224 "PhiR must be an operand of the blend");
9225 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9226 }
9227 continue;
9228 }
9229
9230 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9231 if (isa<VPWidenRecipe>(CurrentLink)) {
9232 assert(isa<CmpInst>(CurrentLinkI) &&
9233 "need to have the compare of the select");
9234 continue;
9235 }
9236 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9237 "must be a select recipe");
9238 IndexOfFirstOperand = 1;
9239 } else {
9240 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9241 "Expected to replace a VPWidenSC");
9242 IndexOfFirstOperand = 0;
9243 }
9244 // Note that for non-commutable operands (cmp-selects), the semantics of
9245 // the cmp-select are captured in the recurrence kind.
9246 unsigned VecOpId =
9247 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9248 ? IndexOfFirstOperand + 1
9249 : IndexOfFirstOperand;
9250 VecOp = CurrentLink->getOperand(VecOpId);
9251 assert(VecOp != PreviousLink &&
9252 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9253 (VecOpId - IndexOfFirstOperand)) ==
9254 PreviousLink &&
9255 "PreviousLink must be the operand other than VecOp");
9256 }
9257
9258 BasicBlock *BB = CurrentLinkI->getParent();
9259 VPValue *CondOp = nullptr;
9260 if (CM.blockNeedsPredicationForAnyReason(BB))
9261 CondOp = RecipeBuilder.getBlockInMask(BB);
9262
9263 VPReductionRecipe *RedRecipe =
9264 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9265 CondOp, CM.useOrderedReductions(RdxDesc));
9266 // Append the recipe to the end of the VPBasicBlock because we need to
9267 // ensure that it comes after all of its inputs, including CondOp.
9268 // Note that this transformation may leave over dead recipes (including
9269 // CurrentLink), which will be cleaned by a later VPlan transform.
9270 LinkVPBB->appendRecipe(RedRecipe);
9271 CurrentLink->replaceAllUsesWith(RedRecipe);
9272 PreviousLink = RedRecipe;
9273 }
9274 }
9275 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9276 Builder.setInsertPoint(&*LatchVPBB->begin());
9277 VPBasicBlock *MiddleVPBB =
9278 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
9279 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9280 for (VPRecipeBase &R :
9281 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9282 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9283 if (!PhiR)
9284 continue;
9285
9286 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9287 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9288 // with a boolean reduction phi node to check if the condition is true in
9289 // any iteration. The final value is selected by the final
9290 // ComputeReductionResult.
9291 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9292 RdxDesc.getRecurrenceKind())) {
9293 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9294 return isa<VPWidenSelectRecipe>(U) ||
9295 (isa<VPReplicateRecipe>(U) &&
9296 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9297 Instruction::Select);
9298 }));
9299 VPValue *Cmp = Select->getOperand(0);
9300 // If the compare is checking the reduction PHI node, adjust it to check
9301 // the start value.
9302 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9303 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9304 if (CmpR->getOperand(I) == PhiR)
9305 CmpR->setOperand(I, PhiR->getStartValue());
9306 }
9307 VPBuilder::InsertPointGuard Guard(Builder);
9308 Builder.setInsertPoint(Select);
9309
9310 // If the true value of the select is the reduction phi, the new value is
9311 // selected if the negated condition is true in any iteration.
9312 if (Select->getOperand(1) == PhiR)
9313 Cmp = Builder.createNot(Cmp);
9314 VPValue *Or = Builder.createOr(PhiR, Cmp);
9315 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9316
9317 // Convert the reduction phi to operate on bools.
9318 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9319 OrigLoop->getHeader()->getContext())));
9320 }
9321
9322 // If tail is folded by masking, introduce selects between the phi
9323 // and the live-out instruction of each reduction, at the beginning of the
9324 // dedicated latch block.
9325 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9326 auto *NewExitingVPV = PhiR->getBackedgeValue();
9327 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9328 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9329 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9330 "reduction recipe must be defined before latch");
9331 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9332 std::optional<FastMathFlags> FMFs =
9333 PhiTy->isFloatingPointTy()
9334 ? std::make_optional(RdxDesc.getFastMathFlags())
9335 : std::nullopt;
9336 NewExitingVPV =
9337 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9338 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9339 return isa<VPInstruction>(&U) &&
9340 cast<VPInstruction>(&U)->getOpcode() ==
9341 VPInstruction::ComputeReductionResult;
9342 });
9343 if (PreferPredicatedReductionSelect ||
9344 TTI.preferPredicatedReductionSelect(
9345 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9346 TargetTransformInfo::ReductionFlags()))
9347 PhiR->setOperand(1, NewExitingVPV);
9348 }
9349
9350 // If the vector reduction can be performed in a smaller type, we truncate
9351 // then extend the loop exit value to enable InstCombine to evaluate the
9352 // entire expression in the smaller type.
9353 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9354 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9356 RdxDesc.getRecurrenceKind())) {
9357 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9358 Type *RdxTy = RdxDesc.getRecurrenceType();
9359 auto *Trunc =
9360 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9361 auto *Extnd =
9362 RdxDesc.isSigned()
9363 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9364 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9365
9366 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9367 Extnd->insertAfter(Trunc);
9368 if (PhiR->getOperand(1) == NewExitingVPV)
9369 PhiR->setOperand(1, Extnd->getVPSingleValue());
9370 NewExitingVPV = Extnd;
9371 }
9372
9373 // We want code in the middle block to appear to execute on the location of
9374 // the scalar loop's latch terminator because: (a) it is all compiler
9375 // generated, (b) these instructions are always executed after evaluating
9376 // the latch conditional branch, and (c) other passes may add new
9377 // predecessors which terminate on this line. This is the easiest way to
9378 // ensure we don't accidentally cause an extra step back into the loop while
9379 // debugging.
9380 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9381
9382 // TODO: At the moment ComputeReductionResult also drives creation of the
9383 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9384 // even for in-loop reductions, until the reduction resume value handling is
9385 // also modeled in VPlan.
9386 auto *FinalReductionResult = new VPInstruction(
9387 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9388 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9389 OrigExitingVPV->replaceUsesWithIf(
9390 FinalReductionResult,
9391 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9392 }
9393
9395}
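// Editor's sketch (illustrative, not part of the pass): a scalar view of the
// AnyOf rewrite performed above for the common case where the selected value
// is on the true side of the select. The reduction phi over the selected
// value becomes an or-reduction over the condition, and the final value is
// picked once after the loop (by ComputeReductionResult in the real code).
// Function and variable names are hypothetical.
static int anyOfReductionOriginal(const int *A, int N, int Start, int Sel) {
  int R = Start;
  for (int I = 0; I < N; ++I)
    R = A[I] > 0 ? Sel : R; // select between invariant value and reduction phi
  return R;
}
static int anyOfReductionRewritten(const int *A, int N, int Start, int Sel) {
  bool Any = false; // boolean reduction phi, started at 'false'
  for (int I = 0; I < N; ++I)
    Any |= A[I] > 0; // or-reduction of the condition
  return Any ? Sel : Start; // final select on the reduced boolean
}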
9396
9397#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9399 VPSlotTracker &SlotTracker) const {
9400 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9401 IG->getInsertPos()->printAsOperand(O, false);
9402 O << ", ";
9404 VPValue *Mask = getMask();
9405 if (Mask) {
9406 O << ", ";
9407 Mask->printAsOperand(O, SlotTracker);
9408 }
9409
9410 unsigned OpIdx = 0;
9411 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9412 if (!IG->getMember(i))
9413 continue;
9414 if (getNumStoreOperands() > 0) {
9415 O << "\n" << Indent << " store ";
9416 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9417 O << " to index " << i;
9418 } else {
9419 O << "\n" << Indent << " ";
9421 O << " = load from index " << i;
9422 }
9423 ++OpIdx;
9424 }
9425}
9426#endif
9427
9428 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9429 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9430 "Not a pointer induction according to InductionDescriptor!");
9431 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9432 "Unexpected type.");
9434 "Recipe should have been replaced");
9435
9436 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9437 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
9438 Type *PhiType = IndDesc.getStep()->getType();
9439
9440 // Build a pointer phi
9441 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9442 Type *ScStValueType = ScalarStartValue->getType();
9443 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi",
9444 CanonicalIV->getIterator());
9445
9446 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9447 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9448
9449 // A pointer induction, performed by using a gep
9450 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9451
9452 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9453 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9454 Value *NumUnrolledElems =
9455 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9456 Value *InductionGEP = GetElementPtrInst::Create(
9457 State.Builder.getInt8Ty(), NewPointerPhi,
9458 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9459 InductionLoc);
9460 // Add induction update using an incorrect block temporarily. The phi node
9461 // will be fixed after VPlan execution. Note that at this point the latch
9462 // block cannot be used, as it does not exist yet.
9463 // TODO: Model increment value in VPlan, by turning the recipe into a
9464 // multi-def and a subclass of VPHeaderPHIRecipe.
9465 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9466
9467 // Create UF many actual address geps that use the pointer
9468 // phi as base and a vectorized version of the step value
9469 // (<step*0, ..., step*N>) as offset.
9470 for (unsigned Part = 0; Part < State.UF; ++Part) {
9471 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9472 Value *StartOffsetScalar =
9473 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9474 Value *StartOffset =
9475 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9476 // Create a vector of consecutive numbers from zero to VF.
9477 StartOffset = State.Builder.CreateAdd(
9478 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9479
9480 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9481 "scalar step must be the same across all parts");
9482 Value *GEP = State.Builder.CreateGEP(
9483 State.Builder.getInt8Ty(), NewPointerPhi,
9484 State.Builder.CreateMul(
9485 StartOffset,
9486 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9487 "vector.gep"));
9488 State.set(this, GEP, Part);
9489 }
9490}
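// Editor's sketch (illustrative, not part of the pass): a scalar model of the
// addresses produced above, assuming a fixed VF. For unroll part 'Part' and
// lane 'Lane' the generated GEPs address
//   Base + (RuntimeVF * Part + Lane) * StepInBytes
// past the pointer phi, and the phi itself advances by
// RuntimeVF * UF * StepInBytes bytes per vector iteration. The helper below
// is hypothetical.
static char *pointerInductionLaneAddress(char *Base, uint64_t RuntimeVF,
                                         uint64_t Part, uint64_t Lane,
                                         int64_t StepInBytes) {
  // splat(RuntimeVF * Part) + stepvector, then scaled by the step and applied
  // to the i8-typed base ('vector.gep').
  uint64_t Offset = RuntimeVF * Part + Lane;
  return Base + (int64_t)Offset * StepInBytes;
}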
9491
9493 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9494
9495 // Fast-math-flags propagate from the original induction instruction.
9497 if (FPBinOp)
9498 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9499
9500 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9501 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
9502 Value *DerivedIV = emitTransformedIndex(
9503 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9504 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9505 DerivedIV->setName("offset.idx");
9506 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9507
9508 State.set(this, DerivedIV, VPIteration(0, 0));
9509}
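// Editor's sketch (illustrative, not part of the pass): for a plain integer
// induction the transformed index computed above is simply
// Start + CanonicalIV * Step; pointer and floating-point inductions follow
// the same shape using a GEP or FP operations with the propagated fast-math
// flags. The helper name is hypothetical.
static int64_t derivedIVModel(int64_t Start, int64_t CanonicalIV,
                              int64_t Step) {
  return Start + CanonicalIV * Step; // the value named 'offset.idx' above
}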
9510
9512 assert(!State.Instance && "Interleave group being replicated.");
9513 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9515 NeedsMaskForGaps);
9516}
9517
9520 if (State.Instance) { // Generate a single instance.
9521 assert((State.VF.isScalar() || !isUniform()) &&
9522 "uniform recipe shouldn't be predicated");
9523 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9524 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9525 // Insert scalar instance packing it into a vector.
9526 if (State.VF.isVector() && shouldPack()) {
9527 // If we're constructing lane 0, initialize to start from poison.
9528 if (State.Instance->Lane.isFirstLane()) {
9529 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9530 Value *Poison = PoisonValue::get(
9531 VectorType::get(UI->getType(), State.VF));
9532 State.set(this, Poison, State.Instance->Part);
9533 }
9534 State.packScalarIntoVectorValue(this, *State.Instance);
9535 }
9536 return;
9537 }
9538
9539 if (IsUniform) {
9540 // If the recipe is uniform across all parts (instead of just per VF), only
9541 // generate a single instance.
9542 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9543 all_of(operands(), [](VPValue *Op) {
9544 return Op->isDefinedOutsideVectorRegions();
9545 })) {
9546 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9547 if (user_begin() != user_end()) {
9548 for (unsigned Part = 1; Part < State.UF; ++Part)
9549 State.set(this, State.get(this, VPIteration(0, 0)),
9550 VPIteration(Part, 0));
9551 }
9552 return;
9553 }
9554
9555 // Uniform within VL means we need to generate lane 0 only for each
9556 // unrolled copy.
9557 for (unsigned Part = 0; Part < State.UF; ++Part)
9558 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9559 return;
9560 }
9561
9562 // A store of a loop varying value to a uniform address only needs the last
9563 // copy of the store.
9564 if (isa<StoreInst>(UI) &&
9565 vputils::isUniformAfterVectorization(getOperand(1))) {
9566 auto Lane = VPLane::getLastLaneForVF(State.VF);
9567 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9568 State);
9569 return;
9570 }
9571
9572 // Generate scalar instances for all VF lanes of all UF parts.
9573 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9574 const unsigned EndLane = State.VF.getKnownMinValue();
9575 for (unsigned Part = 0; Part < State.UF; ++Part)
9576 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9577 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9578}
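// Editor's sketch (illustrative, not part of the pass): which (Part, Lane)
// copies the code above emits for a replicate recipe, assuming a fixed VF.
// The flag names mirror the checks above; the helper itself is hypothetical.
static void replicateRecipeCopies(unsigned UF, unsigned VF,
                                  bool UniformAcrossParts, bool UniformPerPart,
                                  bool StoreToUniformAddress,
                                  void (*Emit)(unsigned Part, unsigned Lane)) {
  if (UniformAcrossParts) // one scalar instance, reused by every part
    return Emit(0, 0);
  if (UniformPerPart) { // lane 0 of each unrolled part
    for (unsigned Part = 0; Part < UF; ++Part)
      Emit(Part, 0);
    return;
  }
  if (StoreToUniformAddress) // only the last written value is observable
    return Emit(UF - 1, VF - 1);
  for (unsigned Part = 0; Part < UF; ++Part) // full scalarization
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Emit(Part, Lane);
}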
9579
9581 auto *LI = cast<LoadInst>(&Ingredient);
9582
9583 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9584 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9585 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9586 bool CreateGather = !isConsecutive();
9587
9588 auto &Builder = State.Builder;
9590 for (unsigned Part = 0; Part < State.UF; ++Part) {
9591 Value *NewLI;
9592 Value *Mask = nullptr;
9593 if (auto *VPMask = getMask()) {
9594 // Mask reversal is only needed for real (non-null) masks; a null mask
9595 // stands for an all-ones mask, and reversing an all-ones mask changes nothing.
9596 Mask = State.get(VPMask, Part);
9597 if (isReverse())
9598 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9599 }
9600
9601 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
9602 if (CreateGather) {
9603 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
9604 "wide.masked.gather");
9605 } else if (Mask) {
9606 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
9607 PoisonValue::get(DataTy),
9608 "wide.masked.load");
9609 } else {
9610 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
9611 }
9612 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9613 State.addMetadata(NewLI, LI);
9614 if (Reverse)
9615 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9616 State.set(this, NewLI, Part);
9617 }
9618}
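// Editor's sketch (illustrative, not part of the pass): per-lane semantics of
// the reversed, masked, consecutive load generated above, assuming a fixed VF
// and with 'Block' pointing at the VF elements being read. The mask is
// reversed before the load and the loaded vector is reversed afterwards, so
// result lane L corresponds to element VF - 1 - L of the block; masked-off
// lanes are poison (modelled here as 0).
static void reversedMaskedLoadModel(const int *Block, const bool *Mask,
                                    unsigned VF, int *Result) {
  for (unsigned L = 0; L < VF; ++L)
    Result[L] = Mask[L] ? Block[VF - 1 - L] : 0;
}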
9619
9620/// Use all-true mask for reverse rather than actual mask, as it avoids a
9621/// dependence w/o affecting the result.
9623 Value *EVL, const Twine &Name) {
9624 VectorType *ValTy = cast<VectorType>(Operand->getType());
9625 Value *AllTrueMask =
9626 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
9627 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
9628 {Operand, AllTrueMask, EVL}, nullptr, Name);
9629}
9630
9632 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9633 "explicit vector length.");
9634 auto *LI = cast<LoadInst>(&Ingredient);
9635
9636 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9637 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9638 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9639 bool CreateGather = !isConsecutive();
9640
9641 auto &Builder = State.Builder;
9643 CallInst *NewLI;
9644 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9645 Value *Addr = State.get(getAddr(), 0, !CreateGather);
9646 Value *Mask = nullptr;
9647 if (VPValue *VPMask = getMask()) {
9648 Mask = State.get(VPMask, 0);
9649 if (isReverse())
9650 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9651 } else {
9652 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9653 }
9654
9655 if (CreateGather) {
9656 NewLI =
9657 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9658 nullptr, "wide.masked.gather");
9659 } else {
9660 VectorBuilder VBuilder(Builder);
9661 VBuilder.setEVL(EVL).setMask(Mask);
9662 NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
9663 Instruction::Load, DataTy, Addr, "vp.op.load"));
9664 }
9665 NewLI->addParamAttr(
9666 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
9667 State.addMetadata(NewLI, LI);
9668 Instruction *Res = NewLI;
9669 if (isReverse())
9670 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
9671 State.set(this, Res, 0);
9672}
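// Editor's sketch (illustrative, not part of the pass): per-lane semantics of
// the EVL-based load generated above for the consecutive, non-reverse case,
// assuming a fixed VF. Only lanes below the explicit vector length whose mask
// bit is set are read; the remaining lanes are unspecified (modelled as 0).
static void evlMaskedLoadModel(const int *Addr, const bool *Mask, unsigned VF,
                               unsigned EVL, int *Result) {
  for (unsigned L = 0; L < VF; ++L)
    Result[L] = (L < EVL && Mask[L]) ? Addr[L] : 0;
}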
9673
9675 auto *SI = cast<StoreInst>(&Ingredient);
9676
9677 VPValue *StoredVPValue = getStoredValue();
9678 bool CreateScatter = !isConsecutive();
9679 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9680
9681 auto &Builder = State.Builder;
9683
9684 for (unsigned Part = 0; Part < State.UF; ++Part) {
9685 Instruction *NewSI = nullptr;
9686 Value *Mask = nullptr;
9687 if (auto *VPMask = getMask()) {
9688 // Mask reversal is only needed for real (non-null) masks; a null mask
9689 // stands for an all-ones mask, and reversing an all-ones mask changes nothing.
9690 Mask = State.get(VPMask, Part);
9691 if (isReverse())
9692 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9693 }
9694
9695 Value *StoredVal = State.get(StoredVPValue, Part);
9696 if (isReverse()) {
9697 // If we store to reverse consecutive memory locations, then we need
9698 // to reverse the order of elements in the stored value.
9699 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9700 // We don't want to update the value in the map as it might be used in
9701 // another expression. So don't call resetVectorValue(StoredVal).
9702 }
9703 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
9704 if (CreateScatter)
9705 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
9706 else if (Mask)
9707 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
9708 else
9709 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
9710 State.addMetadata(NewSI, SI);
9711 }
9712}
9713
9715 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9716 "explicit vector length.");
9717 auto *SI = cast<StoreInst>(&Ingredient);
9718
9719 VPValue *StoredValue = getStoredValue();
9720 bool CreateScatter = !isConsecutive();
9721 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9722
9723 auto &Builder = State.Builder;
9725
9726 CallInst *NewSI = nullptr;
9727 Value *StoredVal = State.get(StoredValue, 0);
9728 Value *EVL = State.get(getEVL(), VPIteration(0, 0));
9729 if (isReverse())
9730 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
9731 Value *Mask = nullptr;
9732 if (VPValue *VPMask = getMask()) {
9733 Mask = State.get(VPMask, 0);
9734 if (isReverse())
9735 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
9736 } else {
9737 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
9738 }
9739 Value *Addr = State.get(getAddr(), 0, !CreateScatter);
9740 if (CreateScatter) {
9741 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
9742 Intrinsic::vp_scatter,
9743 {StoredVal, Addr, Mask, EVL});
9744 } else {
9745 VectorBuilder VBuilder(Builder);
9746 VBuilder.setEVL(EVL).setMask(Mask);
9747 NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
9748 Instruction::Store, Type::getVoidTy(EVL->getContext()),
9749 {StoredVal, Addr}));
9750 }
9751 NewSI->addParamAttr(
9752 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
9753 State.addMetadata(NewSI, SI);
9754}
9755
9756// Determine how to lower the scalar epilogue, which depends on 1) optimising
9757// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9758// predication, and 4) a TTI hook that analyses whether the loop is suitable
9759// for predication.
9764 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9765 // don't look at hints or options, and don't request a scalar epilogue.
9766 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9767 // LoopAccessInfo (due to code dependency and not being able to reliably get
9768 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9769 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9770 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9771 // back to the old way and vectorize with versioning when forced. See D81345.)
9772 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9776
9777 // 2) If set, obey the directives
9778 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9786 };
9787 }
9788
9789 // 3) If set, obey the hints
9790 switch (Hints.getPredicate()) {
9795 };
9796
9797 // 4) if the TTI hook indicates this is profitable, request predication.
9798 TailFoldingInfo TFI(TLI, &LVL, IAI);
9801
9803}
9804
9805// Process the loop in the VPlan-native vectorization path. This path builds
9806 // VPlan upfront in the vectorization pipeline, which allows applying
9807// VPlan-to-VPlan transformations from the very beginning without modifying the
9808// input LLVM IR.
9815 LoopVectorizationRequirements &Requirements) {
9816
9817 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9818 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9819 return false;
9820 }
9821 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9822 Function *F = L->getHeader()->getParent();
9823 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9824
9826 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9827
9828 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9829 &Hints, IAI);
9830 // Use the planner for outer loop vectorization.
9831 // TODO: CM is not used at this point inside the planner. Turn CM into an
9832 // optional argument if we don't need it in the future.
9833 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9834 ORE);
9835
9836 // Get user vectorization factor.
9837 ElementCount UserVF = Hints.getWidth();
9838
9840
9841 // Plan how to best vectorize, return the best VF and its cost.
9842 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9843
9844 // If we are stress testing VPlan builds, do not attempt to generate vector
9845 // code. Masked vector code generation support will follow soon.
9846 // Also, do not attempt to vectorize if no vector code will be produced.
9848 return false;
9849
9850 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9851
9852 {
9853 bool AddBranchWeights =
9854 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9855 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9856 F->getDataLayout(), AddBranchWeights);
9857 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9858 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9859 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9860 << L->getHeader()->getParent()->getName() << "\"\n");
9861 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9862 }
9863
9864 reportVectorization(ORE, L, VF, 1);
9865
9866 // Mark the loop as already vectorized to avoid vectorizing again.
9867 Hints.setAlreadyVectorized();
9868 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9869 return true;
9870}
9871
9872// Emit a remark if there are stores to floats that required a floating point
9873 // extension. If the vectorized loop was generated with floating point, there
9874// will be a performance penalty from the conversion overhead and the change in
9875// the vector width.
9878 for (BasicBlock *BB : L->getBlocks()) {
9879 for (Instruction &Inst : *BB) {
9880 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9881 if (S->getValueOperand()->getType()->isFloatTy())
9882 Worklist.push_back(S);
9883 }
9884 }
9885 }
9886
9887 // Traverse the floating point stores upwards, searching for floating point
9888 // conversions.
9891 while (!Worklist.empty()) {
9892 auto *I = Worklist.pop_back_val();
9893 if (!L->contains(I))
9894 continue;
9895 if (!Visited.insert(I).second)
9896 continue;
9897
9898 // Emit a remark if the floating point store required a floating
9899 // point conversion.
9900 // TODO: More work could be done to identify the root cause such as a
9901 // constant or a function return type and point the user to it.
9902 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9903 ORE->emit([&]() {
9904 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9905 I->getDebugLoc(), L->getHeader())
9906 << "floating point conversion changes vector width. "
9907 << "Mixed floating point precision requires an up/down "
9908 << "cast that will negatively impact performance.";
9909 });
9910
9911 for (Use &Op : I->operands())
9912 if (auto *OpI = dyn_cast<Instruction>(Op))
9913 Worklist.push_back(OpI);
9914 }
9915}
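// Editor's sketch (illustrative, not part of the pass): a typical loop that
// triggers the VectorMixedPrecision remark above. The double literal forces an
// fpext/fptrunc pair around the multiply, so the vectorized loop computes in
// double and the effective vector width is halved.
static void scaleFloatsExample(float *X, int N) {
  for (int I = 0; I < N; ++I)
    X[I] = X[I] * 1.5; // 1.5 is a double; writing 1.5f keeps the math in float
}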
9916
9917static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9919 std::optional<unsigned> VScale, Loop *L,
9920 ScalarEvolution &SE,
9922 InstructionCost CheckCost = Checks.getCost();
9923 if (!CheckCost.isValid())
9924 return false;
9925
9926 // When only interleaving, the scalar and vector costs will be equal, which
9927 // in turn would lead to a divide by 0. Fall back to the hard threshold.
9928 if (VF.Width.isScalar()) {
9929 if (CheckCost > VectorizeMemoryCheckThreshold) {
9930 LLVM_DEBUG(
9931 dbgs()
9932 << "LV: Interleaving only is not profitable due to runtime checks\n");
9933 return false;
9934 }
9935 return true;
9936 }
9937
9938 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9939 uint64_t ScalarC = *VF.ScalarCost.getValue();
9940 if (ScalarC == 0)
9941 return true;
9942
9943 // First, compute the minimum iteration count required so that the vector
9944 // loop outperforms the scalar loop.
9945 // The total cost of the scalar loop is
9946 // ScalarC * TC
9947 // where
9948 // * TC is the actual trip count of the loop.
9949 // * ScalarC is the cost of a single scalar iteration.
9950 //
9951 // The total cost of the vector loop is
9952 // RtC + VecC * (TC / VF) + EpiC
9953 // where
9954 // * RtC is the cost of the generated runtime checks
9955 // * VecC is the cost of a single vector iteration.
9956 // * TC is the actual trip count of the loop
9957 // * VF is the vectorization factor
9958 // * EpiC is the cost of the generated epilogue, including the cost
9959 // of the remaining scalar operations.
9960 //
9961 // Vectorization is profitable once the total vector cost is less than the
9962 // total scalar cost:
9963 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9964 //
9965 // Now we can compute the minimum required trip count TC as
9966 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9967 //
9968 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9969 // the computations below are performed on integers and the divisions are
9970 // rounded up, hence we get an upper estimate of the minimum TC.
9971 unsigned IntVF = VF.Width.getKnownMinValue();
9972 if (VF.Width.isScalable()) {
9973 unsigned AssumedMinimumVscale = 1;
9974 if (VScale)
9975 AssumedMinimumVscale = *VScale;
9976 IntVF *= AssumedMinimumVscale;
9977 }
9978 uint64_t RtC = *CheckCost.getValue();
9979 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9980 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9981
9982 // Second, compute a minimum iteration count so that the cost of the
9983 // runtime checks is only a fraction of the total scalar loop cost. This
9984 // adds a loop-dependent bound on the overhead incurred if the runtime
9985 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9986 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9987 // cost, compute
9988 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9989 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9990
9991 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9992 // epilogue is allowed, choose the next closest multiple of VF. This should
9993 // partly compensate for ignoring the epilogue cost.
9994 uint64_t MinTC = std::max(MinTC1, MinTC2);
9995 if (SEL == CM_ScalarEpilogueAllowed)
9996 MinTC = alignTo(MinTC, IntVF);
9997 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9998
9999 LLVM_DEBUG(
10000 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10001 << VF.MinProfitableTripCount << "\n");
10002
10003 // Skip vectorization if the expected trip count is less than the minimum
10004 // required trip count.
10005 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10006 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10007 VF.MinProfitableTripCount)) {
10008 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10009 "trip count < minimum profitable VF ("
10010 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10011 << ")\n");
10012
10013 return false;
10014 }
10015 }
10016 return true;
10017}
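// Editor's sketch (illustrative, not part of the pass): the two minimum
// trip-count bounds computed above, evaluated for hypothetical costs. With
// ScalarC = 4, VecC = 10, RtC = 24 and VF = 8, the break-even bound is
// ceil(24 * 8 / (4 * 8 - 10)) = ceil(192 / 22) = 9 iterations, and the
// "runtime checks at most ~10% of scalar cost" bound is
// ceil(24 * 10 / 4) = 60 iterations; the larger value (60) is then rounded up
// to a multiple of VF (64) when a scalar epilogue is allowed.
static uint64_t minProfitableTripCountModel(uint64_t ScalarC, uint64_t VecC,
                                            uint64_t RtC, uint64_t VF) {
  uint64_t Div = ScalarC * VF - VecC;
  uint64_t MinTC1 = Div == 0 ? 0 : (RtC * VF + Div - 1) / Div; // divideCeil
  uint64_t MinTC2 = (RtC * 10 + ScalarC - 1) / ScalarC;        // divideCeil
  return std::max(MinTC1, MinTC2);
}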
10018
10020 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10022 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10024
10026 assert((EnableVPlanNativePath || L->isInnermost()) &&
10027 "VPlan-native path is not enabled. Only process inner loops.");
10028
10029 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10030 << L->getHeader()->getParent()->getName() << "' from "
10031 << L->getLocStr() << "\n");
10032
10033 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10034
10035 LLVM_DEBUG(
10036 dbgs() << "LV: Loop hints:"
10037 << " force="
10039 ? "disabled"
10041 ? "enabled"
10042 : "?"))
10043 << " width=" << Hints.getWidth()
10044 << " interleave=" << Hints.getInterleave() << "\n");
10045
10046 // Function containing loop
10047 Function *F = L->getHeader()->getParent();
10048
10049 // Looking at the diagnostic output is the only way to determine if a loop
10050 // was vectorized (other than looking at the IR or machine code), so it
10051 // is important to generate an optimization remark for each loop. Most of
10052 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10053 // generated as OptimizationRemark and OptimizationRemarkMissed are
10054 // less verbose, reporting vectorized loops and unvectorized loops that may
10055 // benefit from vectorization, respectively.
10056
10057 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10058 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10059 return false;
10060 }
10061
10062 PredicatedScalarEvolution PSE(*SE, *L);
10063
10064 // Check if it is legal to vectorize the loop.
10065 LoopVectorizationRequirements Requirements;
10066 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10067 &Requirements, &Hints, DB, AC, BFI, PSI);
10068 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10069 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10070 Hints.emitRemarkWithHints();
10071 return false;
10072 }
10073
10074 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10075 // here. They may require CFG and instruction level transformations before
10076 // even evaluating whether vectorization is profitable. Since we cannot modify
10077 // the incoming IR, we need to build VPlan upfront in the vectorization
10078 // pipeline.
10079 if (!L->isInnermost())
10080 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10081 ORE, BFI, PSI, Hints, Requirements);
10082
10083 assert(L->isInnermost() && "Inner loop expected.");
10084
10085 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10086 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10087
10088 // If an override option has been passed in for interleaved accesses, use it.
10089 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10090 UseInterleaved = EnableInterleavedMemAccesses;
10091
10092 // Analyze interleaved memory accesses.
10093 if (UseInterleaved)
10095 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10096 // Check the function attributes and profiles to find out if this function
10097 // should be optimized for size.
10099 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10100
10101 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10102 // count by optimizing for size, to minimize overheads.
10103 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10104 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10105 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10106 << "This loop is worth vectorizing only if no scalar "
10107 << "iteration overheads are incurred.");
10109 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10110 else {
10111 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10112 LLVM_DEBUG(dbgs() << "\n");
10113 // Predicate tail-folded loops are efficient even when the loop
10114 // iteration count is low. However, setting the epilogue policy to
10115 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10116 // with runtime checks. It's more effective to let
10117 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10118 // for the loop.
10121 } else {
10122 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10123 "small to consider vectorizing.\n");
10124 reportVectorizationFailure(
10125 "The trip count is below the minimal threshold value.",
10126 "loop trip count is too low, avoiding vectorization",
10127 "LowTripCount", ORE, L);
10128 Hints.emitRemarkWithHints();
10129 return false;
10130 }
10131 }
10132 }
10133
10134 // Check the function attributes to see if implicit floats or vectors are
10135 // allowed.
10136 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10137 reportVectorizationFailure(
10138 "Can't vectorize when the NoImplicitFloat attribute is used",
10139 "loop not vectorized due to NoImplicitFloat attribute",
10140 "NoImplicitFloat", ORE, L);
10141 Hints.emitRemarkWithHints();
10142 return false;
10143 }
10144
10145 // Check if the target supports potentially unsafe FP vectorization.
10146 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10147 // for the target we're vectorizing for, to make sure none of the
10148 // additional fp-math flags can help.
10149 if (Hints.isPotentiallyUnsafe() &&
10150 TTI->isFPVectorizationPotentiallyUnsafe()) {
10151 reportVectorizationFailure(
10152 "Potentially unsafe FP op prevents vectorization",
10153 "loop not vectorized due to unsafe FP support.",
10154 "UnsafeFP", ORE, L);
10155 Hints.emitRemarkWithHints();
10156 return false;
10157 }
10158
10159 bool AllowOrderedReductions;
10160 // If the flag is set, use that instead and override the TTI behaviour.
10161 if (ForceOrderedReductions.getNumOccurrences() > 0)
10162 AllowOrderedReductions = ForceOrderedReductions;
10163 else
10164 AllowOrderedReductions = TTI->enableOrderedReductions();
10165 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10166 ORE->emit([&]() {
10167 auto *ExactFPMathInst = Requirements.getExactFPInst();
10168 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10169 ExactFPMathInst->getDebugLoc(),
10170 ExactFPMathInst->getParent())
10171 << "loop not vectorized: cannot prove it is safe to reorder "
10172 "floating-point operations";
10173 });
10174 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10175 "reorder floating-point operations\n");
10176 Hints.emitRemarkWithHints();
10177 return false;
10178 }
10179
10180 // Use the cost model.
10181 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10182 F, &Hints, IAI);
10183 // Use the planner for vectorization.
10184 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10185 ORE);
10186
10187 // Get user vectorization factor and interleave count.
10188 ElementCount UserVF = Hints.getWidth();
10189 unsigned UserIC = Hints.getInterleave();
10190
10191 // Plan how to best vectorize, return the best VF and its cost.
10192 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10193
10195 unsigned IC = 1;
10196
10197 bool AddBranchWeights =
10198 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10199 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10200 F->getDataLayout(), AddBranchWeights);
10201 if (MaybeVF) {
10202 VF = *MaybeVF;
10203 // Select the interleave count.
10204 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10205
10206 unsigned SelectedIC = std::max(IC, UserIC);
10207 // Optimistically generate runtime checks if they are needed. Drop them if
10208 // they turn out to not be profitable.
10209 if (VF.Width.isVector() || SelectedIC > 1)
10210 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10211
10212 // Check if it is profitable to vectorize with runtime checks.
10213 bool ForceVectorization =
10215 if (!ForceVectorization &&
10217 *PSE.getSE(), SEL)) {
10218 ORE->emit([&]() {
10220 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10221 L->getHeader())
10222 << "loop not vectorized: cannot prove it is safe to reorder "
10223 "memory operations";
10224 });
10225 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10226 Hints.emitRemarkWithHints();
10227 return false;
10228 }
10229 }
10230
10231 // Identify the diagnostic messages that should be produced.
10232 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10233 bool VectorizeLoop = true, InterleaveLoop = true;
10234 if (VF.Width.isScalar()) {
10235 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10236 VecDiagMsg = std::make_pair(
10237 "VectorizationNotBeneficial",
10238 "the cost-model indicates that vectorization is not beneficial");
10239 VectorizeLoop = false;
10240 }
10241
10242 if (!MaybeVF && UserIC > 1) {
10243 // Tell the user interleaving was avoided up-front, despite being explicitly
10244 // requested.
10245 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10246 "interleaving should be avoided up front\n");
10247 IntDiagMsg = std::make_pair(
10248 "InterleavingAvoided",
10249 "Ignoring UserIC, because interleaving was avoided up front");
10250 InterleaveLoop = false;
10251 } else if (IC == 1 && UserIC <= 1) {
10252 // Tell the user interleaving is not beneficial.
10253 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10254 IntDiagMsg = std::make_pair(
10255 "InterleavingNotBeneficial",
10256 "the cost-model indicates that interleaving is not beneficial");
10257 InterleaveLoop = false;
10258 if (UserIC == 1) {
10259 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10260 IntDiagMsg.second +=
10261 " and is explicitly disabled or interleave count is set to 1";
10262 }
10263 } else if (IC > 1 && UserIC == 1) {
10264 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10265 LLVM_DEBUG(
10266 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10267 IntDiagMsg = std::make_pair(
10268 "InterleavingBeneficialButDisabled",
10269 "the cost-model indicates that interleaving is beneficial "
10270 "but is explicitly disabled or interleave count is set to 1");
10271 InterleaveLoop = false;
10272 }
10273
10274 // Override IC if user provided an interleave count.
10275 IC = UserIC > 0 ? UserIC : IC;
10276
10277 // Emit diagnostic messages, if any.
10278 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10279 if (!VectorizeLoop && !InterleaveLoop) {
10280 // Do not vectorize or interleave the loop.
10281 ORE->emit([&]() {
10282 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10283 L->getStartLoc(), L->getHeader())
10284 << VecDiagMsg.second;
10285 });
10286 ORE->emit([&]() {
10287 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10288 L->getStartLoc(), L->getHeader())
10289 << IntDiagMsg.second;
10290 });
10291 return false;
10292 } else if (!VectorizeLoop && InterleaveLoop) {
10293 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10294 ORE->emit([&]() {
10295 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10296 L->getStartLoc(), L->getHeader())
10297 << VecDiagMsg.second;
10298 });
10299 } else if (VectorizeLoop && !InterleaveLoop) {
10300 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10301 << ") in " << L->getLocStr() << '\n');
10302 ORE->emit([&]() {
10303 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10304 L->getStartLoc(), L->getHeader())
10305 << IntDiagMsg.second;
10306 });
10307 } else if (VectorizeLoop && InterleaveLoop) {
10308 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10309 << ") in " << L->getLocStr() << '\n');
10310 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10311 }
10312
10313 bool DisableRuntimeUnroll = false;
10314 MDNode *OrigLoopID = L->getLoopID();
10315 {
10316 using namespace ore;
10317 if (!VectorizeLoop) {
10318 assert(IC > 1 && "interleave count should not be 1 or 0");
10319 // If we decided not to vectorize the loop, then
10320 // interleave it.
10321 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10322 &CM, BFI, PSI, Checks);
10323
10324 VPlan &BestPlan =
10326 assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
10327 "VPlan cost model and legacy cost model disagreed");
10328 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10329
10330 ORE->emit([&]() {
10331 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10332 L->getHeader())
10333 << "interleaved loop (interleaved count: "
10334 << NV("InterleaveCount", IC) << ")";
10335 });
10336 } else {
10337 // If we decided to vectorize the loop, then do it.
10338
10339 // Consider vectorizing the epilogue too if it's profitable.
10340 VectorizationFactor EpilogueVF =
10342 if (EpilogueVF.Width.isVector()) {
10343
10344 // The first pass vectorizes the main loop and creates a scalar epilogue
10345 // to be vectorized by executing the plan (potentially with a different
10346 // factor) again shortly afterwards.
10347 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10348 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10349 EPI, &LVL, &CM, BFI, PSI, Checks);
10350
10351 std::unique_ptr<VPlan> BestMainPlan(
10353 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10354 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
10355 ++LoopsVectorized;
10356
10357 // Second pass vectorizes the epilogue and adjusts the control flow
10358 // edges from the first pass.
10359 EPI.MainLoopVF = EPI.EpilogueVF;
10360 EPI.MainLoopUF = EPI.EpilogueUF;
10361 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10362 ORE, EPI, &LVL, &CM, BFI, PSI,
10363 Checks);
10364
10365 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10366 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10367 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10368 Header->setName("vec.epilog.vector.body");
10369
10370 // Re-use the trip count and steps expanded for the main loop, as
10371 // skeleton creation needs them as values that dominate both the scalar
10372 // and vector epilogue loops.
10373 // TODO: This is a workaround needed for epilogue vectorization and it
10374 // should be removed once induction resume value creation is done
10375 // directly in VPlan.
10376 EpilogILV.setTripCount(MainILV.getTripCount());
10377 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10378 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10379 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10380 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10381 ExpandR->replaceAllUsesWith(ExpandedVal);
10382 if (BestEpiPlan.getTripCount() == ExpandR)
10383 BestEpiPlan.resetTripCount(ExpandedVal);
10384 ExpandR->eraseFromParent();
10385 }
10386
10387 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10388 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10389 // before vectorizing the epilogue loop.
10390 for (VPRecipeBase &R : Header->phis()) {
10391 if (isa<VPCanonicalIVPHIRecipe>(&R))
10392 continue;
10393
10394 Value *ResumeV = nullptr;
10395 // TODO: Move setting of resume values to prepareToExecute.
10396 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10397 const RecurrenceDescriptor &RdxDesc =
10398 ReductionPhi->getRecurrenceDescriptor();
10399 RecurKind RK = RdxDesc.getRecurrenceKind();
10400 ResumeV = ReductionResumeValues.find(&RdxDesc)->second;
10402 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10403 // start value; compare the final value from the main vector loop
10404 // to the start value.
10405 IRBuilder<> Builder(
10406 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10407 ResumeV = Builder.CreateICmpNE(ResumeV,
10408 RdxDesc.getRecurrenceStartValue());
10409 }
10410 } else {
10411 // Create induction resume values for both widened pointer and
10412 // integer/fp inductions and update the start value of the induction
10413 // recipes to use the resume value.
10414 PHINode *IndPhi = nullptr;
10415 const InductionDescriptor *ID;
10416 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10417 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10418 ID = &Ind->getInductionDescriptor();
10419 } else {
10420 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10421 IndPhi = WidenInd->getPHINode();
10422 ID = &WidenInd->getInductionDescriptor();
10423 }
10424
10425 ResumeV = MainILV.createInductionResumeValue(
10426 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10428 }
10429 assert(ResumeV && "Must have a resume value");
10430 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
10431 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10432 }
10433
10434 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10435 "DT not preserved correctly");
10436 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10437 DT, true, &ExpandedSCEVs);
10438 ++LoopsEpilogueVectorized;
10439
10440 if (!MainILV.areSafetyChecksAdded())
10441 DisableRuntimeUnroll = true;
10442 } else {
10443 ElementCount Width = VF.Width;
10444 VPlan &BestPlan =
10445 UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
10446 if (!UseLegacyCostModel) {
10447 assert(size(BestPlan.vectorFactors()) == 1 &&
10448 "Plan should have a single VF");
10449 Width = *BestPlan.vectorFactors().begin();
10451 << "VF picked by VPlan cost model: " << Width << "\n");
10452 assert(VF.Width == Width &&
10453 "VPlan cost model and legacy cost model disagreed");
10454 }
10455 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
10456 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10457 PSI, Checks);
10458 LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
10459 ++LoopsVectorized;
10460
10461 // Add metadata to disable runtime unrolling a scalar loop when there
10462 // are no runtime checks about strides and memory. A scalar loop that is
10463 // rarely used is not worth unrolling.
10464 if (!LB.areSafetyChecksAdded())
10465 DisableRuntimeUnroll = true;
10466 }
10467 // Report the vectorization decision.
10468 reportVectorization(ORE, L, VF, IC);
10469 }
10470
10473 }
10474
10475 std::optional<MDNode *> RemainderLoopID =
10478 if (RemainderLoopID) {
10479 L->setLoopID(*RemainderLoopID);
10480 } else {
10481 if (DisableRuntimeUnroll)
10483
10484 // Mark the loop as already vectorized to avoid vectorizing again.
10485 Hints.setAlreadyVectorized();
10486 }
10487
10488 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10489 return true;
10490}
10491
10497 SE = &SE_;
10498 LI = &LI_;
10499 TTI = &TTI_;
10500 DT = &DT_;
10501 BFI = BFI_;
10502 TLI = TLI_;
10503 AC = &AC_;
10504 LAIs = &LAIs_;
10505 DB = &DB_;
10506 ORE = &ORE_;
10507 PSI = PSI_;
10508
10509 // Don't attempt if
10510 // 1. the target claims to have no vector registers, and
10511 // 2. interleaving won't help ILP.
10512 //
10513 // The second condition is necessary because, even if the target has no
10514 // vector registers, loop vectorization may still enable scalar
10515 // interleaving.
10518 return LoopVectorizeResult(false, false);
10519
10520 bool Changed = false, CFGChanged = false;
10521
10522 // The vectorizer requires loops to be in simplified form.
10523 // Since simplification may add new inner loops, it has to run before the
10524 // legality and profitability checks. This means running the loop vectorizer
10525 // will simplify all loops, regardless of whether anything ends up being
10526 // vectorized.
10527 for (const auto &L : *LI)
10528 Changed |= CFGChanged |=
10529 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10530
10531 // Build up a worklist of inner-loops to vectorize. This is necessary as
10532 // the act of vectorizing or partially unrolling a loop creates new loops
10533 // and can invalidate iterators across the loops.
10534 SmallVector<Loop *, 8> Worklist;
10535
10536 for (Loop *L : *LI)
10537 collectSupportedLoops(*L, LI, ORE, Worklist);
10538
10539 LoopsAnalyzed += Worklist.size();
10540
10541 // Now walk the identified inner loops.
10542 while (!Worklist.empty()) {
10543 Loop *L = Worklist.pop_back_val();
10544
10545 // For the inner loops we actually process, form LCSSA to simplify the
10546 // transform.
10547 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10548
10549 Changed |= CFGChanged |= processLoop(L);
10550
10551 if (Changed) {
10552 LAIs->clear();
10553
10554#ifndef NDEBUG
10555 if (VerifySCEV)
10556 SE->verify();
10557#endif
10558 }
10559 }
10560
10561 // Process each loop nest in the function.
10562 return LoopVectorizeResult(Changed, CFGChanged);
10563}
10564
10567 auto &LI = AM.getResult<LoopAnalysis>(F);
10568 // There are no loops in the function. Return before computing other expensive
10569 // analyses.
10570 if (LI.empty())
10571 return PreservedAnalyses::all();
10573 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10574 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10575 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10576 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10577 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10579
10581 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10583 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10584 BlockFrequencyInfo *BFI = nullptr;
10585 if (PSI && PSI->hasProfileSummary())
10587 LoopVectorizeResult Result =
10588 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10589 if (!Result.MadeAnyChange)
10590 return PreservedAnalyses::all();
10592
10593 if (isAssignmentTrackingEnabled(*F.getParent())) {
10594 for (auto &BB : F)
10596 }
10597
10598 PA.preserve<LoopAnalysis>();
10602
10603 if (Result.MadeCFGChange) {
10604 // Making CFG changes likely means a loop got vectorized. Indicate that
10605 // extra simplification passes should be run.
10606 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10607 // be run if runtime checks have been added.
10610 } else {
10612 }
10613 return PA;
10614}
10615
10617 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10618 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10619 OS, MapClassName2PassName);
10620
10621 OS << '<';
10622 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10623 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10624 OS << '>';
10625}
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or it's operands.
static void emitInvalidCostRemarks(SmallVector< InstructionVFPair > InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
const char LLVMLoopVectorizeFollowupAll[]
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
static Type * MaybeVectorizeType(Type *Elt, ElementCount VF)
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional< unsigned > VScale, Loop *L, ScalarEvolution &SE, ScalarEpilogueLowering SEL)
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I)
Create an analysis remark that explains why vectorization failed.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan)
Feed a resume value for every FOR from the vector loop to the scalar loop, if middle block branches t...
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static void AddRuntimeUnrollDisableMetaData(Loop *L)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static cl::opt< bool > PrintVPlansInDotFormat("vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans"))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
#define R2(n)
This file contains the declarations for metadata subclasses.
Module.h This file contains the declarations for the Module class.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(VerifyEach)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: SandboxIR.h:483
separate const offset from gep
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:459
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:372
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
static bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FIRST_ICMP_PREDICATE
Definition: InstrTypes.h:788
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:103
param_iterator param_begin() const
Definition: DerivedTypes.h:128
param_iterator param_end() const
Definition: DerivedTypes.h:129
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:745
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:719
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Definition: Instructions.h:938
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:579
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Definition: IRBuilder.cpp:1151
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2250
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1726
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2210
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2246
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1349
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2499
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:599
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1332
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1831
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2356
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1409
Value * CreateStepVector(Type *DstType, const Twine &Name="")
Creates a vector of type DstType with the linear sequence <0, 1, ...>
Definition: IRBuilder.cpp:110
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1366
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
A struct for saving information about induction variables.
InductionKind getKind() const
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks)
virtual std::pair< BasicBlock *, Value * > createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
PHINode * createInductionResumeValue(PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef< BasicBlock * > BypassBlocks, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create a new phi node for the induction variable OrigPhi to resume iteration count in the scalar epil...
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
BasicBlock * LoopScalarBody
The scalar loop body.
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
DenseMap< PHINode *, Value * > IVEndValues
ElementCount MinProfitableTripCount
Value * createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL)
Returns a bitcasted value to the requested vector type.
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
LoopVectorizationCostModel * Cost
The profitablity analysis.
SmallMapVector< const RecurrenceDescriptor *, PHINode *, 4 > ReductionResumeValues
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
BasicBlock * completeLoopSkeleton()
Complete the loop skeleton by adding debug MDs, creating appropriate conditional branches in the midd...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
LoopVectorizationLegality * Legal
The legality analysis.
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
LoopInfo * LI
Loop Info.
void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs, std::pair< BasicBlock *, Value * > AdditionalBypass={nullptr, nullptr})
Create new phi nodes for the induction variables to resume iteration count in the scalar epilogue,...
ProfileSummaryInfo * PSI
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State)
Fix the non-induction PHIs in Plan.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void vectorizeInterleaveGroup(const InterleaveGroup< Instruction > *Group, ArrayRef< VPValue * > VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef< VPValue * > StoredValues, VPValue *BlockInMask, bool NeedsMaskForGaps)
Try to vectorize interleaved access group Group with the base address given in Addr,...
virtual std::pair< BasicBlock *, Value * > createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
unsigned UF
The vectorization unroll factor to use.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan)
Fix the vectorized code, taking care of header phi's, live-outs, and more.
BasicBlock * LoopExitBlock
The unique ExitBlock of the scalar loop if one exists.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:470
uint32_t getFactor() const
Definition: VectorUtils.h:486
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:540
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
Definition: VectorUtils.h:547
bool isReverse() const
Definition: VectorUtils.h:485
InstTy * getInsertPos() const
Definition: VectorUtils.h:556
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
Definition: VectorUtils.h:487
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:612
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:657
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:668
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:649
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:632
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:662
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool isEpilogueVectorizationProfitable(const ElementCount VF) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
InstructionCost expectedCost(ElementCount VF, SmallVectorImpl< InstructionVFPair > *Invalid=nullptr)
Returns the expected execution cost.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
bool isMaskRequired(const Instruction *I) const
Returns true if the vector representation of instruction I requires a mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
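A minimal sketch of how the legality queries above might be combined by a client follows; LVL, AccessTy, Ptr, BB and MemI are placeholder names, and the 1/-1/0 reading of isConsecutivePtr is an assumption.
// Hedged sketch: querying a LoopVectorizationLegality instance "LVL" (placeholder name).
if (!LVL.canVectorize(EnableVPlanNativePath))
  return false;                                     // loop fails the legality checks
if (!LVL.canVectorizeFPMath(/*EnableStrictReductions=*/true))
  return false;                                     // FP reductions cannot be handled
PHINode *PrimaryIV = LVL.getPrimaryInduction();
int Stride = LVL.isConsecutivePtr(AccessTy, Ptr);   // assumed: 1/-1 for consecutive, 0 otherwise
bool NeedsMask = LVL.blockNeedsPredication(BB) && LVL.isMaskRequired(MemI);
bool CanFoldTail = LVL.canFoldTailByMasking();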
Planner drives the vectorization process after having passed Legality checks.
std::optional< VectorizationFactor > plan(ElementCount UserVF, unsigned UserIC)
Plan how to best vectorize, return the best VF and its cost, or std::nullopt if vectorization and int...
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
std::pair< DenseMap< const SCEV *, Value * >, DenseMap< const RecurrenceDescriptor *, Value * > > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool IsEpilogueVectorization, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VPlan & getBestPlanFor(ElementCount VF) const
Return the best VPlan for VF.
VPlan & getBestPlan() const
Return the most profitable plan and fix its VF to the most profitable one.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
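The planner declarations above suggest the following top-level flow; this is a hedged sketch with placeholder names (LVP, LB, DT, UserVF, UserIC), not the real driver in LoopVectorizePass.
// Hedged sketch of the plan / select / execute sequence.
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
if (!MaybeVF)
  return false;                                   // neither vectorization nor interleaving pays off
ElementCount BestVF = MaybeVF->Width;             // VectorizationFactor::Width is the chosen VF
assert(LVP.hasPlanWithVF(BestVF) && "plan() should have built a VPlan for the chosen VF");
VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
LVP.executePlan(BestVF, /*UF=*/UserIC, BestPlan, LB, DT, /*IsEpilogueVectorization=*/false);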
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints enabling vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
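To illustrate the MDString/MDTuple entries above, here is a hedged sketch of building a loop-metadata hint node of the kind setAlreadyVectorized() attaches; the exact node layout the vectorizer writes is simplified, and Ctx/TheLoop are placeholders.
// Hedged sketch: assembling an "llvm.loop.isvectorized" hint node (layout simplified).
LLVMContext &Ctx = TheLoop->getHeader()->getContext();
Metadata *Ops[] = {
    MDString::get(Ctx, "llvm.loop.isvectorized"),
    ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
MDNode *IsVectorized = MDTuple::get(Ctx, Ops);
// The loop ID itself is a self-referential MDNode whose first operand is the node
// itself and whose remaining operands are hint nodes such as IsVectorized.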
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:688
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:71
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVMContext & getContext() const
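As an example of the trip-count reasoning these ScalarEvolution entry points enable, the sketch below checks whether the trip count is a provable multiple of a fixed VF; PSE, TheLoop and VFKnownMin are placeholders and the overall check is deliberately simplified.
// Hedged sketch: is the trip count a known multiple of VFKnownMin?
ScalarEvolution &SE = *PSE.getSE();
const SCEV *BTC = PSE.getBackedgeTakenCount();         // predicated backedge-taken count
const SCEV *TC  = SE.getTripCountFromExitCount(BTC);   // trip count derived from exit count
TC = SE.applyLoopGuards(TC, TheLoop);                  // tighten using dominating guards
ConstantInt *VFC =
    ConstantInt::get(cast<IntegerType>(TC->getType()), VFKnownMin);
const SCEV *Rem = SE.getURemExpr(TC, SE.getConstant(VFC));
bool NoTailNeeded = Rem->isZero();                     // if true, no scalar remainder loop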
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
This class provides computation of slot numbers for LLVM Assembly writing.
Definition: AsmWriter.cpp:696
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:290
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
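To show how the TTI hooks above might be combined when costing a predicated memory access, here is a hedged sketch; TTI, VecTy, Ptr, Alignment and AS are placeholders and the comparison is deliberately simplified.
// Hedged sketch: comparing lowering strategies for a masked load of VecTy.
InstructionCost WideCost =
    TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AS);
InstructionCost MaskedCost =
    TTI.isLegalMaskedLoad(VecTy, Alignment)
        ? TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment, AS)
        : InstructionCost::getInvalid();
InstructionCost GatherCost =
    TTI.isLegalMaskedGather(VecTy, Alignment)
        ? TTI.getGatherScatterOpCost(Instruction::Load, VecTy, Ptr,
                                     /*VariableMask=*/true, Alignment)
        : InstructionCost::getInvalid();
// The cheapest valid alternative would drive the widening decision.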
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:243
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:234
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
op_iterator op_end()
Definition: User.h:236
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:2969
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3041
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:2993
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:483
iterator end()
Definition: VPlan.h:3003
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3001
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3054
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:212
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3032
bool empty() const
Definition: VPlan.h:3012
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2023
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:437
VPRegionBlock * getParent()
Definition: VPlan.h:509
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:177
void setName(const Twine &newName)
Definition: VPlan.h:502
VPlan * getPlan()
Definition: VPlan.cpp:150
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:155
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:544
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:534
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:3582
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
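A hedged sketch of how the VPBuilder helpers above can be strung together to build mask logic inside a VPBasicBlock; Builder, VPBB, EdgeMaskA, EdgeMaskB, Cond, TrueVal and FalseVal are placeholders.
// Hedged sketch: composing mask values with VPBuilder.
DebugLoc DL;                                   // empty debug location
Builder.setInsertPoint(VPBB);                  // append new VPInstructions to VPBB
VPValue *BlockMask = Builder.createOr(EdgeMaskA, EdgeMaskB, DL, "block.mask");
VPValue *NotCond   = Builder.createNot(Cond, DL, "not.cond");
VPValue *Guarded   = Builder.createLogicalAnd(BlockMask, NotCond, DL, "guarded");
VPValue *Picked    = Builder.createSelect(Guarded, TrueVal, FalseVal, DL, "pick");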
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:2708
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:2737
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition: VPlanValue.h:418
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:396
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition: VPlanValue.h:408
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:2907
VPValue * getStartValue() const
Definition: VPlan.h:2906
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:1709
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:1753
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:1742
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1229
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1247
unsigned getOpcode() const
Definition: VPlan.h:1341
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2080
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2121
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2127
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition: VPlan.h:2134
unsigned getNumStoreOperands() const
Returns the number of stored operands of this interleave group.
Definition: VPlan.h:2154
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:196
static VPLane getFirstLane()
Definition: VPlan.h:180
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:764
VPBasicBlock * getParent()
Definition: VPlan.h:789
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:860
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPValue * getVPValueOrAddLiveIn(Value *V, VPlan &Plan)
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1149
A recipe for handling reduction phis.
Definition: VPlan.h:1964
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2018
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2010
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2171
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3147
const VPBlockBase * getEntry() const
Definition: VPlan.h:3186
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3218
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2286
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2326
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:891
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:955
This class can be used to assign names to VPValues.
Definition: VPlanValue.h:449
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:39
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:202
operand_range operands()
Definition: VPlanValue.h:272
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:257
unsigned getNumOperands() const
Definition: VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:252
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:246
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition: VPlan.cpp:1453
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1421
user_iterator user_begin()
Definition: VPlanValue.h:128
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
user_iterator user_end()
Definition: VPlanValue.h:130
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1425
user_range users()
Definition: VPlanValue.h:132
A recipe to compute the pointers for widened memory accesses of IndexTy for all parts.
Definition: VPlan.h:1653
A recipe for widening Call instructions.
Definition: VPlan.h:1524
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:2833
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1437
A recipe for handling GEP instructions.
Definition: VPlan.h:1611
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:1766
A common base class for widening memory operations.
Definition: VPlan.h:2443
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition: VPlan.h:2451
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition: VPlan.h:2490
Instruction & Ingredient
Definition: VPlan.h:2445
VPValue * getMask() const
Return the mask used by this recipe.
Definition: VPlan.h:2504
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition: VPlan.h:2497
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition: VPlan.h:2494
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:1892
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:1931
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:1928
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void execute(VPTransformState &State) override
Generate vector values for the pointer induction.
VPWidenRecipe is a recipe for producing a vector-typed copy of its ingredient.
Definition: VPlan.h:1405
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3251
void printDOT(raw_ostream &O) const
Print this VPlan in DOT format to O.
Definition: VPlan.cpp:1173
void prepareToExecute(Value *TripCount, Value *VectorTripCount, Value *CanonicalIVStartValue, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:916
VPBasicBlock * getEntry()
Definition: VPlan.h:3353
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3378
void setName(const Twine &newName)
Definition: VPlan.h:3415
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3381
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3357
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3371
void removeLiveOut(PHINode *PN)
Definition: VPlan.h:3472
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3398
void addLiveOut(PHINode *PN, VPValue *V)
Definition: VPlan.cpp:1182
VPBasicBlock * getPreheader()
Definition: VPlan.h:3491
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.h:3453
bool hasVF(ElementCount VF)
Definition: VPlan.h:3391
bool hasUF(unsigned UF) const
Definition: VPlan.h:3404
void setVF(ElementCount VF)
Definition: VPlan.h:3385
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1086
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3364
static VPlanPtr createInitialVPlan(const SCEV *TripCount, ScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which con...
Definition: VPlan.cpp:858
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:3419
LLVM_DUMP_METHOD void dump() const
Dump the plan to stderr (for debugging).
Definition: VPlan.cpp:1179
bool hasScalarVFOnly() const
Definition: VPlan.h:3402
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:976
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:3461
const MapVector< PHINode *, VPLiveOut * > & getLiveOuts() const
Definition: VPlan.h:3477
void print(raw_ostream &O) const
Print this VPlan to O.
Definition: VPlan.cpp:1123
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:3481
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1225
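The VPlan accessors above can be exercised as in this hedged sketch; Plan is assumed to be a VPlan already built for the loop, and the fixed VF of 4 is arbitrary.
// Hedged sketch: inspecting an already-built VPlan "Plan".
ElementCount VF = ElementCount::getFixed(4);
if (Plan.hasVF(VF))
  Plan.setVF(VF);                                   // pin the plan to this VF
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPCanonicalIVPHIRecipe *CanIV = Plan.getCanonicalIV();
Type *IVTy = CanIV->getScalarType();                // induction type of the vector loop
Plan.print(dbgs());                                 // or Plan.dump() when debugging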
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:694
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
VectorBuilder & setEVL(Value *NewExplicitVectorLength)
Definition: VectorBuilder.h:83
VectorBuilder & setMask(Value *NewMask)
Definition: VectorBuilder.h:79
Value * createVectorInstruction(unsigned Opcode, Type *ReturnTy, ArrayRef< Value * > VecOpArray, const Twine &Name=Twine())
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlan.cpp:1610
bool isUniformAfterVectorization(VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlan.h:3806
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1854
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7128
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const SCEV * createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *OrigLoop)
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:55
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:147
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< bool > EnableLoopVectorization
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:572
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Type * ToVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
Definition: VectorUtils.h:135
TargetTransformInfo TTI
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2242
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
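The example values below illustrate the shuffle-mask helpers listed in this section (createStrideMask, createReplicatedMask, createInterleaveMask); the element orders shown in the comments are the expected results under the documented semantics and are worth double-checking against VectorUtils.
// Hedged examples of the masks these helpers are expected to produce.
SmallVector<int, 16> Strided = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
// expected {0, 2, 4, 6}: pick every 2nd element, used to de-interleave wide loads
SmallVector<int, 16> Replicated = createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
// expected {0, 0, 1, 1, 2, 2}: each lane repeated twice
SmallVector<int, 16> Interleaved = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
// expected {0, 4, 1, 5, 2, 6, 3, 7}: zip two 4-element vectors for an interleaved store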
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1661
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
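createStepForVF and getRuntimeVF (declared earlier in this section) abstract over fixed and scalable vector lengths; the hedged sketch below shows the intended use, with Builder, IdxTy and Index as placeholders.
// Hedged sketch: advance an induction variable by one vector iteration.
ElementCount VF = ElementCount::getScalable(4);                 // <vscale x 4> lanes
Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);            // materializes 4 * vscale
Value *Step      = createStepForVF(Builder, IdxTy, VF, /*Step=*/1);
Value *NextIndex = Builder.CreateAdd(Index, Step, "index.next");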
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:1913
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
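A small illustration of the initializer-list overload.
  #include "llvm/ADT/STLExtras.h"
  #include <cassert>

  void allEqualExample() {
    assert(llvm::all_equal({4, 4, 4}));  // identical values
    assert(!llvm::all_equal({4, 8, 4})); // at least one value differs
  }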
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:95
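A hedged sketch of the pattern this helper supports in the cost model: the cost of a block that only executes under a predicate is discounted by the assumed reciprocal execution probability. The wrapper name is hypothetical and the real call sites differ in detail.
  #include "VPlan.h" // in-tree header declaring getReciprocalPredBlockProb()
  #include "llvm/Support/InstructionCost.h"

  llvm::InstructionCost scalePredicatedBlockCost(llvm::InstructionCost BlockCost) {
    // The block is expected to run on only a fraction of iterations, so its
    // cost contribution is divided accordingly.
    return BlockCost / llvm::getReciprocalPredBlockProb();
  }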
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
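A minimal sketch (the field names are illustrative): combining several values into one hash_code, e.g. for a cache key.
  #include "llvm/ADT/Hashing.h"

  llvm::hash_code hashPlanKey(unsigned VF, unsigned UF, bool IsScalable) {
    return llvm::hash_combine(VF, UF, IsScalable);
  }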
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
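A worked example of rounding down to a power of two (the numbers are illustrative).
  #include "llvm/ADT/bit.h"
  #include <cassert>

  void bitFloorExample() {
    assert(llvm::bit_floor(10u) == 8u);  // largest power of two <= 10
    assert(llvm::bit_floor(16u) == 16u); // already a power of two
    assert(llvm::bit_floor(0u) == 0u);   // zero maps to zero
  }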
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
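A minimal sketch: hashing every element of a container as a single sequence (the function name is illustrative).
  #include "llvm/ADT/Hashing.h"
  #include "llvm/ADT/SmallVector.h"

  llvm::hash_code hashMask(const llvm::SmallVectorImpl<int> &Mask) {
    return llvm::hash_combine_range(Mask.begin(), Mask.end());
  }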
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker to determine if extra passes after loop vectorization should be run.
Definition: LoopVectorize.h:86
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:100
ElementCount End
Definition: VPlan.h:105
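A hedged sketch of how a half-open VFRange [Start, End) of power-of-2 factors can be walked; the helper name forEachVF and its callback are hypothetical, and the actual planner additionally clamps Range.End while evaluating predicates over the range.
  #include "VPlan.h" // in-tree header defining VFRange
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Support/TypeSize.h"

  void forEachVF(const llvm::VFRange &Range,
                 llvm::function_ref<void(llvm::ElementCount)> Fn) {
    for (llvm::ElementCount VF = Range.Start;
         llvm::ElementCount::isKnownLT(VF, Range.End); VF = VF * 2)
      Fn(VF);
  }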
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:737
LoopVectorizationCostModel & CM
Definition: VPlan.h:741
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:742
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:1937
VPIteration represents a single point in the iteration space of the output (vectorized and/or unrolle...
Definition: VPlan.h:238
bool isFirstIteration() const
Definition: VPlan.h:250
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:384
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:392
BasicBlock * getPreheaderBBFor(VPRecipeBase *R)
Returns the BasicBlock* mapped to the pre-header of the loop region containing R.
Definition: VPlan.cpp:356
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:255
Value * get(VPValue *Def, unsigned Part, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def and a given Part if IsScalar is false,...
Definition: VPlan.cpp:254
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:429
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:432
void addMetadata(Value *To, Instruction *From)
Add metadata from one instruction to another.
Definition: VPlan.cpp:369
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:425
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:361
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:401
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar=false)
Set the generated vector Value for a given VPValue and a given Part, if IsScalar is false.
Definition: VPlan.h:307
std::optional< VPIteration > Instance
Hold the indices to generate specific scalar instructions.
Definition: VPlan.h:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:409
VPlan * Plan
Pointer to the VPlan that code is generated for.
Definition: VPlan.h:415
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:412
ElementCount VF
The chosen Vectorization and Unroll Factors of the loop being vectorized.
Definition: VPlan.h:261
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:380
void execute(VPTransformState &State) override
Generate the wide load or gather.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2570
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:2519
void execute(VPTransformState &State) override
Generate a wide load or gather.
A recipe for widening select instructions.
Definition: VPlan.h:1577
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2646
void execute(VPTransformState &State) override
Generate the wide store or scatter.
VPValue * getEVL() const
Return the EVL operand.
Definition: VPlan.h:2649
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:2593
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition: VPlan.h:2610
static bool tryAddExplicitVectorLength(VPlan &Plan)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimize(VPlan &Plan, ScalarEvolution &SE)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs, LLVMContext &Ctx)
Insert truncates and extends for any truncated recipe.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Sink users of fixed-order recurrences after the recipe defining their previous value.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of the LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.