LLVM 23.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
175 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
179 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
192 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
198 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
201/// Option tail-folding-policy controls the tail-folding strategy and lists all
202/// available options. The vectorizer will attempt to fold the tail-loop into
203/// the vector loop (main/epilogue loops) and predicate the instructions
204/// accordingly. If tail-folding fails, there are different fallback strategies
205/// depending on these values:
207
209 "tail-folding-policy", cl::init(TailFoldingPolicyTy::None), cl::Hidden,
210 cl::desc("Tail-folding preferences over creating an epilogue loop."),
212 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
213 "Don't tail-fold loops."),
215 "prefer tail-folding, otherwise create an epilogue when "
216 "appropriate."),
218 "always tail-fold, don't attempt vectorization if "
219 "tail-folding fails.")));
220
222 "epilogue-tail-folding-policy", cl::Hidden,
223 cl::desc(
224 "Epilogue-tail-folding preferences over creating an epilogue loop."),
226 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
227 "Don't tail-fold loops."),
229 "prefer tail-folding, otherwise create an epilogue when "
230 "appropriate.")));
231
233 "force-tail-folding-style", cl::desc("Force the tail folding style"),
236 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
239 "Create lane mask for data only, using active.lane.mask intrinsic"),
241 "data-without-lane-mask",
242 "Create lane mask with compare/stepvector"),
244 "Create lane mask using active.lane.mask intrinsic, and use "
245 "it for both data and control flow"),
247 "Use predicated EVL instructions for tail folding. If EVL "
248 "is unsupported, fallback to data-without-lane-mask.")));
249
251 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
252 cl::desc("Enable use of wide lane masks when used for control flow in "
253 "tail-folded loops"));
254
256 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
257 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
258
259/// An interleave-group may need masking if it resides in a block that needs
260/// predication, or in order to mask away gaps.
262 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
263 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
264
266 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's number of scalar registers."));
268
270 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's number of vector registers."));
272
274 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's max interleave factor for "
276 "scalar loops."));
277
279 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
280 cl::desc("A flag that overrides the target's max interleave factor for "
281 "vectorized loops."));
282
284 "force-target-instruction-cost", cl::init(0), cl::Hidden,
285 cl::desc("A flag that overrides the target's expected cost for "
286 "an instruction to a single constant value. Mostly "
287 "useful for getting consistent testing."));
288
290 "small-loop-cost", cl::init(20), cl::Hidden,
291 cl::desc(
292 "The cost of a loop that is considered 'small' by the interleaver."));
293
295 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
296 cl::desc("Enable the use of the block frequency analysis to access PGO "
297 "heuristics minimizing code growth in cold regions and being more "
298 "aggressive in hot regions."));
299
300// Runtime interleave loops for load/store throughput.
302 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
303 cl::desc(
304 "Enable runtime interleaving until load/store ports are saturated"));
305
306/// The number of stores in a loop that are allowed to need predication.
308 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
309 cl::desc("Max number of stores to be predicated behind an if."));
310
312 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
313 cl::desc("Count the induction variable only once when interleaving"));
314
316 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
317 cl::desc("The maximum interleave count to use when interleaving a scalar "
318 "reduction in a nested loop."));
319
321 "force-ordered-reductions", cl::init(false), cl::Hidden,
322 cl::desc("Enable the vectorisation of loops with in-order (strict) "
323 "FP reductions"));
324
326 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
327 cl::desc(
328 "Prefer predicating a reduction operation over an after loop select."));
329
331 "enable-vplan-native-path", cl::Hidden,
332 cl::desc("Enable VPlan-native vectorization path with "
333 "support for outer loop vectorization."));
334
336 llvm::VerifyEachVPlan("vplan-verify-each",
337#ifdef EXPENSIVE_CHECKS
338 cl::init(true),
339#else
340 cl::init(false),
341#endif
343 cl::desc("Verify VPlans after VPlan transforms."));
344
345#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
347 "vplan-print-after-all", cl::init(false), cl::Hidden,
348 cl::desc("Print VPlans after all VPlan transformations."));
349
351 "vplan-print-after", cl::Hidden,
352 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
353
355 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
356 cl::desc("Limit VPlan printing to vector loop region in "
357 "`-vplan-print-after*` if the plan has one."));
358#endif
359
360// This flag enables the stress testing of the VPlan H-CFG construction in the
361// VPlan-native vectorization path. It must be used in conjunction with
362// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363// verification of the H-CFGs built.
365 "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden,
366 cl::desc(
367 "Build VPlan for every supported loop nest in the function and bail "
368 "out right after the build (stress test the VPlan H-CFG construction "
369 "in the VPlan-native vectorization path)."));
370
372 "interleave-loops", cl::init(true), cl::Hidden,
373 cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 "vectorize-loops", cl::init(true), cl::Hidden,
376 cl::desc("Run the Loop vectorization passes"));
377
379 ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden,
380 cl::desc("Override cost based masked intrinsic widening "
381 "for div/rem instructions"));
382
384 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
385 cl::desc(
386 "Enable vectorization of early exit loops with uncountable exits."));
387
388// Likelihood of bypassing the vectorized loop because there are zero trips left
389// after prolog. See `emitIterationCountCheck`.
390static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
391
392/// A helper function that returns true if the given type is irregular. The
393/// type is irregular if its allocated size doesn't equal the store size of an
394/// element of the corresponding vector type.
395static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
396 // Determine if an array of N elements of type Ty is "bitcast compatible"
397 // with a <N x Ty> vector.
398 // This is only true if there is no padding between the array elements.
399 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
400}
401
402/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
403/// ElementCount to include loops whose trip count is a function of vscale.
/// A non-zero small constant trip count reported by ScalarEvolution is
/// returned as a fixed ElementCount. When no trip count can be determined the
/// result is a fixed ElementCount of 0.
405 const Loop *L) {
406 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
407 return ElementCount::getFixed(ExpectedTC);
408
409 const SCEV *BTC = SE->getBackedgeTakenCount(L);
411 return ElementCount::getFixed(0);
412
// Derive the trip count expression from the backedge-taken count and check
// whether it is vscale itself.
413 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
414 if (isa<SCEVVScale>(ExitCount))
416
// Otherwise look for (constant * vscale). The form is only accepted when the
// multiply is known not to wrap unsigned and the constant fits in 32 bits.
417 const APInt *Scale;
418 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
419 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
420 if (Scale->getActiveBits() <= 32)
422
// No recognizable trip count: report 0.
423 return ElementCount::getFixed(0);
424}
425
426/// Get the maximum trip count for \p L from the SCEV unsigned range, excluding
427/// zero from the range. Only valid when not folding the tail, as the minimum
428/// iteration count check guards against a zero trip count. Returns 0 if
429/// unknown.
431 Loop *L) {
432 const SCEV *BTC = PSE.getBackedgeTakenCount();
434 return 0;
435 ScalarEvolution *SE = PSE.getSE();
// Turn the backedge-taken count into a trip count and query its unsigned
// range; the upper end of that range is the candidate maximum.
436 const SCEV *TripCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
437 ConstantRange TCRange = SE->getUnsignedRange(TripCount);
438 APInt MaxTCFromRange = TCRange.getUnsignedMax();
// A maximum of zero, or one that needs more than 32 bits, is reported as
// unknown (0).
439 if (!MaxTCFromRange.isZero() && MaxTCFromRange.getActiveBits() <= 32)
440 return MaxTCFromRange.getZExtValue();
441 return 0;
442}
443
444/// Returns "best known" trip count, which is either a valid positive trip count
445/// or std::nullopt when an estimate cannot be made (including when the trip
446/// count would overflow), for the specified loop \p L as defined by the
447/// following procedure:
448/// 1) Returns exact trip count if it is known.
449/// 2) Returns expected trip count according to profile data if any.
450/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
451/// 4) Returns the maximum trip count from the SCEV range excluding zero,
452/// if \p CanUseConstantMax and \p CanExcludeZeroTrips.
453/// 5) Returns std::nullopt if all of the above failed.
454static std::optional<ElementCount>
456 bool CanUseConstantMax = true,
457 bool CanExcludeZeroTrips = false) {
458 // Check if exact trip count is known.
459 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
460 return ExpectedTC;
461
462 // Check if there is an expected trip count available from profile data.
464 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
465 return ElementCount::getFixed(*EstimatedTC);
466
// Steps 3) and 4) both rely on upper bounds; without permission to use them
// there is nothing left to try.
467 if (!CanUseConstantMax)
468 return std::nullopt;
469
470 // Check if upper bound estimate is known.
471 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
472 return ElementCount::getFixed(ExpectedTC);
473
474 // Get the maximum trip count from the SCEV range excluding zero. This is
475 // only safe when not folding the tail, as the minimum iteration count check
476 // prevents entering the vector loop with a zero trip count.
// NOTE(review): CanUseConstantMax is always true at this point because of the
// early return above, so only CanExcludeZeroTrips decides this branch.
477 if (CanUseConstantMax && CanExcludeZeroTrips)
478 if (unsigned RefinedTC = getMaxTCFromNonZeroRange(PSE, L))
479 return ElementCount::getFixed(RefinedTC);
480
481 return std::nullopt;
482}
483
484namespace {
485// Forward declare GeneratedRTChecks.
486class GeneratedRTChecks;
487
// Map keyed by SCEV expression (const SCEV *) whose mapped type is an IR
// Value *.
488using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489} // namespace
490
491namespace llvm {
492
494
495/// InnerLoopVectorizer vectorizes loops which contain only one basic
496/// block to a specified vectorization factor (VF).
497/// This class performs the widening of scalars into vectors, or multiple
498/// scalars. This class also implements the following features:
499/// * It inserts an epilogue loop for handling loops that don't have iteration
500/// counts that are known to be a multiple of the vectorization factor.
501/// * It handles the code generation for reduction variables.
502/// * Scalarization (implementation using scalars) of un-vectorizable
503/// instructions.
504/// InnerLoopVectorizer does not perform any vectorization-legality
505/// checks, and relies on the caller to check for the different legality
506/// aspects. The InnerLoopVectorizer relies on the
507/// LoopVectorizationLegality class to provide information about the induction
508/// and reduction variables that were found to a given vectorization factor.
510public:
514 ElementCount VecWidth, unsigned UnrollFactor,
516 GeneratedRTChecks &RTChecks, VPlan &Plan)
517 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
518 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
521 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
522
523 virtual ~InnerLoopVectorizer() = default;
524
525 /// Creates a basic block for the scalar preheader. Both
526 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
527 /// the method to create additional blocks and checks needed for epilogue
528 /// vectorization.
530
531 /// Fix the vectorized code, taking care of header phi's, and more.
533
534 /// Fix the non-induction PHIs in \p Plan.
536
537protected:
539
540 /// Create and return a new IR basic block for the scalar preheader whose name
541 /// is prefixed with \p Prefix.
543
544 /// Allow subclasses to override and print debug traces before/after vplan
545 /// execution, when trace information is requested.
546 virtual void printDebugTracesAtStart() {}
547 virtual void printDebugTracesAtEnd() {}
548
549 /// The original loop.
551
552 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
553 /// dynamic knowledge to simplify SCEV expressions and converts them to a
554 /// more usable form.
556
557 /// Loop Info.
559
560 /// Dominator Tree.
562
563 /// Target Transform Info.
565
566 /// Assumption Cache.
568
569 /// The vectorization SIMD factor to use. Each vector will have this many
570 /// vector elements.
572
573 /// The vectorization unroll factor to use. Each scalar is vectorized to this
574 /// many different vector instructions.
575 unsigned UF;
576
577 /// The builder that we use
579
580 // --- Vectorization state ---
581
582 /// The profitability analysis.
584
585 /// Structure to hold information about generated runtime checks, responsible
586 /// for cleaning the checks, if vectorization turns out unprofitable.
587 GeneratedRTChecks &RTChecks;
588
590
591 /// The vector preheader block of \p Plan, used as target for check blocks
592 /// introduced during skeleton creation.
594};
595
596/// Encapsulate information regarding vectorization of a loop and its epilogue.
597/// This information is meant to be updated and used across two stages of
598/// epilogue vectorization.
601 unsigned MainLoopUF = 0;
603 unsigned EpilogueUF = 0;
608
610 ElementCount EVF, unsigned EUF,
612 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
614 assert(EUF == 1 &&
615 "A high UF for the epilogue loop is likely not beneficial.");
616 }
617};
618
619/// An extension of the inner loop vectorizer that creates a skeleton for a
620/// vectorized loop that has its epilogue (residual) also vectorized.
621/// The idea is to run the vplan on a given loop twice, firstly to setup the
622/// skeleton and vectorize the main loop, and secondly to complete the skeleton
623/// from the first step and vectorize the epilogue. This is achieved by
624/// deriving two concrete strategy classes from this base class and invoking
625/// them in succession from the loop vectorizer planner.
627public:
637
638 /// Holds and updates state information required to vectorize the main loop
639 /// and its epilogue in two separate passes. This setup helps us avoid
640 /// regenerating and recomputing runtime safety checks. It also helps us to
641 /// shorten the iteration-count-check path length for the cases where the
642 /// iteration count of the loop is so small that the main vector loop is
643 /// completely skipped.
645
646protected:
648};
649
650/// A specialized derived class of inner loop vectorizer that performs
651/// vectorization of *main* loops in the process of vectorizing loops and their
652/// epilogues.
654public:
665
666protected:
667 void printDebugTracesAtStart() override;
668 void printDebugTracesAtEnd() override;
669};
670
671// A specialized derived class of inner loop vectorizer that performs
672// vectorization of *epilogue* loops in the process of vectorizing loops and
673// their epilogues.
675public:
682 GeneratedRTChecks &Checks, VPlan &Plan)
684 Checks, Plan, EPI.EpilogueVF,
685 EPI.EpilogueVF, EPI.EpilogueUF) {}
686 /// Implements the interface for creating a vectorized skeleton using the
687 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
689
690protected:
691 void printDebugTracesAtStart() override;
692 void printDebugTracesAtEnd() override;
693};
694} // end namespace llvm
695
696/// Look for a meaningful debug location on the instruction or its operands.
/// Returns DebugLoc::getUnknown() when \p I is null. Otherwise the location of
/// \p I is preferred; failing that, the location of the first operand that is
/// an Instruction with a location different from `Empty`; and as a last
/// resort the location of \p I itself, whatever it is.
/// NOTE(review): `Empty` is declared on a line not present in this listing;
/// presumably a default (empty) DebugLoc -- confirm.
698 if (!I)
699 return DebugLoc::getUnknown();
700
702 if (I->getDebugLoc() != Empty)
703 return I->getDebugLoc();
704
// Fall back to the operands: the first instruction operand carrying a
// location wins.
705 for (Use &Op : I->operands()) {
706 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
707 if (OpInst->getDebugLoc() != Empty)
708 return OpInst->getDebugLoc();
709 }
710
711 return I->getDebugLoc();
712}
713
714/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
715/// is passed, the message relates to that particular instruction.
716#ifndef NDEBUG
717static void debugVectorizationMessage(const StringRef Prefix,
718 const StringRef DebugMsg,
719 Instruction *I) {
720 dbgs() << "LV: " << Prefix << DebugMsg;
721 if (I != nullptr)
722 dbgs() << " " << *I;
723 else
724 dbgs() << '.';
725 dbgs() << '\n';
726}
727#endif
728
729/// Create an analysis remark that explains why vectorization failed
730///
731/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
732/// RemarkName is the identifier for the remark. If \p I is passed it is an
733/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
734/// the location of the remark. If \p DL is passed, use it as debug location for
735/// the remark. \return the remark object that can be streamed to.
736static OptimizationRemarkAnalysis
737createLVAnalysis(const char *PassName, StringRef RemarkName,
738 const Loop *TheLoop, Instruction *I, DebugLoc DL = {}) {
739 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
740 // If debug location is attached to the instruction, use it. Otherwise if DL
741 // was not provided, use the loop's.
742 if (I && I->getDebugLoc())
743 DL = I->getDebugLoc();
744 else if (!DL)
745 DL = TheLoop->getStartLoc();
746
747 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
748}
749
750namespace llvm {
751
752/// Return the runtime value for VF.
// The value is produced by asking the builder B for the element count VF in
// type Ty.
754 return B.CreateElementCount(Ty, VF);
755}
756
758 const StringRef OREMsg, const StringRef ORETag,
760 const Loop *TheLoop, Instruction *I) {
// Print the failure on the debug stream (only inside LLVM_DEBUG), then emit
// an analysis remark tagged ORETag whose text is "loop not vectorized: "
// followed by OREMsg.
761 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
762 LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
763 ORE->emit(createLVAnalysis(LV_NAME, ORETag, TheLoop, I)
764 << "loop not vectorized: " << OREMsg);
765}
766
/// Emit an analysis remark carrying \p Msg and tagged with \p ORETag. The
/// remark's code region and debug location are chosen by createLVAnalysis
/// from \p I, \p DL and \p TheLoop.
767void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
769 const Loop *TheLoop, Instruction *I, DebugLoc DL) {
771 LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
772 ORE->emit(createLVAnalysis(LV_NAME, ORETag, TheLoop, I, DL) << Msg);
773}
774
775/// Report successful vectorization of the loop. In case an outer loop is
776/// vectorized, prepend "outer" to the vectorization remark.
778 VectorizationFactor VF, unsigned IC) {
780 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
781 nullptr));
// Empty for innermost loops, "outer " otherwise; spliced into the remark text
// below.
782 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
// The remark is built inside a lambda handed to emit(); it records the
// vectorization width (VF.Width) and the interleave count (IC) as named
// values.
783 ORE->emit([&]() {
784 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
785 TheLoop->getHeader())
786 << "vectorized " << LoopType << "loop (vectorization width: "
787 << ore::NV("VectorizationFactor", VF.Width)
788 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
789 });
790}
791
792} // end namespace llvm
793
794namespace llvm {
795
796// Loop vectorization cost-model hints how the epilogue/tail loop should be
797// lowered.
799
800 // The default: allowing epilogues.
802
803 // Vectorization with OptForSize: don't allow epilogues.
805
806 // A special case of vectorisation with OptForSize: loops with a very small
807 // trip count are considered for vectorization under OptForSize, thereby
808 // making sure the cost of their loop body is dominant, free of runtime
809 // guards and scalar iteration overheads.
811
812 // Loop hint indicating an epilogue is undesired, apply tail folding.
814
815 // Directive indicating we must either fold the epilogue/tail or not vectorize
817};
818
819/// LoopVectorizationCostModel - estimates the expected speedups due to
820/// vectorization.
821/// In many cases vectorization is not profitable. This can happen because of
822/// a number of reasons. In this class we mainly attempt to predict the
823/// expected speedup/slowdowns due to the supported instruction set. We use the
824/// TargetTransformInfo to query the different backends for the cost of
825/// different operations.
828
829public:
843
844 /// \return An upper bound for the vectorization factors (both fixed and
845 /// scalable). If the factors are 0, vectorization and interleaving should be
846 /// avoided up front.
847 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
848
849 /// Memory access instruction may be vectorized in more than one way.
850 /// Form of instruction after vectorization depends on cost.
851 /// This function takes cost-based decisions for Load/Store instructions
852 /// and collects them in a map. This decisions map is used for building
853 /// the lists of loop-uniform and loop-scalar instructions.
854 /// The calculated cost is saved with widening decision in order to
855 /// avoid redundant calculations.
856 void setCostBasedWideningDecision(ElementCount VF);
857
858 /// A call may be vectorized in different ways depending on whether we have
859 /// vectorized variants available and whether the target supports masking.
860 /// This function analyzes all calls in the function at the supplied VF,
861 /// makes a decision based on the costs of available options, and stores that
862 /// decision in a map for use in planning and plan execution.
863 void setVectorizedCallDecision(ElementCount VF);
864
865 /// Collect values we want to ignore in the cost model.
866 void collectValuesToIgnore();
867
868 /// \returns True if it is more profitable to scalarize instruction \p I for
869 /// vectorization factor \p VF.
871 assert(VF.isVector() &&
872 "Profitable to scalarize relevant only for VF > 1.");
873 assert(
874 TheLoop->isInnermost() &&
875 "cost-model should not be used for outer loops (in VPlan-native path)");
876
877 auto Scalars = InstsToScalarize.find(VF);
878 assert(Scalars != InstsToScalarize.end() &&
879 "VF not yet analyzed for scalarization profitability");
880 return Scalars->second.contains(I);
881 }
882
883 /// Returns true if \p I is known to be uniform after vectorization.
885 assert(
886 TheLoop->isInnermost() &&
887 "cost-model should not be used for outer loops (in VPlan-native path)");
888
889 // If VF is scalar, then all instructions are trivially uniform.
890 if (VF.isScalar())
891 return true;
892
893 // Pseudo probes must be duplicated per vector lane so that the
894 // profiled loop trip count is not undercounted.
896 return false;
897
898 auto UniformsPerVF = Uniforms.find(VF);
899 assert(UniformsPerVF != Uniforms.end() &&
900 "VF not yet analyzed for uniformity");
901 return UniformsPerVF->second.count(I);
902 }
903
904 /// Returns true if \p I is known to be scalar after vectorization.
906 assert(
907 TheLoop->isInnermost() &&
908 "cost-model should not be used for outer loops (in VPlan-native path)");
909 if (VF.isScalar())
910 return true;
911
912 auto ScalarsPerVF = Scalars.find(VF);
913 assert(ScalarsPerVF != Scalars.end() &&
914 "Scalar values are not calculated for VF");
915 return ScalarsPerVF->second.count(I);
916 }
917
918 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
919 /// for vectorization factor \p VF.
921 const auto &MinBWs = Config.getMinimalBitwidths();
922 // Truncs must truncate at most to their destination type.
923 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
924 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
925 return false;
926 return VF.isVector() && MinBWs.contains(I) &&
929 }
930
931 /// Decision that was taken during cost calculation for memory instruction.
934 CM_Widen, // For consecutive accesses with stride +1.
935 CM_Widen_Reverse, // For consecutive accesses with stride -1.
941 /// A widening decision that has been invalidated after replacing the
942 /// corresponding recipe during VPlan transforms.
943 /// TODO: Remove once the legacy exit cost computation is retired.
945 };
946
947 /// Save vectorization decision \p W and \p Cost taken by the cost model for
948 /// instruction \p I and vector width \p VF.
951 assert(VF.isVector() && "Expected VF >=2");
952 WideningDecisions[{I, VF}] = {W, Cost};
953 }
954
955 /// Save vectorization decision \p W and \p Cost taken by the cost model for
956 /// interleaving group \p Grp and vector width \p VF.
960 assert(VF.isVector() && "Expected VF >=2");
961 /// Broadcast this decision to all instructions inside the group.
962 /// When interleaving, the cost will only be assigned one instruction, the
963 /// insert position. For other cases, add the appropriate fraction of the
964 /// total cost to each instruction. This ensures accurate costs are used,
965 /// even if the insert position instruction is not used.
966 InstructionCost InsertPosCost = Cost;
967 InstructionCost OtherMemberCost = 0;
968 if (W != CM_Interleave)
969 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
970 ;
971 for (auto *I : Grp->members()) {
972 if (Grp->getInsertPos() == I)
973 WideningDecisions[{I, VF}] = {W, InsertPosCost};
974 else
975 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
976 }
977 }
978
979 /// Return the cost model decision for the given instruction \p I and vector
980 /// width \p VF. Return CM_Unknown if this instruction did not pass
981 /// through the cost modeling.
983 assert(VF.isVector() && "Expected VF to be a vector VF");
984 assert(
985 TheLoop->isInnermost() &&
986 "cost-model should not be used for outer loops (in VPlan-native path)");
987
988 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
989 auto Itr = WideningDecisions.find(InstOnVF);
990 if (Itr == WideningDecisions.end())
991 return CM_Unknown;
992 return Itr->second.first;
993 }
994
995 /// Return the vectorization cost for the given instruction \p I and vector
996 /// width \p VF.
998 assert(VF.isVector() && "Expected VF >=2");
999 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1000 assert(WideningDecisions.contains(InstOnVF) &&
1001 "The cost is not calculated");
1002 return WideningDecisions[InstOnVF].second;
1003 }
1004
1011
1013 Function *Variant, Intrinsic::ID IID,
1015 assert(!VF.isScalar() && "Expected vector VF");
1016 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, Cost};
1017 }
1018
1020 ElementCount VF) const {
1021 assert(!VF.isScalar() && "Expected vector VF");
1022 auto I = CallWideningDecisions.find({CI, VF});
1023 if (I == CallWideningDecisions.end())
1024 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, 0};
1025 return I->second;
1026 }
1027
1028 /// Return True if instruction \p I is an optimizable truncate whose operand
1029 /// is an induction variable. Such a truncate will be removed by adding a new
1030 /// induction variable with the destination type.
1032 // If the instruction is not a truncate, return false.
1033 auto *Trunc = dyn_cast<TruncInst>(I);
1034 if (!Trunc)
1035 return false;
1036
1037 // Get the source and destination types of the truncate.
1038 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1039 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1040
1041 // If the truncate is free for the given types, return false. Replacing a
1042 // free truncate with an induction variable would add an induction variable
1043 // update instruction to each iteration of the loop. We exclude from this
1044 // check the primary induction variable since it will need an update
1045 // instruction regardless.
1046 Value *Op = Trunc->getOperand(0);
1047 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1048 return false;
1049
1050 // If the truncated value is not an induction variable, return false.
1051 return Legal->isInductionPhi(Op);
1052 }
1053
1054 /// Collects the instructions to scalarize for each predicated instruction in
1055 /// the loop.
1056 void collectInstsToScalarize(ElementCount VF);
1057
1058 /// Collect values that will not be widened, including Uniforms, Scalars, and
1059 /// Instructions to Scalarize for the given \p VF.
1060 /// The sets depend on CM decision for Load/Store instructions
1061 /// that may be vectorized as interleave, gather-scatter or scalarized.
1062 /// Also make a decision on what to do about call instructions in the loop
1063 /// at that VF -- scalarize, call a known vector routine, or call a
1064 /// vector intrinsic.
1066 // Do the analysis once.
1067 if (VF.isScalar() || Uniforms.contains(VF))
1068 return;
1070 collectLoopUniforms(VF);
1072 collectLoopScalars(VF);
1074 }
1075
1076 /// Given costs for both strategies, return true if the scalar predication
1077 /// lowering should be used for div/rem. This incorporates an override
1078 /// option so it is not simply a cost comparison.
1080 InstructionCost MaskedCost) const {
1081 switch (ForceMaskedDivRem) {
1082 case cl::BOU_UNSET:
1083 return ScalarCost < MaskedCost;
1084 case cl::BOU_TRUE:
1085 return false;
1086 case cl::BOU_FALSE:
1087 return true;
1088 }
1089 llvm_unreachable("impossible case value");
1090 }
1091
1092 /// Returns true if \p I is an instruction which requires predication and
1093 /// for which our chosen predication strategy is scalarization (i.e. we
1094 /// don't have an alternate strategy such as masking available).
1095 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1096 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1097
1098 /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
1099 /// that passes the Instruction \p I and if we fold tail.
1100 bool isMaskRequired(Instruction *I) const;
1101
1102 /// Returns true if \p I is an instruction that needs to be predicated
1103 /// at runtime. The result is independent of the predication mechanism.
1104 /// Superset of instructions that return true for isScalarWithPredication.
1105 bool isPredicatedInst(Instruction *I) const;
1106
1107 /// A helper function that returns how much we should divide the cost of a
1108 /// predicated block by. Typically this is the reciprocal of the block
1109 /// probability, i.e. if we return X we are assuming the predicated block will
1110 /// execute once for every X iterations of the loop header so the block should
1111 /// only contribute 1/X of its cost to the total cost calculation, but when
1112 /// optimizing for code size it will just be 1 as code size costs don't depend
1113 /// on execution probabilities.
1114 ///
1115 /// Note that if a block wasn't originally predicated but was predicated due
1116 /// to tail folding, the divisor will still be 1 because it will execute for
1117 /// every iteration of the loop header.
1118 inline uint64_t
1119 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1120 const BasicBlock *BB);
1121
1122 /// Returns true if an artificially high cost for emulated masked memrefs
1123 /// should be used.
1124 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1125
1126 /// Return the costs for our two available strategies for lowering a
1127 /// div/rem operation which requires speculating at least one lane.
1128 /// First result is for scalarization (will be invalid for scalable
1129 /// vectors); second is for the masked intrinsic strategy.
1130 std::pair<InstructionCost, InstructionCost>
1131 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1132
1133 /// Returns true if \p I is a memory instruction with consecutive memory
1134 /// access that can be widened.
1135 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1136
1137 /// Returns true if \p I is a memory instruction in an interleaved-group
1138 /// of memory accesses that can be vectorized with wide vector loads/stores
1139 /// and shuffles.
1140 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1141
1142 /// Check if \p Instr belongs to any interleaved access group.
1144 return InterleaveInfo.isInterleaved(Instr);
1145 }
1146
1147 /// Get the interleaved access group that \p Instr belongs to.
1150 return InterleaveInfo.getInterleaveGroup(Instr);
1151 }
1152
1153 /// Returns true if we're required to use a scalar epilogue for at least
1154 /// the final iteration of the original loop.
1155 bool requiresScalarEpilogue(bool IsVectorizing) const {
1156 if (!isEpilogueAllowed()) {
1157 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1158 return false;
1159 }
1160 // If we might exit from anywhere but the latch and early exit vectorization
1161 // is disabled, we must run the exiting iteration in scalar form.
1162 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1163 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1164 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1165 "from latch block\n");
1166 return true;
1167 }
1168 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1169 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1170 "interleaved group requires scalar epilogue\n");
1171 return true;
1172 }
1173 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1174 return false;
1175 }
1176
1177 /// Returns true if an epilogue is allowed (e.g., not prevented by
1178 /// optsize or a loop hint annotation).
1179 bool isEpilogueAllowed() const {
1180 return EpilogueLoweringStatus == CM_EpilogueAllowed;
1181 }
1182
1183 /// Returns true if tail-folding is preferred over an epilogue.
1185 return EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail ||
1186 EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail;
1187 }
1188
1189 /// Returns the TailFoldingStyle that is best for the current loop.
1191 return ChosenTailFoldingStyle;
1192 }
1193
1194 /// Selects and saves TailFoldingStyle.
1195 /// \param IsScalableVF true if scalable vector factors enabled.
1196 /// \param UserIC User specific interleave count.
1197 void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
1198 assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
1199 "Tail folding must not be selected yet.");
1200 if (!Legal->canFoldTailByMasking()) {
1201 ChosenTailFoldingStyle = TailFoldingStyle::None;
1202 return;
1203 }
1204
1205 // Default to TTI preference, but allow command line override.
1206 ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
1207 if (ForceTailFoldingStyle.getNumOccurrences())
1208 ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();
1209
1210 if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1211 return;
1212 // Override EVL styles if needed.
1213 // FIXME: Investigate opportunity for fixed vector factor.
1214 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1215 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1216 if (EVLIsLegal)
1217 return;
1218 // If for some reason EVL mode is unsupported, fallback to an epilogue
1219 // if it's allowed, or DataWithoutLaneMask otherwise.
1220 if (EpilogueLoweringStatus == CM_EpilogueAllowed ||
1221 EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail)
1222 ChosenTailFoldingStyle = TailFoldingStyle::None;
1223 else
1224 ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;
1225
1226 LLVM_DEBUG(
1227 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1228 "not try to generate VP Intrinsics "
1229 << (UserIC > 1
1230 ? "since interleave count specified is greater than 1.\n"
1231 : "due to non-interleaving reasons.\n"));
1232 }
1233
1234 /// Returns true if all loop blocks should be masked to fold tail loop.
1235 bool foldTailByMasking() const {
1237 }
1238
1239 /// Returns true if the use of wide lane masks is requested and the loop is
1240 /// using tail-folding with a lane mask for control flow.
1243 return false;
1244
1246 }
1247
1248 /// Returns true if the instructions in this block require predication
1249 /// for any reason, e.g. because tail folding now requires a predicate
1250 /// or because the block in the original loop was predicated.
1252 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1253 }
1254
1255 /// Returns true if VP intrinsics with explicit vector length support should
1256 /// be generated in the tail folded loop.
1260
1261 /// Returns true if the predicated reduction select should be used to set the
1262 /// incoming value for the reduction phi.
1263 bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
1264 // Force to use predicated reduction select since the EVL of the
1265 // second-to-last iteration might not be VF*UF.
1266 if (foldTailWithEVL())
1267 return true;
1268
1269 // Note: For FindLast recurrences we prefer a predicated select to simplify
1270 // matching in handleFindLastReductions(), rather than handle multiple
1271 // cases.
1273 return true;
1274
1276 TTI.preferPredicatedReductionSelect();
1277 }
1278
1279 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1280 /// with factor VF. Return the cost of the instruction, including
1281 /// scalarization overhead if it's needed.
1282 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1283
1284 /// Estimate cost of a call instruction CI if it were vectorized with factor
1285 /// VF. Return the cost of the instruction, including scalarization overhead
1286 /// if it's needed.
1287 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1288
1289 /// Invalidates decisions already taken by the cost model.
1291 WideningDecisions.clear();
1292 CallWideningDecisions.clear();
1293 Uniforms.clear();
1294 Scalars.clear();
1295 }
1296
1297 /// Returns the expected execution cost. The unit of the cost does
1298 /// not matter because we use the 'cost' units to compare different
1299 /// vector widths. The cost that is returned is *not* normalized by
1300 /// the factor width.
1301 InstructionCost expectedCost(ElementCount VF);
1302
1303 /// Returns true if epilogue vectorization is considered profitable, and
1304 /// false otherwise.
1305 /// \p VF is the vectorization factor chosen for the original loop.
1306 /// \p IC is an additional scaling factor applied to VF before
1307 /// comparing to EpilogueVectorizationMinVF.
1308 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1309 const unsigned IC) const;
1310
1311 /// Returns the execution time cost of an instruction for a given vector
1312 /// width. Vector width of one means scalar.
1313 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1314
1315 /// Return the cost of instructions in an inloop reduction pattern, if I is
1316 /// part of that pattern.
1317 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1318 ElementCount VF,
1319 Type *VectorTy) const;
1320
1321 /// Returns true if \p Op should be considered invariant and if it is
1322 /// trivially hoistable.
1323 bool shouldConsiderInvariant(Value *Op);
1324
1325 /// Returns true if \p I has been forced to be scalarized at \p VF.
1327 auto FS = ForcedScalars.find(VF);
1328 return FS != ForcedScalars.end() && FS->second.contains(I);
1329 }
1330
1331private:
1332 unsigned NumPredStores = 0;
1333
1334 /// VF selection state independent of cost-modeling decisions.
1335 VFSelectionContext &Config;
1336
1337 /// Calculate vectorization cost of memory instruction \p I.
1338 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1339
1340 /// The cost computation for scalarized memory instruction.
1341 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1342
1343 /// The cost computation for interleaving group of memory instructions.
1344 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1345
1346 /// The cost computation for Gather/Scatter instruction.
1347 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1348
1349 /// The cost computation for widening instruction \p I with consecutive
1350 /// memory access.
1351 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1352
1353 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1354 /// Load: scalar load + broadcast.
1355 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1356 /// element)
1357 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1358
1359 /// Estimate the overhead of scalarizing an instruction. This is a
1360 /// convenience wrapper for the type-based getScalarizationOverhead API.
1362 ElementCount VF) const;
1363
1364 /// A type representing the costs for instructions if they were to be
1365 /// scalarized rather than vectorized. The entries are Instruction-Cost
1366 /// pairs.
1367 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1368
1369 /// A set containing all BasicBlocks that are known to be present after
1370 /// vectorization as a predicated block.
1372 PredicatedBBsAfterVectorization;
1373
1374 /// Records whether it is allowed to have the original scalar loop execute at
1375 /// least once. This may be needed as a fallback loop in case runtime
1376 /// aliasing/dependence checks fail, or to handle the tail/remainder
1377 /// iterations when the trip count is unknown or doesn't divide by the VF,
1378 /// or as a peel-loop to handle gaps in interleave-groups.
1379 /// Under optsize and when the trip count is very small we don't allow any
1380 /// iterations to execute in the scalar loop.
1381 EpilogueLowering EpilogueLoweringStatus = CM_EpilogueAllowed;
1382
1383 /// Control finally chosen tail folding style.
1384 TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;
1385
1386 /// A map holding scalar costs for different vectorization factors. The
1387 /// presence of a cost for an instruction in the mapping indicates that the
1388 /// instruction will be scalarized when vectorizing with the associated
1389 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1391
1392 /// Holds the instructions known to be uniform after vectorization.
1393 /// The data is collected per VF.
1395
1396 /// Holds the instructions known to be scalar after vectorization.
1397 /// The data is collected per VF.
1399
1400 /// Holds the instructions (address computations) that are forced to be
1401 /// scalarized.
1403
1404 /// Returns the expected difference in cost from scalarizing the expression
1405 /// feeding a predicated instruction \p PredInst. The instructions to
1406 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1407 /// non-negative return value implies the expression will be scalarized.
1408 /// Currently, only single-use chains are considered for scalarization.
1409 InstructionCost computePredInstDiscount(Instruction *PredInst,
1410 ScalarCostsTy &ScalarCosts,
1411 ElementCount VF);
1412
1413 /// Collect the instructions that are uniform after vectorization. An
1414 /// instruction is uniform if we represent it with a single scalar value in
1415 /// the vectorized loop corresponding to each vector iteration. Examples of
1416 /// uniform instructions include pointer operands of consecutive or
1417 /// interleaved memory accesses. Note that although uniformity implies an
1418 /// instruction will be scalar, the reverse is not true. In general, a
1419 /// scalarized instruction will be represented by VF scalar values in the
1420 /// vectorized loop, each corresponding to an iteration of the original
1421 /// scalar loop.
1422 void collectLoopUniforms(ElementCount VF);
1423
1424 /// Collect the instructions that are scalar after vectorization. An
1425 /// instruction is scalar if it is known to be uniform or will be scalarized
1426 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1427 /// to the list if they are used by a load/store instruction that is marked as
1428 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1429 /// VF values in the vectorized loop, each corresponding to an iteration of
1430 /// the original scalar loop.
1431 void collectLoopScalars(ElementCount VF);
1432
1433 /// Keeps cost model vectorization decision and cost for instructions.
1434 /// Right now it is used for memory instructions only.
1436 std::pair<InstWidening, InstructionCost>>;
1437
1438 DecisionList WideningDecisions;
1439
1440 using CallDecisionList =
1441 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1442
1443 CallDecisionList CallWideningDecisions;
1444
1445 /// Returns true if \p V is expected to be vectorized and it needs to be
1446 /// extracted.
1447 bool needsExtract(Value *V, ElementCount VF) const {
1449 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1450 TheLoop->isLoopInvariant(I) ||
1451 getWideningDecision(I, VF) == CM_Scalarize ||
1452 (isa<CallInst>(I) &&
1453 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1454 return false;
1455
1456 // Assume we can vectorize V (and hence we need extraction) if the
1457 // scalars are not computed yet. This can happen, because it is called
1458 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1459 // the scalars are collected. That should be a safe assumption in most
1460 // cases, because we check if the operands have vectorizable types
1461 // beforehand in LoopVectorizationLegality.
1462 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1463 };
1464
1465 /// Returns a range containing only operands needing to be extracted.
1466 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1467 ElementCount VF) const {
1468
1469 SmallPtrSet<const Value *, 4> UniqueOperands;
1471 for (Value *Op : Ops) {
1472 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1473 !needsExtract(Op, VF))
1474 continue;
1475 Res.push_back(Op);
1476 }
1477 return Res;
1478 }
1479
1480public:
1481 /// The loop that we evaluate.
1483
1484 /// Predicated scalar evolution analysis.
1486
1487 /// Loop Info analysis.
1489
1490 /// Vectorization legality.
1492
1493 /// Vector target information.
1495
1496 /// Target Library Info.
1498
1499 /// Assumption cache.
1501
1502 /// Interface to emit optimization remarks.
1504
1505 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1506 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1507 /// there is no predication.
1508 std::function<BlockFrequencyInfo &()> GetBFI;
1509 /// The BlockFrequencyInfo returned from GetBFI.
1511 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1512 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1514 if (!BFI)
1515 BFI = &GetBFI();
1516 return *BFI;
1517 }
1518
1520
1521 /// Loop Vectorize Hint.
1523
1524 /// The interleave access information contains groups of interleaved accesses
1525 /// with the same stride and close to each other.
1527
1528 /// Values to ignore in the cost model.
1530
1531 /// Values to ignore in the cost model when VF > 1.
1533};
1534} // end namespace llvm
1535
1536namespace {
1537/// Helper struct to manage generating runtime checks for vectorization.
1538///
1539/// The runtime checks are created up-front in temporary blocks to allow better
1540/// estimating the cost and un-linked from the existing IR. After deciding to
1541/// vectorize, the checks are moved back. If deciding not to vectorize, the
1542/// temporary blocks are completely removed.
1543class GeneratedRTChecks {
1544 /// Basic block which contains the generated SCEV checks, if any.
1545 BasicBlock *SCEVCheckBlock = nullptr;
1546
1547 /// The value representing the result of the generated SCEV checks. If it is
1548 /// nullptr no SCEV checks have been generated.
1549 Value *SCEVCheckCond = nullptr;
1550
1551 /// Basic block which contains the generated memory runtime checks, if any.
1552 BasicBlock *MemCheckBlock = nullptr;
1553
1554 /// The value representing the result of the generated memory runtime checks.
1555 /// If it is nullptr no memory runtime checks have been generated.
1556 Value *MemRuntimeCheckCond = nullptr;
1557
1558 DominatorTree *DT;
1559 LoopInfo *LI;
1561
1562 SCEVExpander SCEVExp;
1563 SCEVExpander MemCheckExp;
1564
1565 bool CostTooHigh = false;
1566
1567 Loop *OuterLoop = nullptr;
1568
1570
1571 /// The kind of cost that we are calculating
1573
1574public:
1575 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1578 : DT(DT), LI(LI), TTI(TTI),
1579 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1580 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1581 PSE(PSE), CostKind(CostKind) {}
1582
1583 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1584 /// accurately estimate the cost of the runtime checks. The blocks are
1585 /// un-linked from the IR and are added back during vector code generation. If
1586 /// there is no vector code generation, the check blocks are removed
1587 /// completely.
1588 void create(Loop *L, const LoopAccessInfo &LAI,
1589 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1590 OptimizationRemarkEmitter &ORE) {
1591
1592 // Hard cutoff to limit compile-time increase in case a very large number of
1593 // runtime checks needs to be generated.
1594 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1595 // profile info.
1596 CostTooHigh =
1598 if (CostTooHigh) {
1599 // Mark runtime checks as never succeeding when they exceed the threshold.
1600 MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1601 SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1602 ORE.emit([&]() {
1603 return OptimizationRemarkAnalysisAliasing(
1604 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1605 L->getHeader())
1606 << "loop not vectorized: too many memory checks needed";
1607 });
1608 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1609 return;
1610 }
1611
1612 BasicBlock *LoopHeader = L->getHeader();
1613 BasicBlock *Preheader = L->getLoopPreheader();
1614
1615 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1616 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1617 // may be used by SCEVExpander. The blocks will be un-linked from their
1618 // predecessors and removed from LI & DT at the end of the function.
1619 if (!UnionPred.isAlwaysTrue()) {
1620 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1621 nullptr, "vector.scevcheck");
1622
1623 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1624 &UnionPred, SCEVCheckBlock->getTerminator());
1625 if (isa<Constant>(SCEVCheckCond)) {
1626 // Clean up directly after expanding the predicate to a constant, to
1627 // avoid further expansions re-using anything left over from SCEVExp.
1628 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1629 SCEVCleaner.cleanup();
1630 }
1631 }
1632
1633 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1634 if (RtPtrChecking.Need) {
1635 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1636 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1637 "vector.memcheck");
1638
1639 auto DiffChecks = RtPtrChecking.getDiffChecks();
1640 if (DiffChecks) {
1641 Value *RuntimeVF = nullptr;
1642 MemRuntimeCheckCond = addDiffRuntimeChecks(
1643 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1644 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1645 if (!RuntimeVF)
1646 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1647 return RuntimeVF;
1648 },
1649 IC);
1650 } else {
1651 MemRuntimeCheckCond = addRuntimeChecks(
1652 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1654 }
1655 assert(MemRuntimeCheckCond &&
1656 "no RT checks generated although RtPtrChecking "
1657 "claimed checks are required");
1658 }
1659
1660 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1661
1662 if (!MemCheckBlock && !SCEVCheckBlock)
1663 return;
1664
1665 // Unhook the temporary block with the checks, update various places
1666 // accordingly.
1667 if (SCEVCheckBlock)
1668 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1669 if (MemCheckBlock)
1670 MemCheckBlock->replaceAllUsesWith(Preheader);
1671
1672 if (SCEVCheckBlock) {
1673 SCEVCheckBlock->getTerminator()->moveBefore(
1674 Preheader->getTerminator()->getIterator());
1675 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1676 UI->setDebugLoc(DebugLoc::getTemporary());
1677 Preheader->getTerminator()->eraseFromParent();
1678 }
1679 if (MemCheckBlock) {
1680 MemCheckBlock->getTerminator()->moveBefore(
1681 Preheader->getTerminator()->getIterator());
1682 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1683 UI->setDebugLoc(DebugLoc::getTemporary());
1684 Preheader->getTerminator()->eraseFromParent();
1685 }
1686
1687 DT->changeImmediateDominator(LoopHeader, Preheader);
1688 if (MemCheckBlock) {
1689 DT->eraseNode(MemCheckBlock);
1690 LI->removeBlock(MemCheckBlock);
1691 }
1692 if (SCEVCheckBlock) {
1693 DT->eraseNode(SCEVCheckBlock);
1694 LI->removeBlock(SCEVCheckBlock);
1695 }
1696
1697 // Outer loop is used as part of the later cost calculations.
1698 OuterLoop = L->getParentLoop();
1699 }
1700
1702 if (SCEVCheckBlock || MemCheckBlock)
1703 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1704
1705 if (CostTooHigh) {
1707 Cost.setInvalid();
1708 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1709 return Cost;
1710 }
1711
1712 InstructionCost RTCheckCost = 0;
1713 if (SCEVCheckBlock)
1714 for (Instruction &I : *SCEVCheckBlock) {
1715 if (SCEVCheckBlock->getTerminator() == &I)
1716 continue;
1718 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1719 RTCheckCost += C;
1720 }
1721 if (MemCheckBlock) {
1722 InstructionCost MemCheckCost = 0;
1723 for (Instruction &I : *MemCheckBlock) {
1724 if (MemCheckBlock->getTerminator() == &I)
1725 continue;
1727 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1728 MemCheckCost += C;
1729 }
1730
1731 // If the runtime memory checks are being created inside an outer loop
1732 // we should find out if these checks are outer loop invariant. If so,
1733 // the checks will likely be hoisted out and so the effective cost will
1734 // reduce according to the outer loop trip count.
1735 if (OuterLoop) {
1736 ScalarEvolution *SE = MemCheckExp.getSE();
1737 // TODO: If profitable, we could refine this further by analysing every
1738 // individual memory check, since there could be a mixture of loop
1739 // variant and invariant checks that mean the final condition is
1740 // variant.
1741 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1742 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1743 // It seems reasonable to assume that we can reduce the effective
1744 // cost of the checks even when we know nothing about the trip
1745 // count. Assume that the outer loop executes at least twice.
1746 unsigned BestTripCount = 2;
1747
1748 // Get the best known TC estimate.
1749 if (auto EstimatedTC = getSmallBestKnownTC(
1750 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1751 if (EstimatedTC->isFixed())
1752 BestTripCount = EstimatedTC->getFixedValue();
1753
1754 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1755
1756 // Let's ensure the cost is always at least 1.
1757 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1758 (InstructionCost::CostType)1);
1759
1760 if (BestTripCount > 1)
1762 << "We expect runtime memory checks to be hoisted "
1763 << "out of the outer loop. Cost reduced from "
1764 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1765
1766 MemCheckCost = NewMemCheckCost;
1767 }
1768 }
1769
1770 RTCheckCost += MemCheckCost;
1771 }
1772
1773 if (SCEVCheckBlock || MemCheckBlock)
1774 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1775 << "\n");
1776
1777 return RTCheckCost;
1778 }
1779
1780 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1781 /// unused.
1782 ~GeneratedRTChecks() {
1783 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1784 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1785 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1786 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1787 if (SCEVChecksUsed)
1788 SCEVCleaner.markResultUsed();
1789
1790 if (MemChecksUsed) {
1791 MemCheckCleaner.markResultUsed();
1792 } else {
1793 auto &SE = *MemCheckExp.getSE();
1794 // Memory runtime check generation creates compares that use expanded
1795 // values. Remove them before running the SCEVExpanderCleaners.
1796 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1797 if (MemCheckExp.isInsertedInstruction(&I))
1798 continue;
1799 SE.forgetValue(&I);
1800 I.eraseFromParent();
1801 }
1802 }
1803 MemCheckCleaner.cleanup();
1804 SCEVCleaner.cleanup();
1805
1806 if (!SCEVChecksUsed)
1807 SCEVCheckBlock->eraseFromParent();
1808 if (!MemChecksUsed)
1809 MemCheckBlock->eraseFromParent();
1810 }
1811
1812 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
1813 /// outside VPlan.
1814 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
1815 using namespace llvm::PatternMatch;
1816 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
1817 return {nullptr, nullptr};
1818
1819 return {SCEVCheckCond, SCEVCheckBlock};
1820 }
1821
1822 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
1823 /// outside VPlan.
     ///
     /// \returns {nullptr, nullptr} when MemRuntimeCheckCond is non-null and
     /// matches m_ZeroInt(); otherwise {MemRuntimeCheckCond, MemCheckBlock}.
     ///
     /// NOTE(review): unlike getSCEVChecks(), a null condition is not filtered
     /// out here, so the result can be {nullptr, MemCheckBlock}. Confirm that
     /// callers rely on this before unifying the two getters.
1824 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
1825 using namespace llvm::PatternMatch;
1826 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
1827 return {nullptr, nullptr};
1828 return {MemRuntimeCheckCond, MemCheckBlock};
1829 }
1830
1831 /// Return true if any runtime checks have been added, i.e. if either
     /// getSCEVChecks() or getMemRuntimeChecks() reports a non-null condition.
1832 bool hasChecks() const {
1833 return getSCEVChecks().first || getMemRuntimeChecks().first;
1834 }
1835};
1836} // namespace
1837
1839 return Style == TailFoldingStyle::Data ||
1841}
1842
1846
1847// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1848// vectorization. The loop needs to be annotated with #pragma omp simd
1849// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1850// vector length information is not provided, vectorization is not considered
1851// explicit. Interleave hints are not allowed either. These limitations will be
1852// relaxed in the future.
1853// Please note that we are currently forced to abuse the pragma 'clang
1854// vectorize' semantics. This pragma provides *auto-vectorization hints*
1855// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1856// provides *explicit vectorization hints* (LV can bypass legal checks and
1857// assume that vectorization is legal). However, both hints are implemented
1858// using the same metadata (llvm.loop.vectorize, processed by
1859// LoopVectorizeHints). This will be fixed in the future when the native IR
1860// representation for pragma 'omp simd' is introduced.
//
// \p OuterLp must not be an innermost loop (asserted). Besides the early
// return for unannotated loops, the function returns false (with a debug
// message) when the hints do not allow vectorization under
// VectorizeOnlyWhenForced, or when an interleave count greater than 1 is
// requested, in which case a remark is emitted as well.
1861static bool isExplicitVecOuterLoop(Loop *OuterLp,
1863 assert(!OuterLp->isInnermost() && "This is not an outer loop");
1864 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1865
1866 // Only outer loops with an explicit vectorization hint are supported.
1867 // Unannotated outer loops are ignored.
1869 return false;
1870
1871 Function *Fn = OuterLp->getHeader()->getParent();
1872 if (!Hints.allowVectorization(Fn, OuterLp,
1873 true /*VectorizeOnlyWhenForced*/)) {
1874 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1875 return false;
1876 }
1877
1878 if (Hints.getInterleave() > 1) {
1879 // TODO: Interleave support is future work.
1880 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1881 "outer loops.\n");
1882 Hints.emitRemarkWithHints();
1883 return false;
1884 }
1885
1886 return true;
1887}
1888
1892 // Collect inner loops and outer loops without irreducible control flow. For
1893 // now, only collect outer loops that have explicit vectorization hints. If we
1894 // are stress testing the VPlan H-CFG construction, we collect the outermost
1895 // loop of every loop nest.
1896 if (L.isInnermost() || VPlanBuildOuterloopStressTest ||
1898 LoopBlocksRPO RPOT(&L);
1899 RPOT.perform(LI);
1901 V.push_back(&L);
1902 // TODO: Collect inner loops inside marked outer loops in case
1903 // vectorization fails for the outer loop. Do not invoke
1904 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1905 // already known to be reducible. We can use an inherited attribute for
1906 // that.
1907 return;
1908 }
1909 }
1910 for (Loop *InnerL : L)
1911 collectSupportedLoops(*InnerL, LI, ORE, V);
1912}
1913
1914//===----------------------------------------------------------------------===//
1915// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1916// LoopVectorizationCostModel and LoopVectorizationPlanner.
1917//===----------------------------------------------------------------------===//
1918
1919/// For the given VF and UF and maximum trip count computed for the loop, return
1920/// whether the induction variable might overflow in the vectorized loop. If not,
1921/// then we know a runtime overflow check always evaluates to false and can be
1922/// removed.
1924 const LoopVectorizationCostModel *Cost,
1925 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
1926 // Always be conservative if we don't know the exact unroll factor.
1927 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
1928
1929 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
1930 APInt MaxUIntTripCount = IdxTy->getMask();
1931
1932 // The runtime overflow check is known to be false iff the (max) trip-count
1933 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
1934 // the vector loop induction variable.
1935 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
1936 uint64_t MaxVF = VF.getKnownMinValue();
1937 if (VF.isScalable()) {
1938 std::optional<unsigned> MaxVScale =
1939 getMaxVScale(*Cost->TheFunction, Cost->TTI);
1940 if (!MaxVScale)
1941 return false;
1942 MaxVF *= *MaxVScale;
1943 }
1944
1945 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
1946 }
1947
1948 return false;
1949}
1950
1951// Return whether we allow using masked interleave-groups (for dealing with
1952// strided loads/stores that reside in predicated blocks, or for dealing
1953// with gaps).
1955 // If an override option has been passed in for interleaved accesses, use it.
1956 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1958
1959 return TTI.enableMaskedInterleavedAccessVectorization();
1960}
1961
1962/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
1963/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
1964/// predecessors and successors of VPBB, if any, are rewired to the new
1965/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
1967 BasicBlock *IRBB,
1968 VPlan *Plan = nullptr) {
1969 if (!Plan)
1970 Plan = VPBB->getPlan();
1971 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
1972 auto IP = IRVPBB->begin();
1973 for (auto &R : make_early_inc_range(VPBB->phis()))
1974 R.moveBefore(*IRVPBB, IP);
1975
1976 for (auto &R :
1978 R.moveBefore(*IRVPBB, IRVPBB->end());
1979
1980 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
1981 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
1982 return IRVPBB;
1983}
1984
1986 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
1987 assert(VectorPH && "Invalid loop structure");
1988 assert((OrigLoop->getUniqueLatchExitBlock() ||
1989 Cost->requiresScalarEpilogue(VF.isVector())) &&
1990 "loops not exiting via the latch without required epilogue?");
1991
1992 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
1993 // wrapping the newly created scalar preheader here at the moment, because the
1994 // Plan's scalar preheader may be unreachable at this point. Instead it is
1995 // replaced in executePlan.
1996 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
1997 Twine(Prefix) + "scalar.ph");
1998}
1999
2000/// Knowing that loop \p L executes a single vector iteration, add instructions
2001/// that will get simplified and thus should not have any cost to \p
2002/// InstsToIgnore.
2005 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2006 auto *Cmp = L->getLatchCmpInst();
2007 if (Cmp)
2008 InstsToIgnore.insert(Cmp);
2009 for (const auto &KV : IL) {
2010 // Extract the key by hand so that it can be used in the lambda below. Note
2011 // that captured structured bindings are a C++20 extension.
2012 const PHINode *IV = KV.first;
2013
2014 // Get next iteration value of the induction variable.
2015 Instruction *IVInst =
2016 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2017 if (all_of(IVInst->users(),
2018 [&](const User *U) { return U == IV || U == Cmp; }))
2019 InstsToIgnore.insert(IVInst);
2020 }
2021}
2022
2024 // Create a new IR basic block for the scalar preheader.
2025 BasicBlock *ScalarPH = createScalarPreheader("");
2026 return ScalarPH->getSinglePredecessor();
2027}
2028
2029namespace {
2030
/// Key-info traits for Instruction pointers: supplies the empty/tombstone
/// sentinel keys, a hash over the opcode and operand values, and equality via
/// Instruction::isIdenticalTo(), so that two distinct but identical
/// instructions are treated as the same key.
2031struct CSEDenseMapInfo {
     /// Filter deciding which instructions may be used as keys; getHashValue()
     /// asserts it.
2032 static bool canHandle(const Instruction *I) {
2035 }
2036
2037 static inline Instruction *getEmptyKey() {
2039 }
2040
     /// Tombstone sentinel, forwarded from the pointer-based DenseMapInfo.
2041 static inline Instruction *getTombstoneKey() {
2042 return DenseMapInfo<Instruction *>::getTombstoneKey();
2043 }
2044
     /// Hash of the opcode combined with the hash of all operand values.
2045 static unsigned getHashValue(const Instruction *I) {
2046 assert(canHandle(I) && "Unknown instruction!");
2047 return hash_combine(I->getOpcode(),
2048 hash_combine_range(I->operand_values()));
2049 }
2050
     /// Sentinel keys are compared by pointer only (they must never be
     /// dereferenced); real instructions are compared with isIdenticalTo().
2051 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2052 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2053 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2054 return LHS == RHS;
2055 return LHS->isIdenticalTo(RHS);
2056 }
2057};
2058
2059} // end anonymous namespace
2060
2061/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2062/// removal, in favor of the VPlan-based one.
///
/// Walks the instructions of \p BB in order. Each instruction accepted by
/// CSEDenseMapInfo::canHandle() is either recorded in CSEMap or, when CSEMap
/// already holds a matching instruction, has all of its uses redirected to
/// that earlier instruction and is erased.
2063static void legacyCSE(BasicBlock *BB) {
2064 // Perform simple CSE.
     // In may be erased inside the loop body, hence the early-increment range.
2066 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2067 if (!CSEDenseMapInfo::canHandle(&In))
2068 continue;
2069
2070 // Check if we can replace this instruction with any of the
2071 // visited instructions.
2072 if (Instruction *V = CSEMap.lookup(&In)) {
2073 In.replaceAllUsesWith(V);
2074 In.eraseFromParent();
2075 continue;
2076 }
2077
     // First occurrence: remember it as the representative for later matches.
2078 CSEMap[&In] = &In;
2079 }
2080}
2081
2082/// This function attempts to return a value that represents the ElementCount
2083/// at runtime. For fixed-width VFs we know this precisely at compile
2084/// time, but for scalable VFs we calculate it based on an estimate of the
2085/// vscale value.
2087 std::optional<unsigned> VScale) {
2088 unsigned EstimatedVF = VF.getKnownMinValue();
2089 if (VF.isScalable())
2090 if (VScale)
2091 EstimatedVF *= *VScale;
2092 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2093 return EstimatedVF;
2094}
2095
2098 ElementCount VF) const {
2099 // We only need to calculate a cost if the VF is scalar; for actual vectors
2100 // we should already have a pre-calculated cost at each VF.
2101 if (!VF.isScalar())
2102 return getCallWideningDecision(CI, VF).Cost;
2103
2104 Type *RetTy = CI->getType();
2106 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2107 return *RedCost;
2108
2110 for (auto &ArgOp : CI->args())
2111 Tys.push_back(ArgOp->getType());
2112
2113 InstructionCost ScalarCallCost = TTI.getCallInstrCost(
2114 CI->getCalledFunction(), RetTy, Tys, Config.CostKind);
2115
2116 // If this is an intrinsic we may have a lower cost for it.
2119 return std::min(ScalarCallCost, IntrinsicCost);
2120 }
2121 return ScalarCallCost;
2122}
2123
2125 if (VF.isScalar() || !canVectorizeTy(Ty))
2126 return Ty;
2127 return toVectorizedTy(Ty, VF);
2128}
2129
2132 ElementCount VF) const {
2134 assert(ID && "Expected intrinsic call!");
2135 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2136 FastMathFlags FMF;
2137 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2138 FMF = FPMO->getFastMathFlags();
2139
2142 SmallVector<Type *> ParamTys;
2143 std::transform(FTy->param_begin(), FTy->param_end(),
2144 std::back_inserter(ParamTys),
2145 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2146
2147 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2150 return TTI.getIntrinsicInstrCost(CostAttrs, Config.CostKind);
2151}
2152
2154 // Fix widened non-induction PHIs by setting up the PHI operands.
2155 fixNonInductionPHIs(State);
2156
2157 // Don't apply optimizations below when no (vector) loop remains, as they all
2158 // require one at the moment.
2159 VPBasicBlock *HeaderVPBB =
2160 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2161 if (!HeaderVPBB)
2162 return;
2163
2164 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2165
2166 // Remove redundant induction instructions.
2167 legacyCSE(HeaderBB);
2168}
2169
2171 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2173 for (VPRecipeBase &P : VPBB->phis()) {
2175 if (!VPPhi)
2176 continue;
2177 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2178 // Make sure the builder has a valid insert point.
2179 Builder.SetInsertPoint(NewPhi);
2180 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2181 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2182 }
2183 }
2184}
2185
/// Populate Scalars[VF] for the vector factor \p VF.
///
/// For scalable VFs, Scalars[VF] is simply a copy of Uniforms[VF]. Otherwise a
/// worklist is seeded with Uniforms[VF], the loop-varying getelementptr
/// instructions classified as scalar pointers, and the ForcedScalars entry for
/// \p VF. The worklist is then extended through getelementptr operand chains
/// and with induction variables (plus their updates) whose in-loop users are
/// all scalar, and finally copied into Scalars[VF].
2186void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2187 // We should not collect Scalars more than once per VF. Right now, this
2188 // function is called from collectUniformsAndScalars(), which already does
2189 // this check. Collecting Scalars for VF=1 does not make any sense.
2190 assert(VF.isVector() && !Scalars.contains(VF) &&
2191 "This function should not be visited twice for the same VF");
2192
2193 // This avoids any chances of creating a REPLICATE recipe during planning
2194 // since that would result in generation of scalarized code during execution,
2195 // which is not supported for scalable vectors.
2196 if (VF.isScalable()) {
2197 Scalars[VF].insert_range(Uniforms[VF]);
2198 return;
2199 }
2200
2202
2203 // These sets are used to seed the analysis with pointers used by memory
2204 // accesses that will remain scalar.
2206 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2207 auto *Latch = TheLoop->getLoopLatch();
2208
2209 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2210 // The pointer operands of loads and stores will be scalar as long as the
2211 // memory access is not a gather or scatter operation. The value operand of a
2212 // store will remain scalar if the store is scalarized.
2213 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2214 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2215 assert(WideningDecision != CM_Unknown &&
2216 "Widening decision should be ready at this moment");
2217 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2218 if (Ptr == Store->getValueOperand())
2219 return WideningDecision == CM_Scalarize;
2220 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2221 "Ptr is neither a value or pointer operand");
2222 return WideningDecision != CM_GatherScatter;
2223 };
2224
2225 // A helper that returns true if the given value is a getelementptr
2226 // instruction that is not loop-invariant with respect to TheLoop.
2227 auto IsLoopVaryingGEP = [&](Value *V) {
2228 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2229 };
2230
2231 // A helper that evaluates a memory access's use of a pointer. If the use will
2232 // be a scalar use and the pointer is only used by memory accesses, we place
2233 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2234 // PossibleNonScalarPtrs.
2235 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2236 // We only care about loop-varying getelementptr instructions; any other
2237 // value is ignored.
2238 if (!IsLoopVaryingGEP(Ptr))
2239 return;
2240
2241 // If the pointer has already been identified as scalar (e.g., if it was
2242 // also identified as uniform), there's nothing to do.
2243 auto *I = cast<Instruction>(Ptr);
2244 if (Worklist.count(I))
2245 return;
2246
2247 // If the use of the pointer will be a scalar use, and all users of the
2248 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2249 // place the pointer in PossibleNonScalarPtrs.
2250 if (IsScalarUse(MemAccess, Ptr) &&
2252 ScalarPtrs.insert(I);
2253 else
2254 PossibleNonScalarPtrs.insert(I);
2255 };
2256
2257 // We seed the scalars analysis with three classes of instructions: (1)
2258 // instructions marked uniform-after-vectorization, (2) getelementptr
2259 // instructions used by memory accesses requiring a scalar use, and (3) the
2260 // forced scalars recorded for this VF.
2261 //
2262 // (1) Add to the worklist all instructions that have been identified as
2263 // uniform-after-vectorization.
2264 Worklist.insert_range(Uniforms[VF]);
2265
2266 // (2) Add to the worklist all getelementptr instructions used by
2267 // memory accesses requiring a scalar use. The pointer operands of loads and
2268 // stores will be scalar unless the operation is a gather or scatter.
2269 // The value operand of a store will remain scalar if the store is scalarized.
2270 for (auto *BB : TheLoop->blocks())
2271 for (auto &I : *BB) {
2272 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2273 EvaluatePtrUse(Load, Load->getPointerOperand());
2274 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2275 EvaluatePtrUse(Store, Store->getPointerOperand());
2276 EvaluatePtrUse(Store, Store->getValueOperand());
2277 }
2278 }
     // A pointer is scalar only if no use placed it in PossibleNonScalarPtrs.
2279 for (auto *I : ScalarPtrs)
2280 if (!PossibleNonScalarPtrs.count(I)) {
2281 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2282 Worklist.insert(I);
2283 }
2284
2285 // (3) Insert the forced scalars.
2286 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2287 // induction variable when the PHI user is scalarized.
2288 auto ForcedScalar = ForcedScalars.find(VF);
2289 if (ForcedScalar != ForcedScalars.end())
2290 for (auto *I : ForcedScalar->second) {
2291 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2292 Worklist.insert(I);
2293 }
2294
2295 // Expand the worklist by looking through the first operand of instructions
2296 // we've already identified as scalar. This is similar to the
2297 // expansion step in collectLoopUniforms(); however, here we're only
2298 // expanding to include additional loop-varying getelementptr instructions.
     // The worklist grows while it is being scanned, so iterate by index until
     // no unvisited entry remains.
2299 unsigned Idx = 0;
2300 while (Idx != Worklist.size()) {
2301 Instruction *Dst = Worklist[Idx++];
2302 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2303 continue;
2304 auto *Src = cast<Instruction>(Dst->getOperand(0));
     // Src is scalar if every user is outside the loop, already in the
     // worklist, or a load/store that uses Src in a scalar way.
2305 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2306 auto *J = cast<Instruction>(U);
2307 return !TheLoop->contains(J) || Worklist.count(J) ||
2308 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2309 IsScalarUse(J, Src));
2310 })) {
2311 Worklist.insert(Src);
2312 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2313 }
2314 }
2315
2316 // An induction variable will remain scalar if all users of the induction
2317 // variable and induction variable update remain scalar.
2318 for (const auto &Induction : Legal->getInductionVars()) {
2319 auto *Ind = Induction.first;
2320 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2321
2322 // If tail-folding is applied, the primary induction variable will be used
2323 // to feed a vector compare.
2324 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2325 continue;
2326
2327 // Returns true if \p Indvar is a pointer induction that is used directly by
2328 // load/store instruction \p I.
2329 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2330 Instruction *I) {
2331 return Induction.second.getKind() ==
2334 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2335 };
2336
2337 // Determine if all users of the induction variable are scalar after
2338 // vectorization.
2339 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2340 auto *I = cast<Instruction>(U);
2341 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2342 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2343 });
2344 if (!ScalarInd)
2345 continue;
2346
2347 // If the induction variable update is a fixed-order recurrence, neither the
2348 // induction variable nor its update should be marked scalar after
2349 // vectorization.
2350 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2351 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2352 continue;
2353
2354 // Determine if all users of the induction variable update instruction are
2355 // scalar after vectorization.
2356 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2357 auto *I = cast<Instruction>(U);
2358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2360 });
2361 if (!ScalarIndUpdate)
2362 continue;
2363
2364 // The induction variable and its update instruction will remain scalar.
2365 Worklist.insert(Ind);
2366 Worklist.insert(IndUpdate);
2367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2369 << "\n");
2370 }
2371
     // Publish the result for this VF.
2372 Scalars[VF].insert_range(Worklist);
2373}
2374
2376 ElementCount VF) {
2377 if (!isPredicatedInst(I))
2378 return false;
2379
2380 // Do we have a non-scalar lowering for this predicated
2381 // instruction? No - it is scalar with predication.
2382 switch(I->getOpcode()) {
2383 default:
2384 return true;
2385 case Instruction::Call:
2386 if (VF.isScalar())
2387 return true;
2389 case Instruction::Load:
2390 case Instruction::Store: {
2391 bool IsConsecutive = Legal->isConsecutivePtr(getLoadStoreType(I),
2393 return !(IsConsecutive && Config.isLegalMaskedLoadOrStore(I, VF)) &&
2394 !Config.isLegalGatherOrScatter(I, VF);
2395 }
2396 case Instruction::UDiv:
2397 case Instruction::SDiv:
2398 case Instruction::SRem:
2399 case Instruction::URem: {
2400 // We have the option to use the llvm.masked.udiv intrinsics to avoid
2401 // predication. The cost based decision here will always select the masked
2402 // intrinsics for scalable vectors as scalarization isn't legal.
2403 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
2404 return isDivRemScalarWithPredication(ScalarCost, MaskedCost);
2405 }
2406 }
2407}
2408
2410 return Legal->isMaskRequired(I, foldTailByMasking());
2411}
2412
2413// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2415 // TODO: We can use the loop-preheader as context point here and get
2416 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2420 return false;
2421
2422 // If the instruction was executed conditionally in the original scalar loop,
2423 // predication is needed with a mask whose lanes are all possibly inactive.
2424 if (Legal->blockNeedsPredication(I->getParent()))
2425 return true;
2426
2427 // If we're not folding the tail by masking, predication is unnecessary.
2428 if (!foldTailByMasking())
2429 return false;
2430
2431 // All that remain are instructions with side-effects originally executed in
2432 // the loop unconditionally, but now execute under a tail-fold mask (only)
2433 // having at least one active lane (the first). If the side-effects of the
2434 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2435 // - it will cause the same side-effects as when masked.
2436 switch(I->getOpcode()) {
2437 default:
2439 "instruction should have been considered by earlier checks");
2440 case Instruction::Call:
2441 // Side-effects of a Call are assumed to be non-invariant, needing a
2442 // (fold-tail) mask.
2444 "should have returned earlier for calls not needing a mask");
2445 return true;
2446 case Instruction::Load:
2447 // If the address is loop invariant no predication is needed.
2448 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2449 case Instruction::Store: {
2450 // For stores, we need to prove both speculation safety (which follows from
2451 // the same argument as loads), but also must prove the value being stored
2452 // is correct. The easiest form of the latter is to require that all values
2453 // stored are the same.
2454 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2455 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2456 }
2457 case Instruction::UDiv:
2458 case Instruction::URem:
2459 // If the divisor is loop-invariant no predication is needed.
2460 return !Legal->isInvariant(I->getOperand(1));
2461 case Instruction::SDiv:
2462 case Instruction::SRem:
2463 // Conservative for now, since masked-off lanes may be poison and could
2464 // trigger signed overflow.
2465 return true;
2466 }
2467}
2468
2472 return 1;
2473 // If the block wasn't originally predicated then return early to avoid
2474 // computing BlockFrequencyInfo unnecessarily.
2475 if (!Legal->blockNeedsPredication(BB))
2476 return 1;
2477
2478 uint64_t HeaderFreq =
2479 getBFI().getBlockFreq(TheLoop->getHeader()).getFrequency();
2480 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2481 assert(HeaderFreq >= BBFreq &&
2482 "Header has smaller block freq than dominated BB?");
2483 return std::round((double)HeaderFreq / BBFreq);
2484}
2485
2487 switch (Opcode) {
2488 case Instruction::UDiv:
2489 return Intrinsic::masked_udiv;
2490 case Instruction::SDiv:
2491 return Intrinsic::masked_sdiv;
2492 case Instruction::URem:
2493 return Intrinsic::masked_urem;
2494 case Instruction::SRem:
2495 return Intrinsic::masked_srem;
2496 default:
2497 llvm_unreachable("Unexpected opcode");
2498 }
2499}
2500
2501std::pair<InstructionCost, InstructionCost>
2503 ElementCount VF) {
2504 assert(I->getOpcode() == Instruction::UDiv ||
2505 I->getOpcode() == Instruction::SDiv ||
2506 I->getOpcode() == Instruction::SRem ||
2507 I->getOpcode() == Instruction::URem);
2509
2510 // Scalarization isn't legal for scalable vector types
2511 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2512 if (!VF.isScalable()) {
2513 // Get the scalarization cost and scale this amount by the probability of
2514 // executing the predicated block. If the instruction is not predicated,
2515 // we fall through to the next case.
2516 ScalarizationCost = 0;
2517
2518 // These instructions have a non-void type, so account for the phi nodes
2519 // that we will create. This cost is likely to be zero. The phi node
2520 // cost, if any, should be scaled by the block probability because it
2521 // models a copy at the end of each predicated block.
2522 ScalarizationCost += VF.getFixedValue() *
2523 TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
2524
2525 // The cost of the non-predicated instruction.
2526 ScalarizationCost +=
2527 VF.getFixedValue() * TTI.getArithmeticInstrCost(
2528 I->getOpcode(), I->getType(), Config.CostKind);
2529
2530 // The cost of insertelement and extractelement instructions needed for
2531 // scalarization.
2532 ScalarizationCost += getScalarizationOverhead(I, VF);
2533
2534 // Scale the cost by the probability of executing the predicated blocks.
2535 // This assumes the predicated block for each vector lane is equally
2536 // likely.
2537 ScalarizationCost =
2538 ScalarizationCost /
2539 getPredBlockCostDivisor(Config.CostKind, I->getParent());
2540 }
2541
2542 auto *VecTy = toVectorTy(I->getType(), VF);
2543 auto *MaskTy = toVectorTy(Type::getInt1Ty(I->getContext()), VF);
2544 IntrinsicCostAttributes ICA(getMaskedDivRemIntrinsic(I->getOpcode()), VecTy,
2545 {VecTy, VecTy, MaskTy});
2546 InstructionCost MaskedCost = TTI.getIntrinsicInstrCost(ICA, Config.CostKind);
2547 return {ScalarizationCost, MaskedCost};
2548}
2549
2551 Instruction *I, ElementCount VF) const {
2552 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2554 "Decision should not be set yet.");
2555 auto *Group = getInterleavedAccessGroup(I);
2556 assert(Group && "Must have a group.");
2557 unsigned InterleaveFactor = Group->getFactor();
2558
2559 // If the instruction's allocated size doesn't equal its type size, it
2560 // requires padding and will be scalarized.
2561 auto &DL = I->getDataLayout();
2562 auto *ScalarTy = getLoadStoreType(I);
2563 if (hasIrregularType(ScalarTy, DL))
2564 return false;
2565
2566 // For scalable vectors, the interleave factors must be <= 8 since we require
2567 // the (de)interleaveN intrinsics instead of shufflevectors.
2568 if (VF.isScalable() && InterleaveFactor > 8)
2569 return false;
2570
2571 // If the group involves a non-integral pointer, we may not be able to
2572 // losslessly cast all values to a common type.
2573 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2574 for (Instruction *Member : Group->members()) {
2575 auto *MemberTy = getLoadStoreType(Member);
2576 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2577 // Don't coerce non-integral pointers to integers or vice versa.
2578 if (MemberNI != ScalarNI)
2579 // TODO: Consider adding special nullptr value case here
2580 return false;
2581 if (MemberNI && ScalarNI &&
2582 ScalarTy->getPointerAddressSpace() !=
2583 MemberTy->getPointerAddressSpace())
2584 return false;
2585 }
2586
2587 // Check if masking is required.
2588 // A Group may need masking for one of two reasons: it resides in a block that
2589 // needs predication, or it was decided to use masking to deal with gaps
2590 // (either a gap at the end of a load-access that may result in a speculative
2591 // load, or any gaps in a store-access).
2592 bool PredicatedAccessRequiresMasking =
2594 bool LoadAccessWithGapsRequiresEpilogMasking =
2595 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
2597 bool StoreAccessWithGapsRequiresMasking =
2598 isa<StoreInst>(I) && !Group->isFull();
2599 if (!PredicatedAccessRequiresMasking &&
2600 !LoadAccessWithGapsRequiresEpilogMasking &&
2601 !StoreAccessWithGapsRequiresMasking)
2602 return true;
2603
2604 // If masked interleaving is required, we expect that the user/target had
2605 // enabled it, because otherwise it either wouldn't have been created or
2606 // it should have been invalidated by the CostModel.
2608 "Masked interleave-groups for predicated accesses are not enabled.");
2609
2610 if (Group->isReverse())
2611 return false;
2612
2613 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
2614 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
2615 StoreAccessWithGapsRequiresMasking;
2616 if (VF.isScalable() && NeedsMaskForGaps)
2617 return false;
2618
2619 return Config.isLegalMaskedLoadOrStore(I, VF);
2620}
2621
2623 Instruction *I, ElementCount VF) {
2624 // Get and ensure we have a valid memory instruction.
2625 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
2626
2627 auto *Ptr = getLoadStorePointerOperand(I);
2628 auto *ScalarTy = getLoadStoreType(I);
2629
2630 // In order to be widened, the pointer should be consecutive, first of all.
2631 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
2632 return false;
2633
2634 // If the instruction is a store located in a predicated block, it will be
2635 // scalarized.
2636 if (isScalarWithPredication(I, VF))
2637 return false;
2638
2639 // If the instruction's allocated size doesn't equal its type size, it
2640 // requires padding and will be scalarized.
2641 auto &DL = I->getDataLayout();
2642 if (hasIrregularType(ScalarTy, DL))
2643 return false;
2644
2645 return true;
2646}
2647
/// Collect, for the vector factor \p VF, the in-loop instructions of which
/// only lane 0 is demanded after vectorization, and record them in
/// Uniforms[VF]. The worklist is seeded with exit-branch conditions, selected
/// intrinsics with loop-invariant operands, extractvalues of out-of-loop
/// aggregates, uniform pointers and uniform memory operations; it is then
/// grown through operands whose users are all uniform, and finally through
/// induction variables and their updates.
2648void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
2649 // We should not collect Uniforms more than once per VF. Right now,
2650 // this function is called from collectUniformsAndScalars(), which
2651 // already does this check. Collecting Uniforms for VF=1 does not make any
2652 // sense.
2653
2654 assert(VF.isVector() && !Uniforms.contains(VF) &&
2655 "This function should not be visited twice for the same VF");
2656
2657 // Create the entry for VF up front. Even if we find no uniform value, we
2658 // won't analyze again, because Uniforms.count(VF) will return 1.
2659 Uniforms[VF].clear();
2660
2661 // Now we know that the loop is vectorizable!
2662 // Collect instructions inside the loop that will remain uniform after
2663 // vectorization.
2664
2665 // Global values, params and instructions outside of the current loop are out
2666 // of scope.
2667 auto IsOutOfScope = [&](Value *V) -> bool {
2669 return (!I || !TheLoop->contains(I));
2670 };
2671
2672 // Worklist containing uniform instructions demanding lane 0.
2673 SetVector<Instruction *> Worklist;
2674
2675 // Add uniform instructions demanding lane 0 to the worklist. Instructions
2676 // that require predication must not be considered uniform after
2677 // vectorization, because that would create an erroneous replicating region
2678 // where only a single instance out of VF should be formed.
2679 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
2680 if (IsOutOfScope(I)) {
2681 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
2682 << *I << "\n");
2683 return;
2684 }
2685 if (isPredicatedInst(I)) {
2686 LLVM_DEBUG(
2687 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
2688 << "\n");
2689 return;
2690 }
2691 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
2692 Worklist.insert(I);
2693 };
2694
2695 // Start with the conditional branches exiting the loop. If the branch
2696 // condition is an instruction contained in the loop that is only used by the
2697 // branch, it is uniform. Note conditions from uncountable early exits are not
2698 // uniform.
2700 TheLoop->getExitingBlocks(Exiting);
2701 for (BasicBlock *E : Exiting) {
// With an uncountable early exit, only the latch's exit condition is
// considered.
2702 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
2703 continue;
2704 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
2705 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
2706 AddToWorklistIfAllowed(Cmp);
2707 }
2708
2709 auto PrevVF = VF.divideCoefficientBy(2);
2710 // Return true if all lanes perform the same memory operation, and we can
2711 // thus choose to execute only one.
2712 auto IsUniformMemOpUse = [&](Instruction *I) {
2713 // If the value was already known to not be uniform for the previous
2714 // (smaller VF), it cannot be uniform for the larger VF.
2715 if (PrevVF.isVector()) {
2716 auto Iter = Uniforms.find(PrevVF);
2717 if (Iter != Uniforms.end() && !Iter->second.contains(I))
2718 return false;
2719 }
2720 if (!Legal->isUniformMemOp(*I, VF))
2721 return false;
2722 if (isa<LoadInst>(I))
2723 // Loading the same address always produces the same result - at least
2724 // assuming aliasing and ordering which have already been checked.
2725 return true;
2726 // Storing the same value on every iteration.
2727 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
2728 };
2729
// Returns true if the memory access I is a uniform memory operation, or if its
// widening decision is one of CM_Widen, CM_Widen_Reverse or CM_Interleave.
2730 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
2731 InstWidening WideningDecision = getWideningDecision(I, VF);
2732 assert(WideningDecision != CM_Unknown &&
2733 "Widening decision should be ready at this moment");
2734
2735 if (IsUniformMemOpUse(I))
2736 return true;
2737
2738 return (WideningDecision == CM_Widen ||
2739 WideningDecision == CM_Widen_Reverse ||
2740 WideningDecision == CM_Interleave);
2741 };
2742
2743 // Returns true if Ptr is the pointer operand of a memory access instruction
2744 // I, I is known to not require scalarization, and the pointer is not also
2745 // stored.
2746 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
2747 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
2748 return false;
2749 return getLoadStorePointerOperand(I) == Ptr &&
2750 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
2751 };
2752
2753 // Holds a list of values which are known to have at least one uniform use.
2754 // Note that there may be other uses which aren't uniform. A "uniform use"
2755 // here is something which only demands lane 0 of the unrolled iterations;
2756 // it does not imply that all lanes produce the same value (e.g. this is not
2757 // the usual meaning of uniform).
2758 SetVector<Value *> HasUniformUse;
2759
2760 // Scan the loop for instructions which are either a) known to have only
2761 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
2762 for (auto *BB : TheLoop->blocks())
2763 for (auto &I : *BB) {
2764 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
2765 switch (II->getIntrinsicID()) {
2766 case Intrinsic::sideeffect:
2767 case Intrinsic::experimental_noalias_scope_decl:
2768 case Intrinsic::assume:
2769 case Intrinsic::lifetime_start:
2770 case Intrinsic::lifetime_end:
2771 if (TheLoop->hasLoopInvariantOperands(&I))
2772 AddToWorklistIfAllowed(&I);
2773 break;
2774 default:
2775 break;
2776 }
2777 }
2778
2779 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
2780 if (IsOutOfScope(EVI->getAggregateOperand())) {
2781 AddToWorklistIfAllowed(EVI);
2782 continue;
2783 }
2784 // Only ExtractValue instructions where the aggregate value comes from a
2785 // call are allowed to be non-uniform.
2786 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
2787 "Expected aggregate value to be call return value");
2788 }
2789
2790 // If there's no pointer operand, there's nothing to do.
2791 auto *Ptr = getLoadStorePointerOperand(&I);
2792 if (!Ptr)
2793 continue;
2794
2795 // If the pointer can be proven to be uniform, always add it to the
2796 // worklist.
2797 if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
2798 AddToWorklistIfAllowed(cast<Instruction>(Ptr));
2799
2800 if (IsUniformMemOpUse(&I))
2801 AddToWorklistIfAllowed(&I);
2802
2803 if (IsVectorizedMemAccessUse(&I, Ptr))
2804 HasUniformUse.insert(Ptr);
2805 }
2806
2807 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
2808 // demanding) users. Since loops are assumed to be in LCSSA form, this
2809 // disallows uses outside the loop as well.
2810 for (auto *V : HasUniformUse) {
2811 if (IsOutOfScope(V))
2812 continue;
2813 auto *I = cast<Instruction>(V);
2814 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
2815 auto *UI = cast<Instruction>(U);
2816 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
2817 });
2818 if (UsersAreMemAccesses)
2819 AddToWorklistIfAllowed(I);
2820 }
2821
2822 // Expand Worklist in topological order: whenever a new instruction
2823 // is added, its users should already be inside Worklist. This ensures
2824 // a uniform instruction will only be used by uniform instructions.
// Worklist grows while it is being traversed, hence the index-based loop.
2825 unsigned Idx = 0;
2826 while (Idx != Worklist.size()) {
2827 Instruction *I = Worklist[Idx++];
2828
2829 for (auto *OV : I->operand_values()) {
2830 // Out-of-scope operands cannot be uniform instructions.
2831 if (IsOutOfScope(OV))
2832 continue;
2833 // First order recurrence Phi's should typically be considered
2834 // non-uniform.
2835 auto *OP = dyn_cast<PHINode>(OV);
2836 if (OP && Legal->isFixedOrderRecurrence(OP))
2837 continue;
2838 // If all the users of the operand are uniform, then add the
2839 // operand into the uniform worklist.
2840 auto *OI = cast<Instruction>(OV);
2841 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
2842 auto *J = cast<Instruction>(U);
2843 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
2844 }))
2845 AddToWorklistIfAllowed(OI);
2846 }
2847 }
2848
2849 // For an instruction to be added into Worklist above, all its users inside
2850 // the loop should also be in Worklist. However, this condition cannot be
2851 // true for phi nodes that form a cyclic dependence. We must process phi
2852 // nodes separately. An induction variable will remain uniform if all users
2853 // of the induction variable and induction variable update remain uniform.
2854 // The code below handles both pointer and non-pointer induction variables.
2855 BasicBlock *Latch = TheLoop->getLoopLatch();
2856 for (const auto &Induction : Legal->getInductionVars()) {
2857 auto *Ind = Induction.first;
2858 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2859
2860 // Determine if all users of the induction variable are uniform after
2861 // vectorization.
2862 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
2863 auto *I = cast<Instruction>(U);
2864 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2865 IsVectorizedMemAccessUse(I, Ind);
2866 });
2867 if (!UniformInd)
2868 continue;
2869
2870 // Determine if all users of the induction variable update instruction are
2871 // uniform after vectorization.
2872 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2873 auto *I = cast<Instruction>(U);
2874 return I == Ind || Worklist.count(I) ||
2875 IsVectorizedMemAccessUse(I, IndUpdate);
2876 });
2877 if (!UniformIndUpdate)
2878 continue;
2879
2880 // The induction variable and its update instruction will remain uniform.
2881 AddToWorklistIfAllowed(Ind);
2882 AddToWorklistIfAllowed(IndUpdate);
2883 }
2884
// Publish everything that ended up in the worklist as uniform for this VF.
2885 Uniforms[VF].insert_range(Worklist);
2886}
2887
// Determine the maximum feasible fixed-width and scalable vectorization
// factors for TheLoop. Outer loops use Config.computeVPlanOuterloopVF(). For
// innermost loops the result comes from Config.computeFeasibleMaxVF(), and
// EpilogueLoweringStatus together with the trip count decides whether an
// epilogue is kept or the tail is folded by masking.
2888FixedScalableVFPair
2890 // For outer loops, use simple type-based heuristic VF. No cost model or
2891 // memory dependence analysis is available.
2892 if (!TheLoop->isInnermost()) {
2893 return Config.computeVPlanOuterloopVF(UserVF);
2894 }
2895
2896 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
2897 // TODO: It may be useful to do since it's still likely to be dynamically
2898 // uniform if the target can skip.
2900 "Not inserting runtime ptr check for divergent target",
2901 "runtime pointer checks needed. Not enabled for divergent target",
2902 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
2904 }
2905
2906 ScalarEvolution *SE = PSE.getSE();
2908 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
2909 if (!MaxTC && EpilogueLoweringStatus == CM_EpilogueAllowed)
2911 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
2912 if (TC != ElementCount::getFixed(MaxTC))
2913 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
2914 if (TC.isScalar()) {
2915 reportVectorizationFailure("Single iteration (non) loop",
2916 "loop trip count is one, irrelevant for vectorization",
2917 "SingleIterationLoop", ORE, TheLoop);
2919 }
2920
2921 // If BTC matches the widest induction type and is -1 then the trip count
2922 // computation will wrap to 0 and the vector trip count will be 0. Do not try
2923 // to vectorize.
2924 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
2925 if (!isa<SCEVCouldNotCompute>(BTC) &&
2926 BTC->getType()->getScalarSizeInBits() >=
2927 Legal->getWidestInductionType()->getScalarSizeInBits() &&
2929 SE->getMinusOne(BTC->getType()))) {
2931 "Trip count computation wrapped",
2932 "backedge-taken count is -1, loop trip count wrapped to 0",
2933 "TripCountWrapped", ORE, TheLoop);
2935 }
2936
2937 assert(WideningDecisions.empty() && CallWideningDecisions.empty() &&
2938 Uniforms.empty() && Scalars.empty() &&
2939 "No cost-modeling decisions should have been taken at this point");
2940
2941 switch (EpilogueLoweringStatus) {
2942 case CM_EpilogueAllowed:
2943 return Config.computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false,
2946 [[fallthrough]];
2948 LLVM_DEBUG(dbgs() << "LV: tail-folding hint/switch found.\n"
2949 << "LV: Not allowing epilogue, creating tail-folded "
2950 << "vector loop.\n");
2951 break;
2953 // fallthrough as a special case of OptForSize
2955 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedOptSize)
2956 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to -Os/-Oz.\n");
2957 else
2958 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to low trip "
2959 << "count.\n");
2960
2961 // Bail if runtime checks are required, which are not good when optimising
2962 // for size.
2963 if (Config.runtimeChecksRequired())
2965
2966 break;
2967 }
2968
2969 // Now try tail folding.
2970
2971 // Invalidate interleave groups that require an epilogue if we can't mask
2972 // the interleave-group.
2974 // Note: There is no need to invalidate any cost modeling decisions here, as
2975 // none were taken so far (see assertion above).
2976 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
2977 }
2978
2979 FixedScalableVFPair MaxFactors = Config.computeFeasibleMaxVF(
2980 MaxTC, UserVF, UserIC, true, requiresScalarEpilogue(true));
2981
2982 // Avoid tail folding if the trip count is known to be a multiple of any VF
2983 // we choose.
2984 std::optional<unsigned> MaxPowerOf2RuntimeVF =
2985 MaxFactors.FixedVF.getFixedValue();
2986 if (MaxFactors.ScalableVF) {
2987 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
2988 if (MaxVScale) {
2989 MaxPowerOf2RuntimeVF = std::max<unsigned>(
2990 *MaxPowerOf2RuntimeVF,
2991 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
2992 } else
2993 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
2994 }
2995
// Returns true if the trip count (backedge-taken count + 1, refined by loop
// guards) is known to be a multiple of MaxVF, scaled by UserIC when UserIC is
// non-zero.
2996 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
2997 // Return false if the loop is neither a single-latch-exit loop nor an
2998 // early-exit loop as tail-folding is not supported in that case.
2999 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3000 !Legal->hasUncountableEarlyExit())
3001 return false;
3002 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3003 ScalarEvolution *SE = PSE.getSE();
3004 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3005 // with uncountable exits. For countable loops, the symbolic maximum must
3006 // remain identical to the known back-edge taken count.
3007 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3008 assert((Legal->hasUncountableEarlyExit() ||
3009 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3010 "Invalid loop count");
3011 const SCEV *ExitCount = SE->getAddExpr(
3012 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3013 const SCEV *Rem = SE->getURemExpr(
3014 SE->applyLoopGuards(ExitCount, TheLoop),
3015 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3016 return Rem->isZero();
3017 };
3018
// An empty std::optional compares less than any value, so this test is false
// when MaxPowerOf2RuntimeVF was reset to std::nullopt above.
3019 if (MaxPowerOf2RuntimeVF > 0u) {
3020 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3021 "MaxFixedVF must be a power of 2");
3022 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3023 // Accept MaxFixedVF if we do not have a tail.
3024 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3025 return MaxFactors;
3026 }
3027 }
3028
3029 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3030 if (ExpectedTC && ExpectedTC->isFixed() &&
3031 ExpectedTC->getFixedValue() <=
3032 TTI.getMinTripCountTailFoldingThreshold()) {
3033 if (MaxPowerOf2RuntimeVF > 0u) {
3034 // If we have a low-trip-count, and the fixed-width VF is known to divide
3035 // the trip count but the scalable factor does not, use the fixed-width
3036 // factor in preference to allow the generation of a non-predicated loop.
3037 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedLowTripLoop &&
3038 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3039 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3040 "remain for any chosen VF.\n");
3041 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3042 return MaxFactors;
3043 }
3044 }
3045
3047 "The trip count is below the minial threshold value.",
3048 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3049 ORE, TheLoop);
3051 }
3052
3053 // If we don't know the precise trip count, or if the trip count that we
3054 // found modulo the vectorization factor is not zero, try to fold the tail
3055 // by masking.
3056 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3057 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3058 setTailFoldingStyle(ContainsScalableVF, UserIC);
3059 if (foldTailByMasking()) {
3060 if (foldTailWithEVL()) {
3061 LLVM_DEBUG(
3062 dbgs()
3063 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3064 "try to generate VP Intrinsics with scalable vector "
3065 "factors only.\n");
3066 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3067 // for now.
3068 // TODO: extend it for fixed vectors, if required.
3069 assert(ContainsScalableVF && "Expected scalable vector factor.");
3070
3071 MaxFactors.FixedVF = ElementCount::getFixed(1);
3072 }
3073 return MaxFactors;
3074 }
3075
3076 // If there was a tail-folding hint/switch, but we can't fold the tail by
3077 // masking, fallback to a vectorization with an epilogue.
3078 if (EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail) {
3079 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with an "
3080 "epilogue instead.\n");
3081 EpilogueLoweringStatus = CM_EpilogueAllowed;
3082 return MaxFactors;
3083 }
3084
3085 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail) {
3086 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3088 }
3089
3090 if (TC.isZero()) {
3092 "unable to calculate the loop count due to complex control flow",
3093 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3095 }
3096
3098 "Cannot optimize for size and vectorize at the same time.",
3099 "cannot optimize for size and vectorize at the same time. "
3100 "Enable vectorization of this loop with '#pragma clang loop "
3101 "vectorize(enable)' when compiling with -Os/-Oz",
3102 "NoTailLoopWithOptForSize", ORE, TheLoop);
3104}
3105
// Collect every (recipe, VF) pair whose VPlan-based cost is invalid, then emit
// one "InvalidCost" remark per recipe listing the VFs it was invalid for.
3108 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3109 SmallVector<RecipeVFPair> InvalidCosts;
3110 for (const auto &Plan : VPlans) {
3111 for (ElementCount VF : Plan->vectorFactors()) {
3112 // The VPlan-based cost model is designed for computing vector cost.
3113 // Querying the VPlan-based cost model with a scalar VF will cause some
3114 // errors because we expect the VF to be a vector for most of the widen
3115 // recipes.
3116 if (VF.isScalar())
3117 continue;
3118
3119 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
3120 OrigLoop);
3121 precomputeCosts(*Plan, VF, CostCtx);
3122 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3124 for (auto &R : *VPBB) {
3125 if (!R.cost(VF, CostCtx).isValid())
3126 InvalidCosts.emplace_back(&R, VF);
3127 }
3128 }
3129 }
3130 }
3131 if (InvalidCosts.empty())
3132 return;
3133
3134 // Emit a report of VFs with invalid costs in the loop.
3135
3136 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
// Each recipe is numbered by the position of its first occurrence.
3138 unsigned I = 0;
3139 for (auto &Pair : InvalidCosts)
3140 if (Numbering.try_emplace(Pair.first, I).second)
3141 ++I;
3142
3143 // Sort the list, first on recipe(number) then on VF.
3144 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3145 unsigned NA = Numbering[A.first];
3146 unsigned NB = Numbering[B.first];
3147 if (NA != NB)
3148 return NA < NB;
3149 return ElementCount::isKnownLT(A.second, B.second);
3150 });
3151
3152 // For a list of ordered recipe-VF pairs:
3153 // [(load, VF1), (load, VF2), (store, VF1)]
3154 // group the recipes together to emit separate remarks for:
3155 // load (VF1, VF2)
3156 // store (VF1)
// Tail is the part not yet reported; Subset is the run of pairs at the front
// of Tail that share one recipe.
3157 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3158 auto Subset = ArrayRef<RecipeVFPair>();
3159 do {
3160 if (Subset.empty())
3161 Subset = Tail.take_front(1);
3162
3163 VPRecipeBase *R = Subset.front().first;
3164
// Map the recipe kind to the IR opcode used to name it in the remark.
3165 unsigned Opcode =
3167 .Case([](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
3168 .Case(
3169 [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
3170 .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; })
3171 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3172 [](const auto *R) { return Instruction::Call; })
3175 [](const auto *R) { return R->getOpcode(); })
3176 .Case([](const VPInterleaveRecipe *R) {
3177 return R->getStoredValues().empty() ? Instruction::Load
3178 : Instruction::Store;
3179 })
3180 .Case([](const VPReductionRecipe *R) {
3181 return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind());
3182 });
3183
3184 // If the next recipe is different, or if there are no other pairs,
3185 // emit a remark for the collated subset. e.g.
3186 // [(load, VF1), (load, VF2)]
3187 // to emit:
3188 // remark: invalid costs for 'load' at VF=(VF1, VF2)
3189 if (Subset == Tail || Tail[Subset.size()].first != R) {
3190 std::string OutString;
3191 raw_string_ostream OS(OutString);
3192 assert(!Subset.empty() && "Unexpected empty range");
3193 OS << "Recipe with invalid costs prevented vectorization at VF=(";
3194 for (const auto &Pair : Subset)
3195 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
3196 OS << "):";
3197 if (Opcode == Instruction::Call) {
3198 StringRef Name = "";
3199 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
3200 Name = Int->getIntrinsicName();
3201 } else {
// Not a VPWidenCallRecipe: take the callee from the recipe's last operand.
3202 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
3203 Function *CalledFn =
3204 WidenCall ? WidenCall->getCalledScalarFunction()
3205 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
3206 ->getLiveInIRValue());
3207 Name = CalledFn->getName();
3208 }
3209 OS << " call to " << Name;
3210 } else
3211 OS << " " << Instruction::getOpcodeName(Opcode);
3212 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
3213 R->getDebugLoc());
3214 Tail = Tail.drop_front(Subset.size());
3215 Subset = {};
3216 } else
3217 // Grow the subset by one element
3218 Subset = Tail.take_front(Subset.size() + 1);
3219 } while (!Tail.empty());
3220}
3221
3222/// Check if any recipe of \p Plan will generate a vector value, which will be
3223/// assigned a vector register.
3225 const TargetTransformInfo &TTI) {
3226 assert(VF.isVector() && "Checking a scalar VF?");
3227 VPTypeAnalysis TypeInfo(Plan);
3228 DenseSet<VPRecipeBase *> EphemeralRecipes;
3229 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
3230 // Set of already visited types.
3231 DenseSet<Type *> Visited;
3234 for (VPRecipeBase &R : *VPBB) {
// Ephemeral recipes are ignored.
3235 if (EphemeralRecipes.contains(&R))
3236 continue;
3237 // Continue early if the recipe is considered to not produce a vector
3238 // result. Note that this includes VPInstruction where some opcodes may
3239 // produce a vector, to preserve existing behavior as VPInstructions model
3240 // aspects not directly mapped to existing IR instructions.
3241 switch (R.getVPRecipeID()) {
3242 case VPRecipeBase::VPDerivedIVSC:
3243 case VPRecipeBase::VPScalarIVStepsSC:
3244 case VPRecipeBase::VPReplicateSC:
3245 case VPRecipeBase::VPInstructionSC:
3246 case VPRecipeBase::VPCurrentIterationPHISC:
3247 case VPRecipeBase::VPVectorPointerSC:
3248 case VPRecipeBase::VPVectorEndPointerSC:
3249 case VPRecipeBase::VPExpandSCEVSC:
3250 case VPRecipeBase::VPPredInstPHISC:
3251 case VPRecipeBase::VPBranchOnMaskSC:
3252 continue;
3253 case VPRecipeBase::VPReductionSC:
3254 case VPRecipeBase::VPActiveLaneMaskPHISC:
3255 case VPRecipeBase::VPWidenCallSC:
3256 case VPRecipeBase::VPWidenCanonicalIVSC:
3257 case VPRecipeBase::VPWidenCastSC:
3258 case VPRecipeBase::VPWidenGEPSC:
3259 case VPRecipeBase::VPWidenIntrinsicSC:
3260 case VPRecipeBase::VPWidenMemIntrinsicSC:
3261 case VPRecipeBase::VPWidenSC:
3262 case VPRecipeBase::VPBlendSC:
3263 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3264 case VPRecipeBase::VPHistogramSC:
3265 case VPRecipeBase::VPWidenPHISC:
3266 case VPRecipeBase::VPWidenIntOrFpInductionSC:
3267 case VPRecipeBase::VPWidenPointerInductionSC:
3268 case VPRecipeBase::VPReductionPHISC:
3269 case VPRecipeBase::VPInterleaveEVLSC:
3270 case VPRecipeBase::VPInterleaveSC:
3271 case VPRecipeBase::VPWidenLoadEVLSC:
3272 case VPRecipeBase::VPWidenLoadSC:
3273 case VPRecipeBase::VPWidenStoreEVLSC:
3274 case VPRecipeBase::VPWidenStoreSC:
3275 break;
3276 default:
3277 llvm_unreachable("unhandled recipe");
3278 }
3279
// Decides from the number of legal parts TTI reports for VectorTy whether the
// type ends up in vector registers for this VF.
3280 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
3281 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
3282 if (!NumLegalParts)
3283 return false;
3284 if (VF.isScalable()) {
3285 // <vscale x 1 x iN> is assumed to be profitable over iN because
3286 // scalable registers are a distinct register class from scalar
3287 // ones. If we ever find a target which wants to lower scalable
3288 // vectors back to scalars, we'll need to update this code to
3289 // explicitly ask TTI about the register class uses for each part.
3290 return NumLegalParts <= VF.getKnownMinValue();
3291 }
3292 // Two or more elements that share a register - are vectorized.
3293 return NumLegalParts < VF.getFixedValue();
3294 };
3295
3296 // If the recipe defines no value and is not a store, e.g., a branch,
// continue - there is no value to check.
3297 if (R.getNumDefinedValues() == 0 &&
3299 continue;
3300 // For multi-def recipes, currently only interleaved loads, it suffices to
3301 // check the first def only.
3302 // For stores check their stored value; for interleaved stores it suffices
3303 // to check the first stored value only. In all cases this is the second
3304 // operand.
3305 VPValue *ToCheck =
3306 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
3307 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
// Each scalar type is only evaluated once.
3308 if (!Visited.insert({ScalarTy}).second)
3309 continue;
3310 Type *WideTy = toVectorizedTy(ScalarTy, VF);
3311 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
3312 return true;
3313 }
3314 }
3315
3316 return false;
3317}
3318
/// Checks blocks reached from the entry of \p Plan's vector loop region using
/// the predicate isReplicator(); presumably returns true if any of them is a
/// replicator - confirm against the complete statement in the full source.
3319static bool hasReplicatorRegion(VPlan &Plan) {
3321 Plan.getVectorLoopRegion()->getEntry())),
3322 [](auto *VPRB) { return VPRB->isReplicator(); });
3323}
3324
3325/// Returns true if the VPlan contains a VPReductionPHIRecipe with
3326/// FindLast recurrence kind.
3327static bool hasFindLastReductionPhi(VPlan &Plan) {
3329 [](VPRecipeBase &R) {
// Recipes that are not VPReductionPHIRecipe never match.
3330 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
3331 return RedPhi &&
3332 RecurrenceDescriptor::isFindLastRecurrenceKind(
3333 RedPhi->getRecurrenceKind());
3334 });
3335}
3336
3337/// Returns true if the VPlan contains header phi recipes that are not currently
3338/// supported for epilogue vectorization.
3340 return any_of(
3342 [](VPRecipeBase &R) {
3343 switch (R.getVPRecipeID()) {
3344 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3345 // TODO: Add support for fixed-order recurrences.
3346 return true;
3347 case VPRecipeBase::VPWidenIntOrFpInductionSC:
// Unsupported when the widened induction has no PHI node.
3348 return !cast<VPWidenIntOrFpInductionRecipe>(&R)->getPHINode();
3349 case VPRecipeBase::VPReductionPHISC: {
3350 auto *RedPhi = cast<VPReductionPHIRecipe>(&R);
3351 // TODO: Support FMinNum/FMaxNum, FindLast reductions, and reductions
3352 // without underlying values.
3353 RecurKind Kind = RedPhi->getRecurrenceKind();
3354 if (RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind) ||
3355 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) ||
3356 !RedPhi->getUnderlyingValue())
3357 return true;
3358 // TODO: Add support for FindIV reductions with sunk expressions: the
3359 // resume value from the main loop is in expression domain (e.g.,
3360 // mul(ReducedIV, 3)), but the epilogue tracks raw IV values. A sunk
3361 // expression is identified by a non-VPInstruction user of
3362 // ComputeReductionResult.
3363 if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
3364 auto *RdxResult = vputils::findComputeReductionResult(RedPhi);
3365 assert(RdxResult &&
3366 "FindIV reduction must have ComputeReductionResult");
3367 return any_of(RdxResult->users(),
3368 std::not_fn(IsaPred<VPInstruction>));
3369 }
// All remaining reduction phis are supported.
3370 return false;
3371 }
3372 default:
// Every other recipe kind is treated as supported.
3373 return false;
3374 };
3375 });
3376}
3377
/// Returns true if \p MainPlan is a supported candidate for epilogue
/// vectorization: it must contain no unsupported header phi recipe, and the
/// only exiting block of OrigLoop must be its latch.
3378bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
3379 VPlan &MainPlan) const {
3380 // Bail out if the plan contains header phi recipes not yet supported
3381 // for epilogue vectorization.
3382 if (hasUnsupportedHeaderPhiRecipe(MainPlan))
3383 return false;
3384
3385 // Epilogue vectorization code has not been audited to ensure it handles
3386 // non-latch exits properly. It may be fine, but it needs to be audited and
3387 // tested.
3388 // TODO: Add support for loops with an early exit.
3389 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
3390 return false;
3391
3392 return true;
3393}
3394
// Heuristic deciding whether vectorizing the epilogue is worthwhile for the
// main loop's \p VF and interleave count \p IC: the target must not opt out,
// and the estimated element count of VF * IC must reach a minimum threshold.
3396 const ElementCount VF, const unsigned IC) const {
3397 // FIXME: We need a much better cost-model to take different parameters such
3398 // as register pressure, code size increase and cost of extra branches into
3399 // account. For now we apply a very crude heuristic and only consider loops
3400 // with vectorization factors larger than a certain value.
3401
3402 // Allow the target to opt out.
3403 if (!TTI.preferEpilogueVectorization(VF * IC))
3404 return false;
3405
// Presumably an explicitly passed EpilogueVectorizationMinVF option overrides
// the target's default threshold - confirm against the full source.
3406 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
3408 : TTI.getEpilogueVectorizationMinVF();
3409 return estimateElementCount(VF * IC, Config.getVScaleForTuning()) >=
3410 MinVFThreshold;
3411}
3412
3414 VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC) {
3416 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
3417 return nullptr;
3418 }
3419
3420 if (!CM.isEpilogueAllowed()) {
3421 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
3422 "epilogue is allowed.\n");
3423 return nullptr;
3424 }
3425
3426 // Not really a cost consideration, but check for unsupported cases here to
3427 // simplify the logic.
3428 if (!isCandidateForEpilogueVectorization(MainPlan)) {
3429 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
3430 "is not a supported candidate.\n");
3431 return nullptr;
3432 }
3433
3436 IC * estimateElementCount(MainLoopVF, Config.getVScaleForTuning())) {
3437 // Note that the main loop leaves IC * MainLoopVF iterations iff a scalar
3438 // epilogue is required, but then the epilogue loop also requires a scalar
3439 // epilogue.
3440 LLVM_DEBUG(dbgs() << "LEV: Forced epilogue VF results in dead epilogue "
3441 "vector loop, skipping vectorizing epilogue.\n");
3442 return nullptr;
3443 }
3444
3445 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
3447 if (hasPlanWithVF(ForcedEC)) {
3448 std::unique_ptr<VPlan> Clone(getPlanFor(ForcedEC).duplicate());
3449 Clone->setVF(ForcedEC);
3450 return Clone;
3451 }
3452
3453 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
3454 "viable.\n");
3455 return nullptr;
3456 }
3457
3458 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
3459 LLVM_DEBUG(
3460 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
3461 return nullptr;
3462 }
3463
3464 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
3465 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
3466 "this loop\n");
3467 return nullptr;
3468 }
3469
3470 // Check if a plan's vector loop processes fewer iterations than VF (e.g. when
3471 // interleave groups have been narrowed by narrowInterleaveGroups) and return
3472 // the adjusted, effective VF.
3473 using namespace VPlanPatternMatch;
3474 auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
3475 auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3476 if (match(&Exiting->back(),
3477 m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())),
3478 m_VPValue())))
3479 return ElementCount::get(1, VF.isScalable());
3480 return VF;
3481 };
3482
3483 // Check if the main loop processes fewer than MainLoopVF elements per
3484 // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
3485 // as needed.
3486 MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);
3487
3488 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
3489 // the main loop handles 8 lanes per iteration. We could still benefit from
3490 // vectorizing the epilogue loop with VF=4.
3491 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
3492 estimateElementCount(MainLoopVF, Config.getVScaleForTuning()));
3493
3494 Type *TCType = Legal->getWidestInductionType();
3495 const SCEV *RemainingIterations = nullptr;
3496 unsigned MaxTripCount = 0;
3497 const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE);
3498 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
3499 const SCEV *KnownMinTC;
3500 bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
3501 bool ScalableRemIter = false;
3502 ScalarEvolution &SE = *PSE.getSE();
3503 // Use versions of TC and VF in which both are either scalable or fixed.
3504 if (ScalableTC == MainLoopVF.isScalable()) {
3505 ScalableRemIter = ScalableTC;
3506 RemainingIterations =
3507 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
3508 } else if (ScalableTC) {
3509 const SCEV *EstimatedTC = SE.getMulExpr(
3510 KnownMinTC,
3511 SE.getConstant(TCType, Config.getVScaleForTuning().value_or(1)));
3512 RemainingIterations = SE.getURemExpr(
3513 EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
3514 } else
3515 RemainingIterations =
3516 SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
3517
3518 // No iterations left to process in the epilogue.
3519 if (RemainingIterations->isZero())
3520 return nullptr;
3521
3522 if (MainLoopVF.isFixed()) {
3523 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
3524 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
3525 SE.getConstant(TCType, MaxTripCount))) {
3526 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
3527 }
3528 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
3529 << MaxTripCount << "\n");
3530 }
3531
3532 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
3533 return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter);
3534 };
3536 VPlan *BestPlan = nullptr;
3537 for (auto &NextVF : ProfitableVFs) {
3538 // Skip candidate VFs without a corresponding VPlan.
3539 if (!hasPlanWithVF(NextVF.Width))
3540 continue;
3541
3542 VPlan &CurrentPlan = getPlanFor(NextVF.Width);
3543 ElementCount EffectiveVF = GetEffectiveVF(CurrentPlan, NextVF.Width);
3544 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
3545 // vectors) or > the VF of the main loop (fixed vectors).
3546 if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
3547 ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) ||
3548 (EffectiveVF.isScalable() &&
3549 ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) ||
3550 (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
3551 ElementCount::isKnownGT(EffectiveVF, MainLoopVF)))
3552 continue;
3553
3554 // If EffectiveVF is greater than the number of remaining iterations, the
3555 // epilogue loop would be dead. Skip such factors. If the epilogue plan
3556 // also has narrowed interleave groups, use the effective VF since
3557 // the epilogue step will be reduced to its IC.
3558 // TODO: We should also consider comparing against a scalable
3559 // RemainingIterations once SCEV is able to evaluate non-canonical
3560 // vscale-based expressions.
3561 if (!ScalableRemIter) {
3562 // Handle the case where EffectiveVF and RemainingIterations are in
3563 // different numerical spaces.
3564 if (EffectiveVF.isScalable())
3565 EffectiveVF = ElementCount::getFixed(
3566 estimateElementCount(EffectiveVF, Config.getVScaleForTuning()));
3567 if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations))
3568 continue;
3569 }
3570
3571 if (Result.Width.isScalar() ||
3572 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
3573 /*IsEpilogue*/ true)) {
3574 Result = NextVF;
3575 BestPlan = &CurrentPlan;
3576 }
3577 }
3578
3579 if (!BestPlan)
3580 return nullptr;
3581
3582 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
3583 << Result.Width << "\n");
3584 std::unique_ptr<VPlan> Clone(BestPlan->duplicate());
3585 Clone->setVF(Result.Width);
3586 return Clone;
3587}
3588
// Heuristically selects the interleave count (IC) for the loop modelled by
// Plan when it is vectorized with VF. LoopCost is the precomputed cost of the
// loop for VF; a value of 0 means "not computed yet" and triggers the
// computation below. Returns 1 whenever interleaving is rejected.
//
// Outline of the visible logic:
//  - return 1 early for configurations that are not interleaved (no epilogue
//    allowed unless wide active lane masks are used with tail folding, not
//    safe for any vector width, early exits, FindLast reductions, free loop
//    body);
//  - derive a register-pressure bound per register class from
//    calculateRegisterUsageForPlan and keep the smallest one;
//  - clamp against TTI.getMaxInterleaveFactor (or the Force* overrides) and
//    against the best known trip count;
//  - finally choose between IC, a small-loop IC and load/store-port based
//    ICs, or give up and return 1.
//
// NOTE(review): this extract lacks listing line 3590, which carries the
// function name and leading parameters (presumably
// LoopVectorizationPlanner::selectInterleaveCount taking Plan and VF), as
// well as lines 3614-3615, 3632-3633, 3696, 3827-3828, 3830, 3834, 3867, 3887
// and 3905. Several conditions and initializers below therefore appear
// truncated -- confirm against the real source.
3589unsigned
3591 InstructionCost LoopCost) {
3592 // -- The interleave heuristics --
3593 // We interleave the loop in order to expose ILP and reduce the loop overhead.
3594 // There are many micro-architectural considerations that we can't predict
3595 // at this level. For example, frontend pressure (on decode or fetch) due to
3596 // code size, or the number and capabilities of the execution ports.
3597 //
3598 // We use the following heuristics to select the interleave count:
3599 // 1. If the code has reductions, then we interleave to break the cross
3600 // iteration dependency.
3601 // 2. If the loop is really small, then we interleave to reduce the loop
3602 // overhead.
3603 // 3. We don't interleave if we think that we will spill registers to memory
3604 // due to the increased register pressure.
3605
3606 // Only interleave tail-folded loops if wide lane masks are requested, as the
3607 // overhead of multiple instructions to calculate the predicate is likely
3608 // not beneficial. If an epilogue is not allowed for any other reason,
3609 // do not interleave.
3610 if (!CM.isEpilogueAllowed() &&
3611 !(CM.preferTailFoldedLoop() && CM.useWideActiveLaneMask()))
3612 return 1;
3613
// NOTE(review): the `if` header guarding the following block is missing from
// this extract (listing lines 3614-3615).
3616 LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
3617 "Unroll factor forced to be 1.\n");
3618 return 1;
3619 }
3620
3621 // We used the distance for the interleave count.
3622 if (!Legal->isSafeForAnyVectorWidth())
3623 return 1;
3624
3625 // We don't attempt to perform interleaving for loops with uncountable early
3626 // exits because the VPInstruction::AnyOf code cannot currently handle
3627 // multiple parts.
3628 if (Plan.hasEarlyExit())
3629 return 1;
3630
3631 const bool HasReductions =
// NOTE(review): the initializer of HasReductions is missing from this extract
// (listing lines 3632-3633).
3634
3635 // FIXME: implement interleaving for FindLast transform correctly.
3636 if (hasFindLastReductionPhi(Plan))
3637 return 1;
3638
3639 VPRegisterUsage R =
3640 calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
3641
3642 // If we did not calculate the cost for VF (because the user selected the VF)
3643 // then we calculate the cost of VF here.
3644 if (LoopCost == 0) {
3645 if (VF.isScalar())
3646 LoopCost = CM.expectedCost(VF);
3647 else
3648 LoopCost = cost(Plan, VF, &R);
3649 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
3650
3651 // Loop body is free and there is no need for interleaving.
3652 if (LoopCost == 0)
3653 return 1;
3654 }
3655
3656 // We divide by these constants so assume that we have at least one
3657 // instruction that uses at least one register.
3658 for (auto &Pair : R.MaxLocalUsers) {
3659 Pair.second = std::max(Pair.second, 1U);
3660 }
3661
3662 // We calculate the interleave count using the following formula.
3663 // Subtract the number of loop invariants from the number of available
3664 // registers. These registers are used by all of the interleaved instances.
3665 // Next, divide the remaining registers by the number of registers that is
3666 // required by the loop, in order to estimate how many parallel instances
3667 // fit without causing spills. All of this is rounded down if necessary to be
3668 // a power of two. We want power of two interleave count to simplify any
3669 // addressing operations or alignment considerations.
3670 // We also want power of two interleave counts to ensure that the induction
3671 // variable of the vector loop wraps to zero, when tail is folded by masking;
3672 // this currently happens when OptForSize, in which case IC is set to 1 above.
3673 unsigned IC = UINT_MAX;
3674
3675 for (const auto &Pair : R.MaxLocalUsers) {
3676 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
3677 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
3678 << " registers of "
3679 << TTI.getRegisterClassName(Pair.first)
3680 << " register class\n");
3681 if (VF.isScalar()) {
3682 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
3683 TargetNumRegisters = ForceTargetNumScalarRegs;
3684 } else {
3685 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
3686 TargetNumRegisters = ForceTargetNumVectorRegs;
3687 }
3688 unsigned MaxLocalUsers = Pair.second;
3689 unsigned LoopInvariantRegs = 0;
3690 if (R.LoopInvariantRegs.contains(Pair.first))
3691 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
3692
3693 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
3694 MaxLocalUsers);
3695 // Don't count the induction variable as interleaved.
// NOTE(review): the `if` header for the recomputation below is missing from
// this extract (listing line 3696).
3697 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
3698 std::max(1U, (MaxLocalUsers - 1)));
3699 }
3700
// Keep the most restrictive bound seen across all register classes.
3701 IC = std::min(IC, TmpIC);
3702 }
3703
3704 // Clamp the interleave ranges to reasonable counts.
3705 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
3706 LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
3707 << MaxInterleaveCount << "\n");
3708
3709 // Check if the user has overridden the max.
3710 if (VF.isScalar()) {
3711 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
3712 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
3713 } else {
3714 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
3715 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
3716 }
3717
3718 // Try to get the exact trip count, or an estimate based on profiling data or
3719 // ConstantMax from PSE, failing that.
3720 auto BestKnownTC =
3721 getSmallBestKnownTC(PSE, OrigLoop,
3722 /*CanUseConstantMax=*/true,
3723 /*CanExcludeZeroTrips=*/CM.isEpilogueAllowed());
3724
3725 // For fixed length VFs treat a scalable trip count as unknown.
3726 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
3727 // Re-evaluate trip counts and VFs to be in the same numerical space.
3728 unsigned AvailableTC =
3729 estimateElementCount(*BestKnownTC, Config.getVScaleForTuning());
3730 unsigned EstimatedVF =
3731 estimateElementCount(VF, Config.getVScaleForTuning());
3732
3733 // At least one iteration must be scalar when this constraint holds. So the
3734 // maximum available iterations for interleaving is one less.
3735 if (CM.requiresScalarEpilogue(VF.isVector()))
3736 --AvailableTC;
3737
// Conservative bound: the vector loop should run at least twice.
3738 unsigned InterleaveCountLB = bit_floor(std::max(
3739 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
3740
3741 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
3742 // If the best known trip count is exact, we select between two
3743 // prospective ICs, where
3744 //
3745 // 1) the aggressive IC is capped by the trip count divided by VF
3746 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
3747 //
3748 // The final IC is selected in a way that the epilogue loop trip count is
3749 // minimized while maximizing the IC itself, so that we either run the
3750 // vector loop at least once if it generates a small epilogue loop, or
3751 // else we run the vector loop at least twice.
3752
3753 unsigned InterleaveCountUB = bit_floor(std::max(
3754 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
3755 MaxInterleaveCount = InterleaveCountLB;
3756
3757 if (InterleaveCountUB != InterleaveCountLB) {
3758 unsigned TailTripCountUB =
3759 (AvailableTC % (EstimatedVF * InterleaveCountUB));
3760 unsigned TailTripCountLB =
3761 (AvailableTC % (EstimatedVF * InterleaveCountLB));
3762 // If both produce same scalar tail, maximize the IC to do the same work
3763 // in fewer vector loop iterations
3764 if (TailTripCountUB == TailTripCountLB)
3765 MaxInterleaveCount = InterleaveCountUB;
3766 }
3767 } else {
3768 // If trip count is an estimated compile time constant, limit the
3769 // IC to be capped by the trip count divided by VF * 2, such that the
3770 // vector loop runs at least twice to make interleaving seem profitable
3771 // when there is an epilogue loop present. Since exact Trip count is not
3772 // known we choose to be conservative in our IC estimate.
3773 MaxInterleaveCount = InterleaveCountLB;
3774 }
3775 }
3776
3777 assert(MaxInterleaveCount > 0 &&
3778 "Maximum interleave count must be greater than 0");
3779
3780 // Clamp the calculated IC to be between the 1 and the max interleave count
3781 // that the target and trip count allows.
3782 if (IC > MaxInterleaveCount)
3783 IC = MaxInterleaveCount;
3784 else
3785 // Make sure IC is greater than 0.
3786 IC = std::max(1u, IC);
3787
3788 assert(IC > 0 && "Interleave count must be greater than 0.");
3789
3790 // Interleave if we vectorized this loop and there is a reduction that could
3791 // benefit from interleaving.
3792 if (VF.isVector() && HasReductions) {
3793 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
3794 return IC;
3795 }
3796
3797 // For any scalar loop that either requires runtime checks or tail-folding we
3798 // are better off leaving this to the unroller. Note that if we've already
3799 // vectorized the loop we will have done the runtime check and so interleaving
3800 // won't require further checks.
3801 bool ScalarInterleavingRequiresPredication =
3802 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
3803 return Legal->blockNeedsPredication(BB);
3804 }));
3805 bool ScalarInterleavingRequiresRuntimePointerCheck =
3806 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
3807
3808 // We want to interleave small loops in order to reduce the loop overhead and
3809 // potentially expose ILP opportunities.
3810 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
3811 << "LV: IC is " << IC << '\n'
3812 << "LV: VF is " << VF << '\n');
3813 const bool AggressivelyInterleave =
3814 TTI.enableAggressiveInterleaving(HasReductions);
3815 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
3816 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
3817 // We assume that the cost overhead is 1 and we use the cost model
3818 // to estimate the cost of the loop and interleave until the cost of the
3819 // loop overhead is about 5% of the cost of the loop.
3820 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
3821 SmallLoopCost / LoopCost.getValue()));
3822
3823 // Interleave until store/load ports (estimated by max interleave count) are
3824 // saturated.
3825 unsigned NumStores = 0;
3826 unsigned NumLoads = 0;
// NOTE(review): the outer loop header over the plan's blocks and the `if`
// headers that classify widened loads/stores are missing from this extract
// (listing lines 3827-3828, 3830 and 3834).
3829 for (VPRecipeBase &R : *VPBB) {
3831 NumLoads++;
3832 continue;
3833 }
3835 NumStores++;
3836 continue;
3837 }
3838
3839 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
3840 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
3841 NumStores += StoreOps;
3842 else
3843 NumLoads += InterleaveR->getNumDefinedValues();
3844 continue;
3845 }
3846 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
3847 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
3848 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
3849 continue;
3850 }
3851 if (isa<VPHistogramRecipe>(&R)) {
3852 NumLoads++;
3853 NumStores++;
3854 continue;
3855 }
3856 }
3857 }
// Divide by 1 instead of 0 when the loop has no stores or no loads.
3858 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
3859 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
3860
3861 // There is little point in interleaving for reductions containing selects
3862 // and compares when VF=1 since it may just create more overhead than it's
3863 // worth for loops with small trip counts. This is because we still have to
3864 // do the final reduction after the loop.
3865 bool HasSelectCmpReductions =
3866 HasReductions &&
// NOTE(review): the call that applies the predicate below is missing from
// this extract (listing line 3867).
3868 [](VPRecipeBase &R) {
3869 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
3870 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
3871 RedR->getRecurrenceKind()) ||
3872 RecurrenceDescriptor::isFindIVRecurrenceKind(
3873 RedR->getRecurrenceKind()));
3874 });
3875 if (HasSelectCmpReductions) {
3876 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
3877 return 1;
3878 }
3879
3880 // If we have a scalar reduction (vector reductions are already dealt with
3881 // by this point), we can increase the critical path length if the loop
3882 // we're interleaving is inside another loop. For tree-wise reductions
3883 // set the limit to 2, and for ordered reductions it's best to disable
3884 // interleaving entirely.
3885 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
3886 bool HasOrderedReductions =
// NOTE(review): the call that applies the predicate below is missing from
// this extract (listing line 3887).
3888 [](VPRecipeBase &R) {
3889 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
3890
3891 return RedR && RedR->isOrdered();
3892 });
3893 if (HasOrderedReductions) {
3894 LLVM_DEBUG(
3895 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
3896 return 1;
3897 }
3898
// Cap every candidate IC by the nested scalar reduction limit.
3899 unsigned F = MaxNestedScalarReductionIC;
3900 SmallIC = std::min(SmallIC, F);
3901 StoresIC = std::min(StoresIC, F);
3902 LoadsIC = std::min(LoadsIC, F);
3903 }
3904
// NOTE(review): the first part of this `if` condition is missing from this
// extract (listing line 3905).
3906 std::max(StoresIC, LoadsIC) > SmallIC) {
3907 LLVM_DEBUG(
3908 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
3909 return std::max(StoresIC, LoadsIC);
3910 }
3911
3912 // If there are scalar reductions and TTI has enabled aggressive
3913 // interleaving for reductions, we will interleave to expose ILP.
3914 if (VF.isScalar() && AggressivelyInterleave) {
3915 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3916 // Interleave no less than SmallIC but not as aggressive as the normal IC
3917 // to satisfy the rare situation when resources are too limited.
3918 return std::max(IC / 2, SmallIC);
3919 }
3920
3921 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
3922 return SmallIC;
3923 }
3924
3925 // Interleave if this is a large loop (small loops are already dealt with by
3926 // this point) that could benefit from interleaving.
3927 if (AggressivelyInterleave) {
3928 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3929 return IC;
3930 }
3931
3932 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
3933 return 1;
3934}
3935
// Decides whether the artificially high "hack" cost should be used for the
// emulated masked memory access I at vector factor VF. The visible body
// returns true for every load, and for a store only once NumPredStores
// exceeds NumberOfStoresToPredicate.
// NOTE(review): listing line 3936 (the declarator, presumably
// LoopVectorizationCostModel::useEmulatedMaskMemRefHack with a leading
// Instruction *I parameter) and line 3946 (the opening of the assert below)
// are missing from this extract -- confirm against the real source.
3937 ElementCount VF) {
3938 // TODO: Cost model for emulated masked load/store is completely
3939 // broken. This hack guides the cost model to use an artificially
3940 // high enough value to practically disable vectorization with such
3941 // operations, except where previously deployed legality hack allowed
3942 // using very low cost values. This is to avoid regressions coming simply
3943 // from moving "masked load/store" check from legality to cost model.
3944 // Masked Load/Gather emulation was previously never allowed.
3945 // Limited number of Masked Store/Scatter emulation was allowed.
3947 "Expecting a scalar emulated instruction");
// Loads always take the hack cost; stores only past the configured limit.
3948 return isa<LoadInst>(I) ||
3949 (isa<StoreInst>(I) &&
3950 NumPredStores > NumberOfStoresToPredicate);
3951}
3952
// For the vector factor VF, walks the blocks of TheLoop and, for every
// instruction that is scalar with predication, decides via
// computePredInstDiscount whether scalarizing it (and the chain feeding it)
// is at least as cheap as vectorizing it. Results are recorded in
// InstsToScalarize[VF], in CallWideningDecisions for scalarized calls, and in
// PredicatedBBsAfterVectorization[VF] for blocks that remain predicated.
// NOTE(review): listing line 3953 (the declarator, presumably
// LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF)),
// line 3973 (the block filter preceding `continue`) and line 3984 (one
// operand of the discount condition) are missing from this extract.
3954 assert(VF.isVector() && "Expected VF >= 2");
3955
3956 // If we've already collected the instructions to scalarize or the predicated
3957 // BBs after vectorization, there's nothing to do. Collection may already have
3958 // occurred if we have a user-selected VF and are now computing the expected
3959 // cost for interleaving.
3960 if (InstsToScalarize.contains(VF) ||
3961 PredicatedBBsAfterVectorization.contains(VF))
3962 return;
3963
3964 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
3965 // not profitable to scalarize any instructions, the presence of VF in the
3966 // map will indicate that we've analyzed it already.
3967 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
3968
3969 // Find all the instructions that are scalar with predication in the loop and
3970 // determine if it would be better to not if-convert the blocks they are in.
3971 // If so, we also record the instructions to scalarize.
3972 for (BasicBlock *BB : TheLoop->blocks()) {
3974 continue;
3975 for (Instruction &I : *BB)
3976 if (isScalarWithPredication(&I, VF)) {
3977 ScalarCostsTy ScalarCosts;
3978 // Do not apply discount logic for:
3979 // 1. Scalars after vectorization, as there will only be a single copy
3980 // of the instruction.
3981 // 2. Scalable VF, as that would lead to invalid scalarization costs.
3982 // 3. Emulated masked memrefs, if a hacked cost is needed.
3983 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
3985 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
// Note: the structured bindings below shadow the outer loop variable I.
3986 for (const auto &[I, IC] : ScalarCosts)
3987 ScalarCostsVF.insert({I, IC});
3988 // Check if we decided to scalarize a call. If so, update the widening
3989 // decision of the call to CM_Scalarize with the computed scalar cost.
3990 for (const auto &[I, Cost] : ScalarCosts) {
3991 auto *CI = dyn_cast<CallInst>(I);
3992 if (!CI || !CallWideningDecisions.contains({CI, VF}))
3993 continue;
3994 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
3995 CallWideningDecisions[{CI, VF}].Cost = Cost;
3996 }
3997 }
3998 // Remember that BB will remain after vectorization.
3999 PredicatedBBsAfterVectorization[VF].insert(BB);
// Predecessors whose only successor is BB are recorded as well.
4000 for (auto *Pred : predecessors(BB)) {
4001 if (Pred->getSingleSuccessor() == BB)
4002 PredicatedBBsAfterVectorization[VF].insert(Pred);
4003 }
4004 }
4005 }
4006}
4007
// Computes the cost discount of scalarizing PredInst, together with the
// single-use instruction chain feeding it, instead of vectorizing it with VF.
//
// Every analyzed instruction is entered into ScalarCosts with its scalar cost
// (including insert/extract/phi overhead and scaled by
// getPredBlockCostDivisor). The returned value is the sum of
// VectorCost - ScalarCost over those instructions; a non-negative result
// means scalarizing is considered beneficial.
//
// NOTE(review): listing lines 4020 (declaration of Worklist), 4030 (last
// operand of the first CanBeScalarized condition), 4081 (initializer of
// ScalarCost) and 4089 (leading arguments of the first
// getScalarizationOverhead call) are missing from this extract.
4008InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4009 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4010 assert(!isUniformAfterVectorization(PredInst, VF) &&
4011 "Instruction marked uniform-after-vectorization will be predicated");
4012
4013 // Initialize the discount to zero, meaning that the scalar version and the
4014 // vector version cost the same.
4015 InstructionCost Discount = 0;
4016
4017 // Holds instructions to analyze. The instructions we visit are mapped in
4018 // ScalarCosts. Those instructions are the ones that would be scalarized if
4019 // we find that the scalar version costs less.
4021
4022 // Returns true if the given instruction can be scalarized.
4023 auto CanBeScalarized = [&](Instruction *I) -> bool {
4024 // We only attempt to scalarize instructions forming a single-use chain
4025 // from the original predicated block that would otherwise be vectorized.
4026 // Although not strictly necessary, we give up on instructions we know will
4027 // already be scalar to avoid traversing chains that are unlikely to be
4028 // beneficial.
4029 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4031 return false;
4032
4033 // If the instruction is scalar with predication, it will be analyzed
4034 // separately. We ignore it within the context of PredInst.
4035 if (isScalarWithPredication(I, VF))
4036 return false;
4037
4038 // If any of the instruction's operands are uniform after vectorization,
4039 // the instruction cannot be scalarized. This prevents, for example, a
4040 // masked load from being scalarized.
4041 //
4042 // We assume we will only emit a value for lane zero of an instruction
4043 // marked uniform after vectorization, rather than VF identical values.
4044 // Thus, if we scalarize an instruction that uses a uniform, we would
4045 // create uses of values corresponding to the lanes we aren't emitting code
4046 // for. This behavior can be changed by allowing getScalarValue to clone
4047 // the lane zero values for uniforms rather than asserting.
4048 for (Use &U : I->operands())
4049 if (auto *J = dyn_cast<Instruction>(U.get()))
4050 if (isUniformAfterVectorization(J, VF))
4051 return false;
4052
4053 // Otherwise, we can scalarize the instruction.
4054 return true;
4055 };
4056
4057 // Compute the expected cost discount from scalarizing the entire expression
4058 // feeding the predicated instruction. We currently only consider expressions
4059 // that are single-use instruction chains.
4060 Worklist.push_back(PredInst);
4061 while (!Worklist.empty()) {
4062 Instruction *I = Worklist.pop_back_val();
4063
4064 // If we've already analyzed the instruction, there's nothing to do.
4065 if (ScalarCosts.contains(I))
4066 continue;
4067
4068 // Cannot scalarize fixed-order recurrence phis at the moment.
4069 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
4070 continue;
4071
4072 // Compute the cost of the vector instruction. Note that this cost already
4073 // includes the scalarization overhead of the predicated instruction.
4074 InstructionCost VectorCost = getInstructionCost(I, VF);
4075
4076 // Compute the cost of the scalarized instruction. This cost is the cost of
4077 // the instruction as if it wasn't if-converted and instead remained in the
4078 // predicated block. We will scale this cost by block probability after
4079 // computing the scalarization overhead.
4080 InstructionCost ScalarCost =
4082
4083 // Compute the scalarization overhead of needed insertelement instructions
4084 // and phi nodes.
4085 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
4086 Type *WideTy = toVectorizedTy(I->getType(), VF);
4087 for (Type *VectorTy : getContainedTypes(WideTy)) {
4088 ScalarCost += TTI.getScalarizationOverhead(
4090 /*Insert=*/true,
4091 /*Extract=*/false, Config.CostKind);
4092 }
// One PHI per lane is charged for the scalarized result.
4093 ScalarCost += VF.getFixedValue() *
4094 TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
4095 }
4096
4097 // Compute the scalarization overhead of needed extractelement
4098 // instructions. For each of the instruction's operands, if the operand can
4099 // be scalarized, add it to the worklist; otherwise, account for the
4100 // overhead.
4101 for (Use &U : I->operands())
4102 if (auto *J = dyn_cast<Instruction>(U.get())) {
4103 assert(canVectorizeTy(J->getType()) &&
4104 "Instruction has non-scalar type");
4105 if (CanBeScalarized(J))
4106 Worklist.push_back(J);
4107 else if (needsExtract(J, VF)) {
4108 Type *WideTy = toVectorizedTy(J->getType(), VF);
4109 for (Type *VectorTy : getContainedTypes(WideTy)) {
4110 ScalarCost += TTI.getScalarizationOverhead(
4111 cast<VectorType>(VectorTy),
4112 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
4113 /*Extract*/ true, Config.CostKind);
4114 }
4115 }
4116 }
4117
4118 // Scale the total scalar cost by block probability.
4119 ScalarCost /= getPredBlockCostDivisor(Config.CostKind, I->getParent());
4120
4121 // Compute the discount. A non-negative discount means the vector version
4122 // of the instruction costs more, and scalarizing would be beneficial.
4123 Discount += VectorCost - ScalarCost;
4124 ScalarCosts[I] = ScalarCost;
4125 }
4126
4127 return Discount;
4128}
4129
// Sums the per-instruction costs of every block of TheLoop for a scalar VF,
// dividing each block's total by getPredBlockCostDivisor before adding it to
// the returned Cost. Instructions in ValuesToIgnore are skipped.
// NOTE(review): listing lines 4130-4131 (the declarator, presumably
// LoopVectorizationCostModel::expectedCost(ElementCount VF), and the
// declaration of Cost), line 4145 (the definition of the per-instruction cost
// C) and line 4149 (the override applied when ForceTargetInstructionCost is
// set) are missing from this extract -- confirm against the real source.
4132 assert(VF.isScalar() && "must only be called for scalar VFs");
4133
4134 // For each block.
4135 for (BasicBlock *BB : TheLoop->blocks()) {
4136 InstructionCost BlockCost;
4137
4138 // For each instruction in the old loop.
4139 for (Instruction &I : *BB) {
4140 // Skip ignored values.
// The VF.isVector() arm cannot fire here given the scalar-VF assert above.
4141 if (ValuesToIgnore.count(&I) ||
4142 (VF.isVector() && VecValuesToIgnore.count(&I)))
4143 continue;
4144
4146
4147 // Check if we should override the cost.
4148 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
4150
4151 BlockCost += C;
4152 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
4153 << VF << " For instruction: " << I << '\n');
4154 }
4155
4156 // In the scalar loop, we may not always execute the predicated block, if it
4157 // is an if-else block. Thus, scale the block's cost by the probability of
4158 // executing it. getPredBlockCostDivisor will return 1 for blocks that are
4159 // only predicated by the header mask when folding the tail.
4160 Cost += BlockCost / getPredBlockCostDivisor(Config.CostKind, BB);
4161 }
4162
4163 return Cost;
4164}
4165
4166/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
4167/// according to isAddressSCEVForCost.
4168///
4169/// This SCEV can be sent to the Target in order to estimate the address
4170/// calculation cost.
///
/// Returns the SCEV that PSE.getSCEV(Ptr) produces when
/// vputils::isAddressSCEVForCost accepts it for TheLoop, and nullptr
/// otherwise.
/// NOTE(review): listing lines 4171 and 4173 (the declarator and one
/// parameter, presumably the PSE object used below) are missing from this
/// extract -- confirm against the real source.
4172 Value *Ptr,
4174 const Loop *TheLoop) {
4175 const SCEV *Addr = PSE.getSCEV(Ptr);
4176 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
4177 : nullptr;
4178}
4179
// Estimates the cost of scalarizing the load/store I for the vector factor
// VF. The visible computation adds, per lane (VF.getFixedValue() times), the
// address computation cost and the scalar memory-op cost, then the
// scalarization overhead from getScalarizationOverhead. For predicated
// instructions the total is divided by getPredBlockCostDivisor and extended
// with the cost of extracting the i1 mask lanes and a conditional branch; one
// guarded path replaces the result with the constant 3000000.
// NOTE(review): listing lines 4180 (return type), 4186 (the statement taken
// for scalable VF), 4192 (definition of Ptr), 4202 (declaration of Cost),
// 4226 (construction of VecI1Ty) and 4232 (the `if` guarding the 3000000
// assignment) are missing from this extract -- confirm against the real
// source.
4181LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
4182 ElementCount VF) {
4183 assert(VF.isVector() &&
4184 "Scalarization cost of instruction implies vectorization.");
4185 if (VF.isScalable())
4187
4188 Type *ValTy = getLoadStoreType(I);
4189 auto *SE = PSE.getSE();
4190
4191 unsigned AS = getLoadStoreAddressSpace(I);
4193 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
4194 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
4195 // that it is being called from this specific place.
4196
4197 // Figure out whether the access is strided and get the stride value
4198 // if it's known in compile time
4199 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
4200
4201 // Get the cost of the scalar memory instruction and address computation.
4203 VF.getFixedValue() *
4204 TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV, Config.CostKind);
4205
4206 // Don't pass *I here, since it is scalar but will actually be part of a
4207 // vectorized loop where the user of it is a vectorized instruction.
4208 const Align Alignment = getLoadStoreAlignment(I);
4209 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4210 Cost += VF.getFixedValue() *
4211 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
4212 AS, Config.CostKind, OpInfo);
4213
4214 // Get the overhead of the extractelement and insertelement instructions
4215 // we might create due to scalarization.
4216 Cost += getScalarizationOverhead(I, VF);
4217
4218 // If we have a predicated load/store, it will need extra i1 extracts and
4219 // conditional branches, but may not be executed for each vector lane. Scale
4220 // the cost by the probability of executing the predicated block.
4221 if (isPredicatedInst(I)) {
4222 Cost /= getPredBlockCostDivisor(Config.CostKind, I->getParent());
4223
4224 // Add the cost of an i1 extract and a branch
4225 auto *VecI1Ty =
4227 Cost += TTI.getScalarizationOverhead(
4228 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
4229 /*Insert=*/false, /*Extract=*/true, Config.CostKind);
4230 Cost += TTI.getCFInstrCost(Instruction::CondBr, Config.CostKind);
4231
4233 // Artificially setting to a high enough value to practically disable
4234 // vectorization with such operations.
4235 Cost = 3000000;
4236 }
4237
4238 return Cost;
4239}
4240
// Estimates the cost of widening the consecutive load/store I to VF lanes.
// The access must have stride 1 or -1 (asserted). When a mask is required the
// masked_load/masked_store intrinsic cost is used, otherwise the plain vector
// memory-op cost; a negative stride additionally pays for an SK_Reverse
// shuffle of the vector.
// NOTE(review): listing lines 4241 (return type), 4246 (definition of Ptr)
// and 4253 (declaration of Cost) are missing from this extract -- confirm
// against the real source.
4242LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
4243 ElementCount VF) {
4244 Type *ValTy = getLoadStoreType(I);
4245 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4247 unsigned AS = getLoadStoreAddressSpace(I);
4248 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
4249
4250 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4251 "Stride should be 1 or -1 for consecutive memory access");
4252 const Align Alignment = getLoadStoreAlignment(I);
4254 if (isMaskRequired(I)) {
4255 unsigned IID = I->getOpcode() == Instruction::Load
4256 ? Intrinsic::masked_load
4257 : Intrinsic::masked_store;
4258 Cost += TTI.getMemIntrinsicInstrCost(
4259 MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS),
4260 Config.CostKind);
4261 } else {
4262 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4263 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
4264 Config.CostKind, OpInfo, I);
4265 }
4266
// A stride of -1 means the loaded/stored vector has to be reversed.
4267 bool Reverse = ConsecutiveStride < 0;
4268 if (Reverse)
4269 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
4270 VectorTy, {}, Config.CostKind, 0);
4271 return Cost;
4272}
4273
// Estimates the cost of the uniform memory operation I for VF (asserted via
// Legal->isUniformMemOp).
//  - Load: address computation + one scalar load of ValTy + an SK_Broadcast
//    shuffle into VectorTy.
//  - Store: address computation + one scalar store of ValTy, plus the cost of
//    extracting an element (indexed from the end of the vector) when the
//    stored value is not loop-invariant.
// NOTE(review): listing lines 4274 (return type), 4280 (definition of PtrTy)
// and 4299 (declaration of Cost for the store path) are missing from this
// extract -- confirm against the real source.
4275LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
4276 ElementCount VF) {
4277 assert(Legal->isUniformMemOp(*I, VF));
4278
4279 Type *ValTy = getLoadStoreType(I);
4281 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4282 const Align Alignment = getLoadStoreAlignment(I);
4283 unsigned AS = getLoadStoreAddressSpace(I);
4284 if (isa<LoadInst>(I)) {
4285 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4286 Config.CostKind) +
4287 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
4288 Config.CostKind) +
4289 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
4290 VectorTy, {}, Config.CostKind);
4291 }
4292 StoreInst *SI = cast<StoreInst>(I);
4293
4294 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
4295 // TODO: We have existing tests that request the cost of extracting element
4296 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
4297 // the actual generated code, which involves extracting the last element of
4298 // a scalable vector where the lane to extract is unknown at compile time.
4300 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Config.CostKind) +
4301 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
4302 Config.CostKind);
// A loop-varying stored value must first be extracted from the vector.
4303 if (!IsLoopInvariantStoreValue)
4304 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
4305 VectorTy, Config.CostKind, 0);
4306 return Cost;
4307}
4308
// Estimates the cost of implementing the load/store I as a masked gather
// (loads) or masked scatter (stores) of VF lanes: the address computation
// cost plus the corresponding memory intrinsic cost. The pointer type is
// widened to a vector unless Legal->isUniform(Ptr, VF) holds.
// NOTE(review): listing lines 4309 (return type) and 4315 (definition of Ptr)
// are missing from this extract -- confirm against the real source.
4310LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
4311 ElementCount VF) {
4312 Type *ValTy = getLoadStoreType(I);
4313 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4314 const Align Alignment = getLoadStoreAlignment(I);
4316 Type *PtrTy = Ptr->getType();
4317
4318 if (!Legal->isUniform(Ptr, VF))
4319 PtrTy = toVectorTy(PtrTy, VF);
4320
4321 unsigned IID = I->getOpcode() == Instruction::Load
4322 ? Intrinsic::masked_gather
4323 : Intrinsic::masked_scatter;
4324 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4325 Config.CostKind) +
4326 TTI.getMemIntrinsicInstrCost(
4327 MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
4328 Alignment, I),
4329 Config.CostKind);
4330}
4331
// Returns the cost of the whole interleaved access group that I belongs to,
// at vectorization factor VF. The cost is the TTI interleaved-memory-op cost
// for a wide vector of VF * factor elements, plus one reverse shuffle per
// group member when the group is reversed.
// NOTE(review): the return-type line and the first line of the assert inside
// the reverse branch are not visible in this listing.
4333LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
4334 ElementCount VF) {
4335 const auto *Group = getInterleavedAccessGroup(I);
4336 assert(Group && "Fail to get an interleaved access group.");
4337
// Type, address space and opcode are taken from the group's insert position,
// not from I itself.
4338 Instruction *InsertPos = Group->getInsertPos();
4339 Type *ValTy = getLoadStoreType(InsertPos);
4340 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4341 unsigned AS = getLoadStoreAddressSpace(InsertPos);
4342
4343 unsigned InterleaveFactor = Group->getFactor();
4344 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4345
4346 // Holds the indices of existing members in the interleaved group.
4347 SmallVector<unsigned, 4> Indices;
4348 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4349 if (Group->getMember(IF))
4350 Indices.push_back(IF);
4351
4352 // Calculate the cost of the whole interleaved group.
// Gaps are masked when the group requires a scalar epilogue that is not
// allowed, or when I is a store and the group is not full.
4353 bool UseMaskForGaps =
4354 (Group->requiresScalarEpilogue() && !isEpilogueAllowed()) ||
4355 (isa<StoreInst>(I) && !Group->isFull());
4356 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
4357 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
4358 Group->getAlign(), AS, Config.CostKind, isMaskRequired(I),
4359 UseMaskForGaps);
4360
4361 if (Group->isReverse()) {
4362 // TODO: Add support for reversed masked interleaved access.
4364 "Reverse masked interleaved access not supported.");
// One SK_Reverse shuffle of the per-member vector type for each member.
4365 Cost += Group->getNumMembers() *
4366 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
4367 VectorTy, {}, Config.CostKind, 0);
4368 }
4369 return Cost;
4370}
4371
// Tries to cost instruction I as part of an in-loop reduction pattern at
// vectorization factor VF; Ty must be a vector type for any pattern to match.
// Returns:
//  - std::nullopt when there are no in-loop reductions, VF is scalar, Ty is
//    not a vector type, or I does not lead to an in-loop reduction chain; the
//    caller is expected to fall back to its regular cost.
//  - the base reduction cost when ordered reductions are used.
//  - when a fused pattern is cheaper than its components: the fused cost if I
//    is the reduction instruction itself (I == RetI), and 0 for the other
//    instructions of the pattern.
//  - otherwise the base reduction cost if I == RetI, and std::nullopt if not.
// NOTE(review): several lines are not visible in this listing: the line
// carrying the function name (presumably getReductionPatternCost, as used by
// a caller later in this file), the condition that selects the min/max branch
// and defines MinMaxID, the rest of the RedOp initializer, and part of the
// first match() pattern. Confirm against the full source.
4372std::optional<InstructionCost>
4374 ElementCount VF,
4375 Type *Ty) const {
4376 using namespace llvm::PatternMatch;
4377 // Early exit for no inloop reductions
4378 if (Config.getInLoopReductions().empty() || VF.isScalar() ||
4379 !isa<VectorType>(Ty))
4380 return std::nullopt;
4381 auto *VectorTy = cast<VectorType>(Ty);
4382
4383 // We are looking for a pattern of, and finding the minimal acceptable cost:
4384 // reduce(mul(ext(A), ext(B))) or
4385 // reduce(mul(A, B)) or
4386 // reduce(ext(A)) or
4387 // reduce(A).
4388 // The basic idea is that we walk down the tree to do that, finding the root
4389 // reduction instruction in InLoopReductionImmediateChains. From there we find
4390 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
4391 // of the components. If the reduction cost is lower then we return it for the
4392 // reduction instruction and 0 for the other instructions in the pattern. If
4393 // it is not we return an invalid cost specifying the original cost method
4394 // should be used.
4395 Instruction *RetI = I;
// Step from an extend to its single user; an extend with more than one user
// cannot be folded into the pattern.
4396 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
4397 if (!RetI->hasOneUser())
4398 return std::nullopt;
4399 RetI = RetI->user_back();
4400 }
4401
// Step from a single-use mul to the add that consumes it.
4402 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
4403 RetI->user_back()->getOpcode() == Instruction::Add) {
4404 RetI = RetI->user_back();
4405 }
4406
4407 // Test if the found instruction is a reduction, and if not return an invalid
4408 // cost specifying the parent to use the original cost modelling.
4409 Instruction *LastChain = Config.getInLoopReductionImmediateChain(RetI);
4410 if (!LastChain)
4411 return std::nullopt;
4412
4413 // Find the reduction this chain is a part of and calculate the basic cost of
4414 // the reduction on its own.
4415 Instruction *ReductionPhi = LastChain;
4416 while (!isa<PHINode>(ReductionPhi))
4417 ReductionPhi = Config.getInLoopReductionImmediateChain(ReductionPhi);
4418
4419 const RecurrenceDescriptor &RdxDesc =
4420 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
4421
4422 InstructionCost BaseCost;
4423 RecurKind RK = RdxDesc.getRecurrenceKind();
4426 BaseCost = TTI.getMinMaxReductionCost(
4427 MinMaxID, VectorTy, RdxDesc.getFastMathFlags(), Config.CostKind);
4428 } else {
4429 BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy,
4430 RdxDesc.getFastMathFlags(),
4431 Config.CostKind);
4432 }
4433
4434 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
4435 // normal fmul instruction to the cost of the fadd reduction.
4436 if (RK == RecurKind::FMulAdd)
4437 BaseCost += TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy,
4438 Config.CostKind);
4439
4440 // If we're using ordered reductions then we can just return the base cost
4441 // here, since getArithmeticReductionCost calculates the full ordered
4442 // reduction cost when FP reassociation is not allowed.
4443 if (Config.useOrderedReductions(RdxDesc))
4444 return BaseCost;
4445
4446 // Get the operand that was not the reduction chain and match it to one of the
4447 // patterns, returning the better cost if it is found.
4448 Instruction *RedOp = RetI->getOperand(1) == LastChain
4451
// From here on VectorTy uses the element type of I's first operand, keeping
// the element count of the original VectorTy.
4452 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
4453
4454 Instruction *Op0, *Op1;
4455 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4456 match(RedOp,
4458 match(Op0, m_ZExtOrSExt(m_Value())) &&
4459 Op0->getOpcode() == Op1->getOpcode() &&
4460 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
4461 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
4462 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
4463
4464 // Matched reduce.add(ext(mul(ext(A), ext(B))))
4465 // Note that the extend opcodes need to all match, or if A==B they will have
4466 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
4467 // which is equally fine.
4468 bool IsUnsigned = isa<ZExtInst>(Op0);
4469 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
4470 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
4471
4472 InstructionCost ExtCost =
4473 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
4474 TTI::CastContextHint::None, Config.CostKind, Op0);
4475 InstructionCost MulCost =
4476 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, Config.CostKind);
4477 InstructionCost Ext2Cost = TTI.getCastInstrCost(
4478 RedOp->getOpcode(), VectorTy, MulType, TTI::CastContextHint::None,
4479 Config.CostKind, RedOp);
4480
4481 InstructionCost RedCost = TTI.getMulAccReductionCost(
4482 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
4483 Config.CostKind);
4484
// The fused cost is compared against two inner extends, the mul, the outer
// extend and the plain reduction.
4485 if (RedCost.isValid() &&
4486 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
4487 return I == RetI ? RedCost : 0;
4488 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
4489 !TheLoop->isLoopInvariant(RedOp)) {
4490 // Matched reduce(ext(A))
4491 bool IsUnsigned = isa<ZExtInst>(RedOp);
4492 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
4493 InstructionCost RedCost = TTI.getExtendedReductionCost(
4494 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
4495 RdxDesc.getFastMathFlags(), Config.CostKind);
4496
4497 InstructionCost ExtCost = TTI.getCastInstrCost(
4498 RedOp->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None,
4499 Config.CostKind, RedOp);
4500 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
4501 return I == RetI ? RedCost : 0;
4502 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4503 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
4504 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
4505 Op0->getOpcode() == Op1->getOpcode() &&
4506 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
4507 bool IsUnsigned = isa<ZExtInst>(Op0);
4508 Type *Op0Ty = Op0->getOperand(0)->getType();
4509 Type *Op1Ty = Op1->getOperand(0)->getType();
4510 Type *LargestOpTy =
4511 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
4512 : Op0Ty;
4513 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
4514
4515 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
4516 // different sizes. We take the largest type as the ext to reduce, and add
4517 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
4518 InstructionCost ExtCost0 = TTI.getCastInstrCost(
4519 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
4520 TTI::CastContextHint::None, Config.CostKind, Op0);
4521 InstructionCost ExtCost1 = TTI.getCastInstrCost(
4522 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
4523 TTI::CastContextHint::None, Config.CostKind, Op1);
4524 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4525 Instruction::Mul, VectorTy, Config.CostKind);
4526
4527 InstructionCost RedCost = TTI.getMulAccReductionCost(
4528 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
4529 Config.CostKind);
// When the operand source types differ, charge an extra extend that brings
// the narrower operand up to the larger source type.
4530 InstructionCost ExtraExtCost = 0;
4531 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
4532 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
4533 ExtraExtCost = TTI.getCastInstrCost(
4534 ExtraExtOp->getOpcode(), ExtType,
4535 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
4536 TTI::CastContextHint::None, Config.CostKind, ExtraExtOp);
4537 }
4538
// Note that the returned cost is RedCost alone; ExtraExtCost only takes part
// in the profitability comparison.
4539 if (RedCost.isValid() &&
4540 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
4541 return I == RetI ? RedCost : 0;
4542 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
4543 // Matched reduce.add(mul())
4544 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4545 Instruction::Mul, VectorTy, Config.CostKind);
4546
4547 InstructionCost RedCost = TTI.getMulAccReductionCost(
4548 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
4549 Config.CostKind);
4550
4551 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
4552 return I == RetI ? RedCost : 0;
4553 }
4554 }
4555
// No cheaper fused pattern: only the reduction instruction itself gets the
// base cost; every other instruction is costed by the caller.
4556 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
4557}
4558
// Returns the cost of the memory instruction I at vectorization factor VF.
// For a scalar VF the cost is computed here as address computation plus the
// scalar memory operation; for vector VFs the result of
// getWideningCost(I, VF) is returned.
// NOTE(review): the return-type line and the declaration of `PtrTy` are not
// visible in this listing -- confirm against the full source.
4560LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
4561 ElementCount VF) {
4562 // Calculate scalar cost only. Vectorization cost should be ready at this
4563 // moment.
4564 if (VF.isScalar()) {
4565 Type *ValTy = getLoadStoreType(I);
4567 const Align Alignment = getLoadStoreAlignment(I);
4568 unsigned AS = getLoadStoreAddressSpace(I);
4569
// Operand info is taken from operand 0 of I and forwarded to the memory-op
// cost query.
4570 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
4571 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4572 Config.CostKind) +
4573 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
4574 Config.CostKind, OpInfo, I);
4575 }
4576 return getWideningCost(I, VF);
4577}
4578
// Estimates the overhead of scalarizing instruction I at vectorization
// factor VF: the cost of inserting the scalar results into the vector result
// type(s), plus the TTI operand-scalarization overhead for the operands that
// need extracting. Returns 0 for a scalar VF.
// NOTE(review): many lines of this function are not visible in this listing,
// including the return-type line, the statement under the isScalable() check,
// the declarations of `Cost`, `VIC`, `Tys` and `OperandVIC`, the bodies of
// the load/store branches that presumably set `VIC`, and the first argument
// line of the inner getScalarizationOverhead call. Confirm against the full
// source before relying on details.
4580LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
4581 ElementCount VF) const {
4582
4583 // There is no mechanism yet to create a scalable scalarization loop,
4584 // so this is currently Invalid.
4585 if (VF.isScalable())
4587
4588 if (VF.isScalar())
4589 return 0;
4590
4592 Type *RetTy = toVectorizedTy(I->getType(), VF);
// Result insertion is skipped for void results, and for loads on targets
// that support efficient vector element load/store.
4593 if (!RetTy->isVoidTy() &&
4594 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
4595
4597 if (isa<LoadInst>(I))
4599 else if (isa<StoreInst>(I))
4601
// Insert-only overhead (no extracts) for each contained vector type of the
// result.
4602 for (Type *VectorTy : getContainedTypes(RetTy)) {
4603 Cost += TTI.getScalarizationOverhead(
4605 /*Insert=*/true, /*Extract=*/false, Config.CostKind,
4606 /*ForPoisonSrc=*/true, {}, VIC);
4607 }
4608 }
4609
4610 // Some targets keep addresses scalar.
4611 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
4612 return Cost;
4613
4614 // Some targets support efficient element stores.
4615 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
4616 return Cost;
4617
4618 // Collect operands to consider.
// For calls only the call arguments are considered, not all operands.
4619 CallInst *CI = dyn_cast<CallInst>(I);
4620 Instruction::op_range Ops = CI ? CI->args() : I->operands();
4621
4622 // Skip operands that do not require extraction/scalarization and do not incur
4623 // any overhead.
4625 for (auto *V : filterExtractingOperands(Ops, VF))
4626 Tys.push_back(maybeVectorizeType(V->getType(), VF));
4627
4631 return Cost +
4632 TTI.getOperandsScalarizationOverhead(Tys, Config.CostKind, OperandVIC);
4633}
4634
// Body of the routine that, for a vector VF, chooses and records a widening
// decision (widen, reverse widen, interleave, gather/scatter or scalarize)
// together with its cost for the memory accesses in TheLoop. Afterwards,
// unless the target prefers vectorized addressing, it forces the
// instructions that feed addresses to stay scalar and updates the affected
// decisions. Does nothing for a scalar VF.
// NOTE(review): the signature line is not visible in this listing; the
// function is presumably LoopVectorizationCostModel::
// setCostBasedWideningDecision(ElementCount VF) -- confirm. Several interior
// lines are missing as well (e.g. the declarations of `Ptr`, `InterleaveCost`,
// `Cost`, `AddrDefs` and `Worklist`, the fallback arms of some conditional
// expressions, and the conditions inside UpdateMemOpUserCost).
4636 if (VF.isScalar())
4637 return;
4638 NumPredStores = 0;
4639 for (BasicBlock *BB : TheLoop->blocks()) {
4640 // For each instruction in the old loop.
4641 for (Instruction &I : *BB) {
// Instructions for which `Ptr` is null are skipped; presumably Ptr is the
// load/store pointer operand, so this filters out non-memory instructions --
// confirm.
4643 if (!Ptr)
4644 continue;
4645
4646 // TODO: We should generate better code and update the cost model for
4647 // predicated uniform stores. Today they are treated as any other
4648 // predicated store (see added test cases in
4649 // invariant-store-vectorization.ll).
4651 NumPredStores++;
4652
4653 if (Legal->isUniformMemOp(I, VF)) {
4654 auto IsLegalToScalarize = [&]() {
4655 if (!VF.isScalable())
4656 // Scalarization of fixed length vectors "just works".
4657 return true;
4658
4659 // We have dedicated lowering for unpredicated uniform loads and
4660 // stores. Note that even with tail folding we know that at least
4661 // one lane is active (i.e. generalized predication is not possible
4662 // here), and the logic below depends on this fact.
4663 if (!foldTailByMasking())
4664 return true;
4665
4666 // For scalable vectors, a uniform memop load is always
4667 // uniform-by-parts and we know how to scalarize that.
4668 if (isa<LoadInst>(I))
4669 return true;
4670
4671 // A uniform store isn't necessarily uniform-by-part
4672 // and we can't assume scalarization.
4673 auto &SI = cast<StoreInst>(I);
4674 return TheLoop->isLoopInvariant(SI.getValueOperand());
4675 };
4676
4677 const InstructionCost GatherScatterCost =
4678 Config.isLegalGatherOrScatter(&I, VF)
4679 ? getGatherScatterCost(&I, VF)
4681
4682 // Load: Scalar load + broadcast
4683 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
4684 // FIXME: This cost is a significant under-estimate for tail folded
4685 // memory ops.
4686 const InstructionCost ScalarizationCost =
4687 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
4689
4690 // Choose better solution for the current VF, Note that Invalid
4691 // costs compare as maximally large. If both are invalid, we get
4692 // scalable invalid which signals a failure and a vectorization abort.
4693 if (GatherScatterCost < ScalarizationCost)
4694 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
4695 else
4696 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
4697 continue;
4698 }
4699
4700 // We assume that widening is the best solution when possible.
4701 if (memoryInstructionCanBeWidened(&I, VF)) {
4702 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
4703 int ConsecutiveStride = Legal->isConsecutivePtr(
4705 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4706 "Expected consecutive stride.");
// Stride +1 is a plain wide access; stride -1 is a reversed wide access.
4707 InstWidening Decision =
4708 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
4709 setWideningDecision(&I, VF, Decision, Cost);
4710 continue;
4711 }
4712
4713 // Choose between Interleaving, Gather/Scatter or Scalarization.
4715 unsigned NumAccesses = 1;
4716 if (isAccessInterleaved(&I)) {
4717 const auto *Group = getInterleavedAccessGroup(&I);
4718 assert(Group && "Fail to get an interleaved access group.");
4719
4720 // Make one decision for the whole group.
4721 if (getWideningDecision(&I, VF) != CM_Unknown)
4722 continue;
4723
4724 NumAccesses = Group->getNumMembers();
4726 InterleaveCost = getInterleaveGroupCost(&I, VF);
4727 }
4728
// Gather/scatter and scalarization costs are per access, so they are scaled
// by the number of group members to compare against the group-wide
// interleave cost.
4729 InstructionCost GatherScatterCost =
4730 Config.isLegalGatherOrScatter(&I, VF)
4731 ? getGatherScatterCost(&I, VF) * NumAccesses
4733
4734 InstructionCost ScalarizationCost =
4735 getMemInstScalarizationCost(&I, VF) * NumAccesses;
4736
4737 // Choose better solution for the current VF,
4738 // write down this decision and use it during vectorization.
4740 InstWidening Decision;
// Tie-breaking: interleaving wins ties against gather/scatter but must be
// strictly cheaper than scalarization; gather/scatter must be strictly
// cheaper than scalarization.
4741 if (InterleaveCost <= GatherScatterCost &&
4742 InterleaveCost < ScalarizationCost) {
4743 Decision = CM_Interleave;
4744 Cost = InterleaveCost;
4745 } else if (GatherScatterCost < ScalarizationCost) {
4746 Decision = CM_GatherScatter;
4747 Cost = GatherScatterCost;
4748 } else {
4749 Decision = CM_Scalarize;
4750 Cost = ScalarizationCost;
4751 }
4752 // If the instruction belongs to an interleave group, the whole group
4753 // receives the same decision. The whole group receives the cost, but
4754 // the cost will actually be assigned to one instruction.
4755 if (const auto *Group = getInterleavedAccessGroup(&I)) {
4756 if (Decision == CM_Scalarize) {
// When scalarizing, each member is recorded with its own scalarization cost
// rather than the group total.
4757 for (Instruction *I : Group->members())
4758 setWideningDecision(I, VF, Decision,
4759 getMemInstScalarizationCost(I, VF));
4760 } else {
4761 setWideningDecision(Group, VF, Decision, Cost);
4762 }
4763 } else
4764 setWideningDecision(&I, VF, Decision, Cost);
4765 }
4766 }
4767
4768 // Make sure that any load of address and any other address computation
4769 // remains scalar unless there is gather/scatter support. This avoids
4770 // inevitable extracts into address registers, and also has the benefit of
4771 // activating LSR more, since that pass can't optimize vectorized
4772 // addresses.
4773 if (TTI.prefersVectorizedAddressing())
4774 return;
4775
4776 // Start with all scalar pointer uses.
4778 for (BasicBlock *BB : TheLoop->blocks())
4779 for (Instruction &I : *BB) {
4780 Instruction *PtrDef =
4782 if (PtrDef && TheLoop->contains(PtrDef) &&
4784 AddrDefs.insert(PtrDef);
4785 }
4786
4787 // Add all instructions used to generate the addresses.
// Worklist walk over operands: in-loop, non-phi instruction operands are
// added to AddrDefs and visited once each.
4789 append_range(Worklist, AddrDefs);
4790 while (!Worklist.empty()) {
4791 Instruction *I = Worklist.pop_back_val();
4792 for (auto &Op : I->operands())
4793 if (auto *InstOp = dyn_cast<Instruction>(Op))
4794 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
4795 AddrDefs.insert(InstOp).second)
4796 Worklist.push_back(InstOp);
4797 }
4798
4799 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
4800 // If there are direct memory op users of the newly scalarized load,
4801 // their cost may have changed because there's no scalarization
4802 // overhead for the operand. Update it.
// NOTE(review): the two conditions guarding the `continue` statements and the
// start of the call that consumes the recomputed cost are not visible in this
// listing.
4803 for (User *U : LI->users()) {
4805 continue;
4807 continue;
4810 getMemInstScalarizationCost(cast<Instruction>(U), VF));
4811 }
4812 };
4813 for (auto *I : AddrDefs) {
4814 if (isa<LoadInst>(I)) {
4815 // Setting the desired widening decision should ideally be handled
4816 // by cost functions, but since this involves the task of finding out
4817 // if the loaded register is involved in an address computation, it is
4818 // instead changed here when we know this is the case.
4819 InstWidening Decision = getWideningDecision(I, VF);
4820 if (!isPredicatedInst(I) &&
4821 (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
4822 (!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
4823 // Scalarize a widened load of address or update the cost of a scalar
4824 // load of an address.
// The new cost is the known-minimum VF times the scalar (VF = 1) memory
// instruction cost, i.e. without scalarization overhead.
4826 I, VF, CM_Scalarize,
4827 (VF.getKnownMinValue() *
4828 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
4829 UpdateMemOpUserCost(cast<LoadInst>(I));
4830 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
4831 // Scalarize all members of this interleaved group when any member
4832 // is used as an address. The address-used load skips scalarization
4833 // overhead, other members include it.
4834 for (Instruction *Member : Group->members()) {
4835 InstructionCost Cost = AddrDefs.contains(Member)
4836 ? (VF.getKnownMinValue() *
4837 getMemoryInstructionCost(
4838 Member, ElementCount::getFixed(1)))
4839 : getMemInstScalarizationCost(Member, VF);
4841 UpdateMemOpUserCost(cast<LoadInst>(Member));
4842 }
4843 }
4844 } else {
4845 // Cannot scalarize fixed-order recurrence phis at the moment.
4846 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
4847 continue;
4848
4849 // Make sure I gets scalarized and a cost estimate without
4850 // scalarization overhead.
4851 ForcedScalars[VF].insert(I);
4852 }
4853 }
4854}
4855
// Body of the routine that, for a vector VF, decides how each call in TheLoop
// is handled and records that decision and its cost via
// setCallWideningDecision. For every call it computes the scalarized cost
// (fixed VFs only), the cost of a matching vector function variant, and the
// intrinsic cost, then picks the cheapest; forced-scalar and uniform calls
// are always recorded as CM_Scalarize.
// NOTE(review): the signature line is not visible in this listing; the
// function is presumably LoopVectorizationCostModel::
// setVectorizedCallDecision(ElementCount VF) -- confirm. Several interior
// lines are missing as well (e.g. the definition of `CI`, the declarations of
// `ScalarCost`, `VectorCost`, `IntrinsicCost` and `IID`, the `case` labels of
// the parameter-kind switch, and parts of the fmuladd reduction special case).
4857 assert(!VF.isScalar() &&
4858 "Trying to set a vectorization decision for a scalar VF");
4859
4860 auto ForcedScalar = ForcedScalars.find(VF);
4861 for (BasicBlock *BB : TheLoop->blocks()) {
4862 // For each instruction in the old loop.
4863 for (Instruction &I : *BB) {
4865
// Instructions for which `CI` is null are skipped; presumably CI is the
// result of casting I to a call instruction -- confirm.
4866 if (!CI)
4867 continue;
4868
4872 Function *ScalarFunc = CI->getCalledFunction();
4873 Type *ScalarRetTy = CI->getType();
4874 SmallVector<Type *, 4> Tys, ScalarTys;
4875 for (auto &ArgOp : CI->args())
4876 ScalarTys.push_back(ArgOp->getType());
4877
4878 // Estimate cost of scalarized vector call. The source operands are
4879 // assumed to be vectors, so we need to extract individual elements from
4880 // there, execute VF scalar calls, and then gather the result into the
4881 // vector return value.
4882 if (VF.isFixed()) {
4883 InstructionCost ScalarCallCost = TTI.getCallInstrCost(
4884 ScalarFunc, ScalarRetTy, ScalarTys, Config.CostKind);
4885
4886 // Compute costs of unpacking argument values for the scalar calls and
4887 // packing the return values to a vector.
4888 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
4889 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
4890 } else {
4891 // There is no point attempting to calculate the scalar cost for a
4892 // scalable VF as we know it will be Invalid.
4893 assert(!getScalarizationOverhead(CI, VF).isValid() &&
4894 "Unexpected valid cost for scalarizing scalable vectors");
4895 ScalarCost = InstructionCost::getInvalid();
4896 }
4897
4898 // Honor ForcedScalars and UniformAfterVectorization decisions.
4899 // TODO: For calls, it might still be more profitable to widen. Use
4900 // VPlan-based cost model to compare different options.
4901 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
4902 ForcedScalar->second.contains(CI)) ||
4903 isUniformAfterVectorization(CI, VF))) {
4904 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
4905 Intrinsic::not_intrinsic, ScalarCost);
4906 continue;
4907 }
4908
4909 bool MaskRequired = isMaskRequired(CI);
4910 // Compute corresponding vector type for return value and arguments.
4911 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
4912 for (Type *ScalarTy : ScalarTys)
4913 Tys.push_back(toVectorizedTy(ScalarTy, VF));
4914
4915 // An in-loop reduction using an fmuladd intrinsic is a special case;
4916 // we don't want the normal cost for that intrinsic.
4918 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
4921 *RedCost);
4922 continue;
4923 }
4924
4925 // Find the cost of vectorizing the call, if we can find a suitable
4926 // vector variant of the function.
4927 VFInfo FuncInfo;
4928 Function *VecFunc = nullptr;
4929 // Search through any available variants for one we can use at this VF.
4930 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
4931 // Must match requested VF.
4932 if (Info.Shape.VF != VF)
4933 continue;
4934
4935 // Must take a mask argument if one is required
4936 if (MaskRequired && !Info.isMasked())
4937 continue;
4938
4939 // Check that all parameter kinds are supported
// NOTE(review): the `case` labels of this switch are not visible in this
// listing; the comments inside each arm describe what is being checked.
4940 bool ParamsOk = true;
4941 for (VFParameter Param : Info.Shape.Parameters) {
4942 switch (Param.ParamKind) {
4944 break;
4946 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
4947 // Make sure the scalar parameter in the loop is invariant.
4948 if (!PSE.getSE()->isSCEVable(ScalarParam->getType()) ||
4949 !PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
4950 TheLoop))
4951 ParamsOk = false;
4952 break;
4953 }
4955 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
4956 // Find the stride for the scalar parameter in this loop and see if
4957 // it matches the stride for the variant.
4958 // TODO: do we need to figure out the cost of an extract to get the
4959 // first lane? Or do we hope that it will be folded away?
4960 ScalarEvolution *SE = PSE.getSE();
4961 if (!SE->isSCEVable(ScalarParam->getType()) ||
4962 !match(SE->getSCEV(ScalarParam),
4964 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
4966 ParamsOk = false;
4967 break;
4968 }
4970 break;
// Any parameter kind not handled above rejects this variant.
4971 default:
4972 ParamsOk = false;
4973 break;
4974 }
4975 }
4976
4977 if (!ParamsOk)
4978 continue;
4979
4980 // Found a suitable candidate, stop here.
4981 VecFunc = CI->getModule()->getFunction(Info.VectorName);
4982 FuncInfo = Info;
4983 break;
4984 }
4985
// A vector-call cost is only computed when TLI is available, a vector
// function was found in the module, and the call is not marked nobuiltin.
4986 if (TLI && VecFunc && !CI->isNoBuiltin())
4987 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, Config.CostKind);
4988
4989 // Find the cost of an intrinsic; some targets may have instructions that
4990 // perform the operation without needing an actual call.
4992 if (IID != Intrinsic::not_intrinsic)
4994
// Start from scalarization and switch to a cheaper option. Both comparisons
// use <=, so on equal cost a vector call beats scalarization and an
// intrinsic call beats both.
4995 InstructionCost Cost = ScalarCost;
4996 InstWidening Decision = CM_Scalarize;
4997
4998 if (VectorCost.isValid() && VectorCost <= Cost) {
4999 Cost = VectorCost;
5000 Decision = CM_VectorCall;
5001 }
5002
5003 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
5005 Decision = CM_IntrinsicCall;
5006 }
5007
5008 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, Cost);
5009 }
5010 }
5011}
5012
// Body of shouldConsiderInvariant(Op): returns true if Op should be treated
// as loop-invariant. Legal must report Op as invariant; in addition, if Op is
// an instruction inside TheLoop, it must not be predicated, must not be a phi
// in the loop header, and all of its operands must recursively pass the same
// test.
// NOTE(review): the signature line is not visible in this listing; the name
// is taken from the recursive call below.
5014 if (!Legal->isInvariant(Op))
5015 return false;
5016 // Consider Op invariant, if it or its operands aren't predicated
5017 // instructions in the loop. In that case, it is not trivially hoistable.
5018 auto *OpI = dyn_cast<Instruction>(Op);
// Non-instructions and instructions outside the loop are accepted directly.
5019 return !OpI || !TheLoop->contains(OpI) ||
5020 (!isPredicatedInst(OpI) &&
5021 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5022 all_of(OpI->operands(),
5023 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5024}
5025
5028 ElementCount VF) {
5029 // If we know that this instruction will remain uniform, check the cost of
5030 // the scalar version.
5032 VF = ElementCount::getFixed(1);
5033
5034 if (VF.isVector() && isProfitableToScalarize(I, VF))
5035 return InstsToScalarize[VF][I];
5036
5037 // Forced scalars do not have any scalarization overhead.
5038 auto ForcedScalar = ForcedScalars.find(VF);
5039 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5040 auto InstSet = ForcedScalar->second;
5041 if (InstSet.count(I))
5043 VF.getKnownMinValue();
5044 }
5045
5046 const auto &MinBWs = Config.getMinimalBitwidths();
5047 uint64_t InstrMinBWs = MinBWs.lookup(I);
5048 Type *RetTy = I->getType();
5050 RetTy = IntegerType::get(RetTy->getContext(), InstrMinBWs);
5051 auto *SE = PSE.getSE();
5052
5053 Type *VectorTy;
5054 if (isScalarAfterVectorization(I, VF)) {
5055 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
5056 [this](Instruction *I, ElementCount VF) -> bool {
5057 if (VF.isScalar())
5058 return true;
5059
5060 auto Scalarized = InstsToScalarize.find(VF);
5061 assert(Scalarized != InstsToScalarize.end() &&
5062 "VF not yet analyzed for scalarization profitability");
5063 return !Scalarized->second.count(I) &&
5064 llvm::all_of(I->users(), [&](User *U) {
5065 auto *UI = cast<Instruction>(U);
5066 return !Scalarized->second.count(UI);
5067 });
5068 };
5069
5070 // With the exception of GEPs and PHIs, after scalarization there should
5071 // only be one copy of the instruction generated in the loop. This is
5072 // because the VF is either 1, or any instructions that need scalarizing
5073 // have already been dealt with by the time we get here. As a result,
5074 // it means we don't have to multiply the instruction cost by VF.
5075 assert(I->getOpcode() == Instruction::GetElementPtr ||
5076 I->getOpcode() == Instruction::PHI ||
5077 (I->getOpcode() == Instruction::BitCast &&
5078 I->getType()->isPointerTy()) ||
5079 HasSingleCopyAfterVectorization(I, VF));
5080 VectorTy = RetTy;
5081 } else
5082 VectorTy = toVectorizedTy(RetTy, VF);
5083
5084 if (VF.isVector() && VectorTy->isVectorTy() &&
5085 !TTI.getNumberOfParts(VectorTy))
5087
5088 // TODO: We need to estimate the cost of intrinsic calls.
5089 switch (I->getOpcode()) {
5090 case Instruction::GetElementPtr:
5091 // We mark this instruction as zero-cost because the cost of GEPs in
5092 // vectorized code depends on whether the corresponding memory instruction
5093 // is scalarized or not. Therefore, we handle GEPs with the memory
5094 // instruction cost.
5095 return 0;
5096 case Instruction::UncondBr:
5097 case Instruction::CondBr: {
5098 // In cases of scalarized and predicated instructions, there will be VF
5099 // predicated blocks in the vectorized loop. Each branch around these
5100 // blocks requires also an extract of its vector compare i1 element.
5101 // Note that the conditional branch from the loop latch will be replaced by
5102 // a single branch controlling the loop, so there is no extra overhead from
5103 // scalarization.
5104 bool ScalarPredicatedBB = false;
5106 if (VF.isVector() && BI &&
5107 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
5108 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
5109 BI->getParent() != TheLoop->getLoopLatch())
5110 ScalarPredicatedBB = true;
5111
5112 if (ScalarPredicatedBB) {
5113 // Not possible to scalarize scalable vector with predicated instructions.
5114 if (VF.isScalable())
5116 // Return cost for branches around scalarized and predicated blocks.
5117 auto *VecI1Ty =
5119 return (TTI.getScalarizationOverhead(
5120 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5121 /*Insert*/ false, /*Extract*/ true, Config.CostKind) +
5122 (TTI.getCFInstrCost(Instruction::CondBr, Config.CostKind) *
5123 VF.getFixedValue()));
5124 }
5125
5126 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
5127 // The back-edge branch will remain, as will all scalar branches.
5128 return TTI.getCFInstrCost(Instruction::UncondBr, Config.CostKind);
5129
5130 // This branch will be eliminated by if-conversion.
5131 return 0;
5132 // Note: We currently assume zero cost for an unconditional branch inside
5133 // a predicated block since it will become a fall-through, although we
5134 // may decide in the future to call TTI for all branches.
5135 }
5136 case Instruction::Switch: {
5137 if (VF.isScalar())
5138 return TTI.getCFInstrCost(Instruction::Switch, Config.CostKind);
5139 auto *Switch = cast<SwitchInst>(I);
5140 return Switch->getNumCases() *
5141 TTI.getCmpSelInstrCost(
5142 Instruction::ICmp,
5143 toVectorTy(Switch->getCondition()->getType(), VF),
5144 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
5145 CmpInst::ICMP_EQ, Config.CostKind);
5146 }
5147 case Instruction::PHI: {
5148 auto *Phi = cast<PHINode>(I);
5149
5150 // First-order recurrences are replaced by vector shuffles inside the loop.
5151 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
5152 return TTI.getShuffleCost(
5154 cast<VectorType>(VectorTy), {}, Config.CostKind, -1);
5155 }
5156
5157 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5158 // converted into select instructions. We require N - 1 selects per phi
5159 // node, where N is the number of incoming values.
5160 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
5161 Type *ResultTy = Phi->getType();
5162
5163 // All instructions in an Any-of reduction chain are narrowed to bool.
5164 // Check if that is the case for this phi node.
5165 auto *HeaderUser = cast_if_present<PHINode>(
5166 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
5167 auto *Phi = dyn_cast<PHINode>(U);
5168 if (Phi && Phi->getParent() == TheLoop->getHeader())
5169 return Phi;
5170 return nullptr;
5171 }));
5172 if (HeaderUser) {
5173 auto &ReductionVars = Legal->getReductionVars();
5174 auto Iter = ReductionVars.find(HeaderUser);
5175 if (Iter != ReductionVars.end() &&
5177 Iter->second.getRecurrenceKind()))
5178 ResultTy = Type::getInt1Ty(Phi->getContext());
5179 }
5180 return (Phi->getNumIncomingValues() - 1) *
5181 TTI.getCmpSelInstrCost(
5182 Instruction::Select, toVectorTy(ResultTy, VF),
5183 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
5184 CmpInst::BAD_ICMP_PREDICATE, Config.CostKind);
5185 }
5186
5187 // When tail folding with EVL, if the phi is part of an out of loop
5188 // reduction then it will be transformed into a wide vp_merge.
5189 if (VF.isVector() && foldTailWithEVL() &&
5190 Legal->getReductionVars().contains(Phi) &&
5191 !Config.isInLoopReduction(Phi)) {
5193 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
5194 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
5195 return TTI.getIntrinsicInstrCost(ICA, Config.CostKind);
5196 }
5197
5198 return TTI.getCFInstrCost(Instruction::PHI, Config.CostKind);
5199 }
5200 case Instruction::UDiv:
5201 case Instruction::SDiv:
5202 case Instruction::URem:
5203 case Instruction::SRem:
5204 if (VF.isVector() && isPredicatedInst(I)) {
5205 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
5206 return isDivRemScalarWithPredication(ScalarCost, MaskedCost) ? ScalarCost
5207 : MaskedCost;
5208 }
5209 // We've proven all lanes safe to speculate, fall through.
5210 [[fallthrough]];
5211 case Instruction::Add:
5212 case Instruction::Sub: {
5213 auto Info = Legal->getHistogramInfo(I);
5214 if (Info && VF.isVector()) {
5215 const HistogramInfo *HGram = Info.value();
5216 // Assume that a non-constant update value (or a constant != 1) requires
5217 // a multiply, and add that into the cost.
5219 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
5220 if (!RHS || RHS->getZExtValue() != 1)
5221 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
5222 Config.CostKind);
5223
5224 // Find the cost of the histogram operation itself.
5225 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
5226 Type *ScalarTy = I->getType();
5227 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
5228 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
5229 Type::getVoidTy(I->getContext()),
5230 {PtrTy, ScalarTy, MaskTy});
5231
5232 // Add the costs together with the add/sub operation.
5233 return TTI.getIntrinsicInstrCost(ICA, Config.CostKind) + MulCost +
5234 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy,
5235 Config.CostKind);
5236 }
5237 [[fallthrough]];
5238 }
5239 case Instruction::FAdd:
5240 case Instruction::FSub:
5241 case Instruction::Mul:
5242 case Instruction::FMul:
5243 case Instruction::FDiv:
5244 case Instruction::FRem:
5245 case Instruction::Shl:
5246 case Instruction::LShr:
5247 case Instruction::AShr:
5248 case Instruction::And:
5249 case Instruction::Or:
5250 case Instruction::Xor: {
5251 // If we're speculating on the stride being 1, the multiplication may
5252 // fold away. We can generalize this for all operations using the notion
5253 // of neutral elements. (TODO)
5254 if (I->getOpcode() == Instruction::Mul &&
5255 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
5256 PSE.getSCEV(I->getOperand(0))->isOne()) ||
5257 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
5258 PSE.getSCEV(I->getOperand(1))->isOne())))
5259 return 0;
5260
5261 // Detect reduction patterns
5262 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
5263 return *RedCost;
5264
5265 // Certain instructions can be cheaper to vectorize if they have a constant
5266 // second vector operand. One example of this are shifts on x86.
5267 Value *Op2 = I->getOperand(1);
5268 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
5269 PSE.getSE()->isSCEVable(Op2->getType()) &&
5270 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
5271 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
5272 }
5273 auto Op2Info = TTI.getOperandInfo(Op2);
5274 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
5277
5278 SmallVector<const Value *, 4> Operands(I->operand_values());
5279 return TTI.getArithmeticInstrCost(
5280 I->getOpcode(), VectorTy, Config.CostKind,
5281 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5282 Op2Info, Operands, I, TLI);
5283 }
5284 case Instruction::FNeg: {
5285 return TTI.getArithmeticInstrCost(
5286 I->getOpcode(), VectorTy, Config.CostKind,
5287 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5288 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5289 I->getOperand(0), I);
5290 }
5291 case Instruction::Select: {
5293 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5294 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5295
5296 const Value *Op0, *Op1;
5297 using namespace llvm::PatternMatch;
5298 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
5299 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
5300 // select x, y, false --> x & y
5301 // select x, true, y --> x | y
5302 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
5303 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
5304 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
5305 Op1->getType()->getScalarSizeInBits() == 1);
5306
5307 return TTI.getArithmeticInstrCost(
5308 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
5309 VectorTy, Config.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1},
5310 I);
5311 }
5312
5313 Type *CondTy = SI->getCondition()->getType();
5314 if (!ScalarCond)
5315 CondTy = VectorType::get(CondTy, VF);
5316
5318 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
5319 Pred = Cmp->getPredicate();
5320 return TTI.getCmpSelInstrCost(
5321 I->getOpcode(), VectorTy, CondTy, Pred, Config.CostKind,
5322 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
5323 }
5324 case Instruction::ICmp:
5325 case Instruction::FCmp: {
5326 Type *ValTy = I->getOperand(0)->getType();
5327
5329 [[maybe_unused]] Instruction *Op0AsInstruction =
5330 dyn_cast<Instruction>(I->getOperand(0));
5331 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
5332 InstrMinBWs == MinBWs.lookup(Op0AsInstruction)) &&
5333 "if both the operand and the compare are marked for "
5334 "truncation, they must have the same bitwidth");
5335 ValTy = IntegerType::get(ValTy->getContext(), InstrMinBWs);
5336 }
5337
5338 VectorTy = toVectorTy(ValTy, VF);
5339 return TTI.getCmpSelInstrCost(
5340 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
5341 cast<CmpInst>(I)->getPredicate(), Config.CostKind,
5342 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
5343 }
5344 case Instruction::Store:
5345 case Instruction::Load: {
5346 ElementCount Width = VF;
5347 if (Width.isVector()) {
5348 InstWidening Decision = getWideningDecision(I, Width);
5349 assert(Decision != CM_Unknown &&
5350 "CM decision should be taken at this point");
5353 if (Decision == CM_Scalarize)
5354 Width = ElementCount::getFixed(1);
5355 }
5356 VectorTy = toVectorTy(getLoadStoreType(I), Width);
5357 return getMemoryInstructionCost(I, VF);
5358 }
5359 case Instruction::BitCast:
5360 if (I->getType()->isPointerTy())
5361 return 0;
5362 [[fallthrough]];
5363 case Instruction::ZExt:
5364 case Instruction::SExt:
5365 case Instruction::FPToUI:
5366 case Instruction::FPToSI:
5367 case Instruction::FPExt:
5368 case Instruction::PtrToInt:
5369 case Instruction::IntToPtr:
5370 case Instruction::SIToFP:
5371 case Instruction::UIToFP:
5372 case Instruction::Trunc:
5373 case Instruction::FPTrunc: {
5374 // Computes the CastContextHint from a Load/Store instruction.
5375 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
5377 "Expected a load or a store!");
5378
5379 if (VF.isScalar() || !TheLoop->contains(I))
5381
5382 switch (getWideningDecision(I, VF)) {
5394 llvm_unreachable("Instr did not go through cost modelling?");
5397 llvm_unreachable_internal("Instr has invalid widening decision");
5400 }
5401
5402 llvm_unreachable("Unhandled case!");
5403 };
5404
5405 unsigned Opcode = I->getOpcode();
5407 // For Trunc, the context is the only user, which must be a StoreInst.
5408 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
5409 if (I->hasOneUse())
5410 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
5411 CCH = ComputeCCH(Store);
5412 }
5413 // For Z/Sext, the context is the operand, which must be a LoadInst.
5414 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
5415 Opcode == Instruction::FPExt) {
5416 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
5417 CCH = ComputeCCH(Load);
5418 }
5419
5420 // We optimize the truncation of induction variables having constant
5421 // integer steps. The cost of these truncations is the same as the scalar
5422 // operation.
5423 if (isOptimizableIVTruncate(I, VF)) {
5424 auto *Trunc = cast<TruncInst>(I);
5425 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
5426 Trunc->getSrcTy(), CCH, Config.CostKind,
5427 Trunc);
5428 }
5429
5430 // Detect reduction patterns
5431 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
5432 return *RedCost;
5433
5434 Type *SrcScalarTy = I->getOperand(0)->getType();
5435 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
5436 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5437 SrcScalarTy = IntegerType::get(SrcScalarTy->getContext(),
5438 MinBWs.lookup(Op0AsInstruction));
5439 Type *SrcVecTy =
5440 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
5441
5443 // If the result type is <= the source type, there will be no extend
5444 // after truncating the users to the minimal required bitwidth.
5445 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
5446 (I->getOpcode() == Instruction::ZExt ||
5447 I->getOpcode() == Instruction::SExt))
5448 return 0;
5449 }
5450
5451 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH,
5452 Config.CostKind, I);
5453 }
5454 case Instruction::Call:
5455 return getVectorCallCost(cast<CallInst>(I), VF);
5456 case Instruction::ExtractValue:
5457 return TTI.getInstructionCost(I, Config.CostKind);
5458 case Instruction::Alloca:
5459 // We cannot easily widen alloca to a scalable alloca, as
5460 // the result would need to be a vector of pointers.
5461 if (VF.isScalable())
5463 return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, Config.CostKind);
5464 case Instruction::Freeze:
5465 return TTI::TCC_Free;
5466 default:
5467 // This opcode is unknown. Assume that it is the same as 'mul'.
5468 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
5469 Config.CostKind);
5470 } // end of switch.
5471}
5472
// NOTE(review): the embedded listing numbers jump from 5472 to 5474 here, so
// the line that opens this definition is absent from this extract. The call
// `CM.collectValuesToIgnore()` in LoopVectorizationPlanner::plan suggests this
// is that member's body -- confirm against the complete file. Further gaps in
// the numbering (5475, 5478, 5489, 5498, 5547, 5576) mark other lines that are
// not shown; the comments added below describe only the visible code.
//
// Visible behavior: fills ValuesToIgnore (applies to scalar and vector
// versions) and VecValuesToIgnore (vector versions only) in four steps:
//   1. scan the loop blocks to seed the DeadOps and DeadInterleavePointerOps
//      worklists;
//   2. propagate through operands feeding interleave-group members that are
//      not the group's insert position;
//   3. propagate through DeadOps, with special handling for conditional
//      branches;
//   4. add the cast instructions recorded for reductions and inductions.
5474 // Ignore ephemeral values.
5476
5477 SmallVector<Value *, 4> DeadInterleavePointerOps;
5479
5480 // If a scalar epilogue is required, users outside the loop won't use
5481 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
5482 // that is the case.
5483 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
5484 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
5485 return RequiresScalarEpilogue &&
5486 !TheLoop->contains(cast<Instruction>(U)->getParent());
5487 };
5488
// Step 1: blocks are visited in the reverse of the DFS's RPO and instructions
// last-to-first (presumably so users are visited before their operands).
5490 DFS.perform(LI);
5491 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
5492 for (Instruction &I : reverse(*BB)) {
5493 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
5494 continue;
5495
5496 // Add instructions that would be trivially dead and are only used by
5497 // values already ignored to DeadOps to seed worklist.
5499 all_of(I.users(), [this, IsLiveOutDead](User *U) {
5500 return VecValuesToIgnore.contains(U) ||
5501 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
5502 }))
5503 DeadOps.push_back(&I);
5504
5505 // For interleave groups, we only create a pointer for the start of the
5506 // interleave group. Queue up addresses of group members except the insert
5507 // position for further processing.
5508 if (isAccessInterleaved(&I)) {
5509 auto *Group = getInterleavedAccessGroup(&I);
5510 if (Group->getInsertPos() == &I)
5511 continue;
5512 Value *PointerOp = getLoadStorePointerOperand(&I);
5513 DeadInterleavePointerOps.push_back(PointerOp);
5514 }
5515
5516 // Queue branches for analysis. They are dead, if their successors only
5517 // contain dead instructions.
5518 if (isa<CondBrInst>(&I))
5519 DeadOps.push_back(&I);
5520 }
5521
5522 // Mark ops feeding interleave group members as free, if they are only used
5523 // by other dead computations.
// Step 2: index-based loop because the worklist grows while it is traversed
// (see the append_range call at the end of the body).
5524 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
5525 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
// Skip non-instructions, values outside the loop, and ops with a user that is
// neither already ignored nor a non-insert-position interleave-group member.
5526 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
5527 Instruction *UI = cast<Instruction>(U);
5528 return !VecValuesToIgnore.contains(U) &&
5529 (!isAccessInterleaved(UI) ||
5530 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
5531 }))
5532 continue;
5533 VecValuesToIgnore.insert(Op);
5534 append_range(DeadInterleavePointerOps, Op->operands());
5535 }
5536
5537 // Mark ops that would be trivially dead and are only used by ignored
5538 // instructions as free.
5539 BasicBlock *Header = TheLoop->getHeader();
5540
5541 // Returns true if the block contains only dead instructions. Such blocks will
5542 // be removed by VPlan-to-VPlan transforms and won't be considered by the
5543 // VPlan-based cost model, so skip them in the legacy cost-model as well.
5544 auto IsEmptyBlock = [this](BasicBlock *BB) {
5545 return all_of(*BB, [this](Instruction &I) {
5546 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
5548 });
5549 };
// Step 3: DeadOps also grows during traversal, hence the index-based loop.
5550 for (unsigned I = 0; I != DeadOps.size(); ++I) {
5551 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
5552
5553 // Check if the branch should be considered dead.
5554 if (auto *Br = dyn_cast_or_null<CondBrInst>(Op)) {
5555 BasicBlock *ThenBB = Br->getSuccessor(0);
5556 BasicBlock *ElseBB = Br->getSuccessor(1);
5557 // Don't consider branches leaving the loop for simplification.
5558 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
5559 continue;
5560 bool ThenEmpty = IsEmptyBlock(ThenBB);
5561 bool ElseEmpty = IsEmptyBlock(ElseBB);
// The branch is ignored when both successors are empty, or when one empty
// successor leads only to the other successor and that one has no phis.
5562 if ((ThenEmpty && ElseEmpty) ||
5563 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
5564 ElseBB->phis().empty()) ||
5565 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
5566 ThenBB->phis().empty())) {
5567 VecValuesToIgnore.insert(Br);
// The branch condition becomes a candidate for being dead as well.
5568 DeadOps.push_back(Br->getCondition());
5569 }
5570 continue;
5571 }
5572
5573 // Skip any op that shouldn't be considered dead.
5574 if (!Op || !TheLoop->contains(Op) ||
5575 (isa<PHINode>(Op) && Op->getParent() == Header) ||
5577 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
5578 return !VecValuesToIgnore.contains(U) &&
5579 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
5580 }))
5581 continue;
5582
5583 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
5584 // which applies for both scalar and vector versions. Otherwise it is only
5585 // dead in vector versions, so only add it to VecValuesToIgnore.
5586 if (all_of(Op->users(),
5587 [this](User *U) { return ValuesToIgnore.contains(U); }))
5588 ValuesToIgnore.insert(Op);
5589
// Inserted unconditionally, i.e. also for ops just added to ValuesToIgnore.
5590 VecValuesToIgnore.insert(Op);
5591 append_range(DeadOps, Op->operands());
5592 }
5593
// Step 4: casts recorded by reduction and induction detection.
5594 // Ignore type-promoting instructions we identified during reduction
5595 // detection.
5596 for (const auto &Reduction : Legal->getReductionVars()) {
5597 const RecurrenceDescriptor &RedDes = Reduction.second;
5598 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
5599 VecValuesToIgnore.insert_range(Casts);
5600 }
5601 // Ignore type-casting instructions we identified during induction
5602 // detection.
5603 for (const auto &Induction : Legal->getInductionVars()) {
5604 const InductionDescriptor &IndDes = Induction.second;
5605 VecValuesToIgnore.insert_range(IndDes.getCastInsts());
5606 }
5607}
5608
// Prepares the cost model and builds the candidate VPlans for OrigLoop.
//
// UserVF: vectorization factor requested by the user. When it is non-zero and
//   known to be no larger than the maximal safe VF of its kind, plans are
//   built for it first and the function returns early if UserVF is scalar or
//   the resulting plan has a valid cost.
// UserIC: user-requested interleave count; in the visible code it is only
//   forwarded to CM.computeMaxVF.
//
// Returns nothing. In the visible code the observable results are the state
// set up on CM/Config/Legal and the contents of VPlans (presumably filled by
// buildVPlans -- confirm).
//
// NOTE(review): the embedded listing numbers skip 5623, 5633, 5652, 5656,
// 5663, 5676 and 5704, so the corresponding source lines are missing from this
// extract; the comments below describe only the code that is visible.
5609void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
5610 CM.collectValuesToIgnore();
5611 Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
5612
5613 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
5614 if (!MaxFactors) // Cases that should be neither vectorized nor interleaved.
5615 return;
5616
5617 if (!OrigLoop->isInnermost()) {
5618 // For outer loops, computeMaxVF returns a single non-scalar VF; build a
5619 // plan for only that VF.
5620 ElementCount VF =
5621 MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF;
5622 buildVPlans(VF, VF);
5624 return;
5625 }
5626
5627 // Compute the minimal bitwidths required for integer operations in the loop
5628 // for later use by the cost model.
5629 Config.computeMinimalBitwidths();
5630
5631 // Invalidate interleave groups if all blocks of loop will be predicated.
// NOTE(review): the second half of this condition (listing line 5633) is not
// shown in this extract.
5632 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
5634 LLVM_DEBUG(
5635 dbgs()
5636 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
5637 "which requires masked-interleaved support.\n");
5638 if (CM.InterleaveInfo.invalidateGroups())
5639 // Invalidating interleave groups also requires invalidating all decisions
5640 // based on them, which includes widening decisions and uniform and scalar
5641 // values.
5642 CM.invalidateCostModelingDecisions();
5643 }
5644
5645 if (CM.foldTailByMasking())
5646 Legal->prepareToFoldTailByMasking();
5647
// Compare the user VF against the maximum of the same kind (scalable/fixed).
5648 ElementCount MaxUserVF =
5649 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
5650 if (UserVF) {
5651 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
5653 "UserVF ignored because it may be larger than the maximal safe VF",
5654 "InvalidUserVF", ORE, OrigLoop);
5655 } else {
5657 "VF needs to be a power of two");
5658 // Collect the instructions (and their associated costs) that will be more
5659 // profitable to scalarize.
5660 Config.collectInLoopReductions();
5661 CM.collectNonVectorizedAndSetWideningDecisions(UserVF);
// NOTE(review): the initializer of EpilogueUserVF (listing line 5663) is not
// shown in this extract.
5662 ElementCount EpilogueUserVF =
5664 if (EpilogueUserVF.isVector() &&
5665 ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
5666 CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
5667 buildVPlans(EpilogueUserVF, EpilogueUserVF);
5668 }
5669 buildVPlans(UserVF, UserVF);
5670 if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) {
5671 // For scalar VF, skip VPlan cost check as VPlan cost is designed for
5672 // vector VFs only.
5673 if (UserVF.isScalar() ||
5674 cost(*VPlans.back(), UserVF, /*RU=*/nullptr).isValid()) {
5675 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
5677 return;
5678 }
5679 }
// The user VF could not be used: drop every plan built so far and fall through
// to the regular candidate collection below.
5680 VPlans.clear();
5681 reportVectorizationInfo("UserVF ignored because of invalid costs.",
5682 "InvalidCost", ORE, OrigLoop);
5683 }
5684 }
5685
5686 // Collect the Vectorization Factor Candidates.
// Candidates are 1, 2, 4, ... for fixed and for scalable VFs, each up to the
// respective maximum returned by computeMaxVF.
5687 SmallVector<ElementCount> VFCandidates;
5688 for (auto VF = ElementCount::getFixed(1);
5689 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
5690 VFCandidates.push_back(VF);
5691 for (auto VF = ElementCount::getScalable(1);
5692 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
5693 VFCandidates.push_back(VF);
5694
5695 Config.collectInLoopReductions();
5696 for (const auto &VF : VFCandidates) {
5697 // Collect Uniform and Scalar instructions after vectorization with VF.
5698 CM.collectNonVectorizedAndSetWideningDecisions(VF);
5699 }
5700
// Build plans for the fixed and the scalable VF ranges separately.
5701 buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF);
5702 buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF);
5703
5705}
5706
// NOTE(review): listing lines 5707 (start of the signature) and 5711 (the
// statement controlled by the `if` below) are missing from this extract. The
// `CostCtx.getLegacyCost(..., VF)` calls elsewhere in this file suggest this is
// VPCostContext::getLegacyCost -- confirm against the complete file.
//
// Visible behavior: queries the legacy cost model (CM.getInstructionCost) for
// UI at VF and returns that cost, unless the branch taken when the cost is
// valid and ForceTargetInstructionCost was given on the command line returns
// something else (presumably the forced cost -- not visible here).
5708 ElementCount VF) const {
5709 InstructionCost Cost = CM.getInstructionCost(UI, VF);
5710 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
5712 return Cost;
5713}
5714
5715bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
5716 return CM.ValuesToIgnore.contains(UI) ||
5717 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
5718 SkipCostComputation.contains(UI);
5719}
5720
5726
// NOTE(review): the line that opens this definition (listing line 5727) is
// missing from this extract, so its name and parameters cannot be confirmed
// here. The visible body forwards CostKind and BB to
// CM.getPredBlockCostDivisor and returns the result unchanged.
5728 return CM.getPredBlockCostDivisor(CostKind, BB);
5729}
5730
// NOTE(review): the line that opens this definition (listing line 5731) is
// missing from this extract, so its name and parameters cannot be confirmed
// here. The visible body returns true when the cost model reports that I, at
// VF, is scalar with predication, uniform after vectorization, or forced
// scalar, or -- for vector VFs only -- profitable to scalarize.
5732 return CM.isScalarWithPredication(I, VF) ||
5733 CM.isUniformAfterVectorization(I, VF) || CM.isForcedScalar(I, VF) ||
5734 (VF.isVector() && CM.isProfitableToScalarize(I, VF));
5735}
5736
// NOTE(review): the line that opens this definition (listing line 5737) is
// missing from this extract, so its name and parameters cannot be confirmed
// here. The visible body forwards I to CM.isMaskRequired and returns the
// result unchanged.
5738 return CM.isMaskRequired(I);
5739}
5740
// NOTE(review): listing lines 5742 (name and parameters), 5744 (the statement
// controlled by the scalar-VF check) and 5746-5751 (the explicit switch cases)
// are missing from this extract, so this definition cannot be described
// completely here.
//
// Visible behavior: returns an optional VPCostContext::CallWideningKind chosen
// from the Kind of CM.getCallWideningDecision(CI, VF); decision kinds without
// an explicit case yield std::nullopt.
5741std::optional<VPCostContext::CallWideningKind>
5743 if (VF.isScalar())
5745 switch (CM.getCallWideningDecision(CI, VF).Kind) {
5752 default:
5753 return std::nullopt;
5754 }
5755}
5756
// Pre-computes, with the legacy cost model, the part of the cost of Plan at VF
// that is not taken from the VPlan recipes, and marks every instruction costed
// here in CostCtx.SkipCostComputation so it is not counted again.
//
// Plan:    the VPlan being costed (not referenced by the visible code).
// VF:      the vectorization factor the costs are computed for.
// CostCtx: cost context; its SkipCostComputation set is extended.
// Returns the accumulated Cost; the caller stores it in an InstructionCost.
//
// Costs are accumulated in this order: induction-related instructions, loop
// exit conditions and the instructions only feeding them, branches other than
// the latch's, and -- unless ForceTargetInstructionCost was specified --
// forced-scalar instructions and instructions that are profitable to
// scalarize.
//
// NOTE(review): the embedded listing numbers skip 5757 (presumably the return
// type), 5760 (presumably the declaration of Cost), 5774 and 5820, so those
// source lines are missing from this extract.
5758LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
5759 VPCostContext &CostCtx) const {
5761 // Cost modeling for inductions is inaccurate in the legacy cost model
5762 // compared to the recipes that are generated. To match here initially during
5763 // VPlan cost model bring up directly use the induction costs from the legacy
5764 // cost model. Note that we do this as pre-processing; the VPlan may not have
5765 // any recipes associated with the original induction increment instruction
5766 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
5767 // the cost of induction phis and increments (both that are represented by
5768 // recipes and those that are not), to avoid distinguishing between them here,
5769 // and skip all recipes that represent induction phis and increments (the
5770 // former case) later on, if they exist, to avoid counting them twice.
5771 // Similarly we pre-compute the cost of any optimized truncates.
5772 // TODO: Switch to more accurate costing based on VPlan.
5773 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
// NOTE(review): the start of the IVInc declaration (listing line 5774) is not
// shown; IVInc is derived from IV's incoming value from the loop latch.
5775 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
5776 SmallVector<Instruction *> IVInsts = {IVInc};
// Collect in-loop, single-use instruction operands reachable from the
// increment, stopping at the induction phi itself. The list grows while it is
// traversed, hence the index-based loop.
5777 for (unsigned I = 0; I != IVInsts.size(); I++) {
5778 for (Value *Op : IVInsts[I]->operands()) {
5779 auto *OpI = dyn_cast<Instruction>(Op);
5780 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
5781 continue;
5782 IVInsts.push_back(OpI);
5783 }
5784 }
5785 IVInsts.push_back(IV);
// Also cost users of the phi that are optimizable IV truncates for this VF.
5786 for (User *U : IV->users()) {
5787 auto *CI = cast<Instruction>(U);
5788 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
5789 continue;
5790 IVInsts.push_back(CI);
5791 }
5792
5793 // If the vector loop gets executed exactly once with the given VF, ignore
5794 // the costs of comparison and induction instructions, as they'll get
5795 // simplified away.
5796 // TODO: Remove this code after stepping away from the legacy cost model and
5797 // adding code to simplify VPlans before calculating their costs.
5798 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
5799 if (TC == VF && !CM.foldTailByMasking())
5800 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
5801 CostCtx.SkipCostComputation);
5802
5803 for (Instruction *IVInst : IVInsts) {
5804 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
5805 continue;
5806 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
5807 LLVM_DEBUG({
5808 dbgs() << "Cost of " << InductionCost << " for VF " << VF
5809 << ": induction instruction " << *IVInst << "\n";
5810 });
5811 Cost += InductionCost;
5812 CostCtx.SkipCostComputation.insert(IVInst);
5813 }
5814 }
5815
5816 /// Compute the cost of all exiting conditions of the loop using the legacy
5817 /// cost model. This is to match the legacy behavior, which adds the cost of
5818 /// all exit conditions. Note that this over-estimates the cost, as there will
5819 /// be a single condition to control the vector loop.
// NOTE(review): the declaration of Exiting (listing line 5820) is not shown.
5821 CM.TheLoop->getExitingBlocks(Exiting);
5822 SetVector<Instruction *> ExitInstrs;
5823 // Collect all exit conditions.
5824 for (BasicBlock *EB : Exiting) {
5825 auto *Term = dyn_cast<CondBrInst>(EB->getTerminator());
5826 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
5827 continue;
5828 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
5829 ExitInstrs.insert(CondI);
5830 }
5831 }
5832 // Compute the cost of all instructions only feeding the exit conditions.
5833 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
5834 Instruction *CondI = ExitInstrs[I];
// The insert() doubles as the "already handled" check: instructions that were
// in SkipCostComputation before are not costed here.
5835 if (!OrigLoop->contains(CondI) ||
5836 !CostCtx.SkipCostComputation.insert(CondI).second)
5837 continue;
5838 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
5839 LLVM_DEBUG({
5840 dbgs() << "Cost of " << CondICost << " for VF " << VF
5841 << ": exit condition instruction " << *CondI << "\n";
5842 });
5843 Cost += CondICost;
// Queue operands whose users are all already part of ExitInstrs.
5844 for (Value *Op : CondI->operands()) {
5845 auto *OpI = dyn_cast<Instruction>(Op);
5846 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
5847 any_of(OpI->users(), [&ExitInstrs](User *U) {
5848 return !ExitInstrs.contains(cast<Instruction>(U));
5849 }))
5850 continue;
5851 ExitInstrs.insert(OpI);
5852 }
5853 }
5854
5855 // Pre-compute the costs for branches except for the backedge, as the number
5856 // of replicate regions in a VPlan may not directly match the number of
5857 // branches, which would lead to different decisions.
5858 // TODO: Compute cost of branches for each replicate region in the VPlan,
5859 // which is more accurate than the legacy cost model.
5860 for (BasicBlock *BB : OrigLoop->blocks()) {
5861 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
5862 continue;
// The latch terminator is recorded as handled as well, but no cost is added
// for it.
5863 CostCtx.SkipCostComputation.insert(BB->getTerminator());
5864 if (BB == OrigLoop->getLoopLatch())
5865 continue;
5866 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
5867 Cost += BranchCost;
5868 }
5869
5870 // Don't apply special costs when instruction cost is forced to make sure the
5871 // forced cost is used for each recipe.
5872 if (ForceTargetInstructionCost.getNumOccurrences())
5873 return Cost;
5874
5875 // Pre-compute costs for instructions that are forced-scalar or profitable to
5876 // scalarize. For most such instructions, their scalarization costs are
5877 // accounted for here using the legacy cost model. However, some opcodes
5878 // are excluded from these precomputed scalarization costs and are instead
5879 // modeled later by the VPlan cost model (see UseVPlanCostModel below).
5880 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
5881 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
5882 continue;
5883 CostCtx.SkipCostComputation.insert(ForcedScalar);
5884 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
5885 LLVM_DEBUG({
5886 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
5887 << ": forced scalar " << *ForcedScalar << "\n";
5888 });
5889 Cost += ForcedCost;
5890 }
5891
// Integer division and remainder are left to the VPlan-based cost model.
5892 auto UseVPlanCostModel = [](Instruction *I) -> bool {
5893 switch (I->getOpcode()) {
5894 case Instruction::SDiv:
5895 case Instruction::UDiv:
5896 case Instruction::SRem:
5897 case Instruction::URem:
5898 return true;
5899 default:
5900 return false;
5901 }
5902 };
// For the remaining instructions, the scalarization cost already stored in
// CM.InstsToScalarize is added as-is.
5903 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
5904 if (UseVPlanCostModel(Scalarized) ||
5905 CostCtx.skipCostComputation(Scalarized, VF.isVector()))
5906 continue;
5907 CostCtx.SkipCostComputation.insert(Scalarized);
5908 LLVM_DEBUG({
5909 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
5910 << ": profitable to scalarize " << *Scalarized << "\n";
5911 });
5912 Cost += ScalarCost;
5913 }
5914
5915 return Cost;
5916}
5917
5918InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
5919 VPRegisterUsage *RU) const {
5920 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, Config.CostKind, PSE,
5921 OrigLoop);
5922 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
5923
5924 // Now compute and add the VPlan-based cost.
5925 Cost += Plan.cost(VF, CostCtx);
5926
5927 // Add the cost of spills due to excess register usage
5928 if (RU && Config.shouldConsiderRegPressureForVF(VF))
5929 Cost += RU->spillCost(CM.TTI, Config.CostKind, ForceTargetNumVectorRegs);
5930
5931#ifndef NDEBUG
5932 unsigned EstimatedWidth =
5933 estimateElementCount(VF, Config.getVScaleForTuning());
5934 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
5935 << " (Estimated cost per lane: ");
5936 if (Cost.isValid()) {
5937 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
5938 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
5939 } else /* No point dividing an invalid cost - it will still be invalid */
5940 LLVM_DEBUG(dbgs() << "Invalid");
5941 LLVM_DEBUG(dbgs() << ")\n");
5942#endif
5943 return Cost;
5944}
5945
// NOTE(review): the line carrying this definition's name and parameter list
// (listing line 5947) is missing from this extract; the debug output below
// ("LV: Computing best VF ...") suggests it is the planner's best-VF selection
// routine -- confirm against the complete file. Listing lines 5966, 5983,
// 6007, 6011 and 6034 are missing as well.
//
// Visible behavior: returns the selected vectorization factor together with
// the VPlan it belongs to.
//   - No plans: {VectorizationFactor::Disabled(), nullptr}.
//   - Exactly one plan: that plan with its single VF and zero costs.
//   - A plan for the user VF exists and EpilogueVectorizationForceVF > 1: the
//     user VF with the second plan.
//   - Otherwise: every non-scalar VF of every plan is costed and compared with
//     isMoreProfitable against the best factor so far, which starts out as the
//     scalar loop (with maximal cost when vectorization is forced). VFs that
//     also beat the scalar loop are appended to ProfitableVFs.
5946std::pair<VectorizationFactor, VPlan *>
5948 if (VPlans.empty())
5949 return {VectorizationFactor::Disabled(), nullptr};
5950 // If there is a single VPlan with a single VF, return it directly.
5951 VPlan &FirstPlan = *VPlans[0];
5952
5953 ElementCount UserVF = Hints.getWidth();
5954 if (VPlans.size() == 1) {
5955 // For outer loops, the plan has a single vector VF determined by the
5956 // heuristic.
5957 assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) ||
5958 FirstPlan.isOuterLoop()) &&
5959 "must have a single scalar VF, UserVF or an outer loop");
5960 return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
5961 }
5962
// Forced user VF together with a forced epilogue VF: two single-VF plans are
// expected, the epilogue one first.
5963 if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) {
5964 assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
5965 assert(VPlans[0]->getSingleVF() ==
5967 "expected first plan to be for the forced epilogue VF");
5968 assert(VPlans[1]->getSingleVF() == UserVF &&
5969 "expected second plan to be for the forced UserVF");
5970 return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
5971 }
5972
5973 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
5974 << (Config.CostKind == TTI::TCK_RecipThroughput
5975 ? "Reciprocal Throughput\n"
5976 : Config.CostKind == TTI::TCK_Latency
5977 ? "Instruction Latency\n"
5978 : Config.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
5979 : Config.CostKind == TTI::TCK_SizeAndLatency
5980 ? "Code Size and Latency\n"
5981 : "Unknown\n"));
5982
// NOTE(review): the definition of ScalarVF (listing line 5983) is not shown.
5984 assert(FirstPlan.hasVF(ScalarVF) &&
5985 "More than a single plan/VF w/o any plan having scalar VF");
5986
5987 // TODO: Compute scalar cost using VPlan-based cost model.
5988 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
5989 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
5990 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
5991 VectorizationFactor BestFactor = ScalarFactor;
5992
5993 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5994 if (ForceVectorization) {
5995 // Ignore scalar width, because the user explicitly wants vectorization.
5996 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5997 // evaluation.
5998 BestFactor.Cost = InstructionCost::getMax();
5999 }
6000
6001 VPlan *PlanForBestVF = &FirstPlan;
6002
6003 for (auto &P : VPlans) {
6004 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
6005 P->vectorFactors().end());
6006
// NOTE(review): the declaration of RUs (listing line 6007) and the statement
// guarding its computation (listing line 6011) are not shown.
6008 bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
6009 return Config.shouldConsiderRegPressureForVF(VF);
6010 });
6012 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
6013
6014 for (unsigned I = 0; I < VFs.size(); I++) {
6015 ElementCount VF = VFs[I];
// The scalar VF is already represented by ScalarFactor.
6016 if (VF.isScalar())
6017 continue;
6018 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
6019 LLVM_DEBUG(
6020 dbgs()
6021 << "LV: Not considering vector loop of width " << VF
6022 << " because it will not generate any vector instructions.\n");
6023 continue;
6024 }
6025 if (Config.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
6026 LLVM_DEBUG(
6027 dbgs()
6028 << "LV: Not considering vector loop of width " << VF
6029 << " because it would cause replicated blocks to be generated,"
6030 << " which isn't allowed when optimizing for size.\n");
6031 continue;
6032 }
6033
// NOTE(review): the start of this statement (listing line 6034) is not shown;
// its result is used as Cost below. Register usage for this VF is passed only
// when register pressure is considered for the plan.
6035 cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
6036 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
6037
6038 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
6039 BestFactor = CurrentFactor;
6040 PlanForBestVF = P.get();
6041 }
6042
6043 // If profitable add it to ProfitableVF list.
6044 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
6045 ProfitableVFs.push_back(CurrentFactor);
6046 }
6047 }
6048
6049 VPlan &BestPlan = *PlanForBestVF;
6050
6051 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
6052 "when vectorizing, the scalar cost must be computed.");
6053
6054 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
6055 return {BestFactor, &BestPlan};
6056}
6057
6059 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
6061 EpilogueVectorizationKind EpilogueVecKind) {
6062 assert(BestVPlan.hasVF(BestVF) &&
6063 "Trying to execute plan with unsupported VF");
6064 assert(BestVPlan.hasUF(BestUF) &&
6065 "Trying to execute plan with unsupported UF");
// Bump the LoopsEarlyExitVectorized counter when the plan has an early exit.
6066 if (BestVPlan.hasEarlyExit())
6067 ++LoopsEarlyExitVectorized;
6068
6070 BestVPlan, *PSE.getSE(), CM.TTI, Config.CostKind, BestVF, BestUF,
6071 CM.ValuesToIgnore);
6072 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
6073 // cost model is complete for better cost estimates.
6074 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
// The vscale-for-tuning value is only queried when the terminator of the
// original loop latch carries branch-weight metadata.
6078 bool HasBranchWeights =
6079 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
6080 if (HasBranchWeights) {
6081 std::optional<unsigned> VScale = Config.getVScaleForTuning();
6083 BestVPlan, BestVF, VScale);
6084 }
6085
6086 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
6087 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
6088
6090 BestVF, BestUF, PSE);
6091 RUN_VPLAN_PASS(VPlanTransforms::optimizeForVFAndUF, BestVPlan, BestVF, BestUF,
6092 PSE);
6094 if (EpilogueVecKind == EpilogueVectorizationKind::None)
6096 /*OnlyLatches=*/false);
6097 if (BestVPlan.getEntry()->getSingleSuccessor() ==
6098 BestVPlan.getScalarPreheader()) {
6099 // TODO: The vector loop would be dead, should not even try to vectorize.
6100 ORE->emit([&]() {
6101 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
6102 OrigLoop->getStartLoc(),
6103 OrigLoop->getHeader())
6104 << "Created vector loop never executes due to insufficient trip "
6105 "count.";
6106 });
6108 }
6109
6111
6113 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
6115 // Regions are dissolved after optimizing for VF and UF, which completely
6116 // removes unneeded loop regions first.
6118 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
6119 // its successors.
6121 // Convert loops with variable-length stepping after regions are dissolved.
6123 // Remove dead back-edges for single-iteration loops with BranchOnCond(true).
6124 // Only process loop latches to avoid removing edges from the middle block,
6125 // which may be needed for epilogue vectorization.
6126 VPlanTransforms::removeBranchOnConst(BestVPlan, /*OnlyLatches=*/true);
// When a maximum vscale is available, bound the runtime step by
// MaxVScale * known-minimum VF * UF; otherwise MaxRuntimeStep stays empty.
6128 std::optional<uint64_t> MaxRuntimeStep;
6129 if (auto MaxVScale = getMaxVScale(*CM.TheFunction, CM.TTI))
6130 MaxRuntimeStep = uint64_t(*MaxVScale) * BestVF.getKnownMinValue() * BestUF;
6132 BestVPlan, VectorPH, CM.foldTailByMasking(),
6133 CM.requiresScalarEpilogue(BestVF.isVector()), &BestVPlan.getVFxUF(),
6134 MaxRuntimeStep);
6135 VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
6136 VPlanTransforms::cse(BestVPlan);
6138 VPlanTransforms::simplifyKnownEVL(BestVPlan, BestVF, PSE);
6139
6140 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
6141 // making any changes to the CFG.
6142 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
6143 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
6144
6145 // Perform the actual loop transformation.
6146 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
6147 OrigLoop->getParentLoop(),
6148 Legal->getWidestInductionType());
6149
6150#ifdef EXPENSIVE_CHECKS
6151 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
6152#endif
6153
6154 // 1. Set up the skeleton for vectorization, including vector pre-header and
6155 // middle block. The vector loop is created during VPlan execution.
6156 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
// If the plan still has a scalar preheader, bind it to the IR block that is
// the single successor of the block returned by the skeleton creation above.
6157 if (VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader())
6158 replaceVPBBWithIRVPBB(ScalarPH, State.CFG.PrevBB->getSingleSuccessor(),
6159 &BestVPlan);
6161
6162 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
6163
6164 // After vectorization, the exit blocks of the original loop will have
6165 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
6166 // looked through single-entry phis.
6167 ScalarEvolution &SE = *PSE.getSE();
6168 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
// Exit blocks without predecessors in the plan are skipped.
6169 if (!Exit->hasPredecessors())
6170 continue;
6171 for (VPRecipeBase &PhiR : Exit->phis())
6173 &cast<VPIRPhi>(PhiR).getIRPhi());
6174 }
6175 // Forget the original loop and block dispositions.
6176 SE.forgetLoop(OrigLoop);
6178
6180
6181 //===------------------------------------------------===//
6182 //
6183 // Notice: any optimization or new instruction that goes
6184 // into the code below should also be implemented in
6185 // the cost-model.
6186 //
6187 //===------------------------------------------------===//
6188
6189 // Retrieve loop information before executing the plan, which may remove the
6190 // original loop, if it becomes unreachable.
6191 MDNode *LID = OrigLoop->getLoopID();
6192 unsigned OrigLoopInvocationWeight = 0;
6193 std::optional<unsigned> OrigAverageTripCount =
6194 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
6195
6196 BestVPlan.execute(&State);
6197
6198 // 2.6. Maintain Loop Hints
6199 // Keep all loop hints from the original loop on the vector loop (we'll
6200 // replace the vectorizer-specific hints below).
6201 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
6202 // Add metadata to disable runtime unrolling a scalar loop when there
6203 // are no runtime checks about strides and memory. A scalar loop that is
6204 // rarely used is not worth unrolling.
6205 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
// HeaderVPBB may be null; in that case a null loop is passed below.
6207 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
6208 : nullptr,
6209 HeaderVPBB, BestVPlan,
6210 EpilogueVecKind == EpilogueVectorizationKind::Epilogue, LID,
6211 OrigAverageTripCount, OrigLoopInvocationWeight,
6212 estimateElementCount(BestVF * BestUF, Config.getVScaleForTuning()),
6213 DisableRuntimeUnroll);
6214
6215 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6216 // predication, updating analyses.
6217 ILV.fixVectorizedLoop(State);
6218
6220
// Return the SCEV expansions generated in step 0.
6221 return ExpandedSCEVs;
6222}
6223
6224//===--------------------------------------------------------------------===//
6225// EpilogueVectorizerMainLoop
6226//===--------------------------------------------------------------------===//
6227
// Debug-only trace: print the VF and UF selected for the main loop and for
// the epilogue loop (first pass).
6229 LLVM_DEBUG({
6230 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
6231 << "Main Loop VF:" << EPI.MainLoopVF
6232 << ", Main Loop UF:" << EPI.MainLoopUF
6233 << ", Epilogue Loop VF:" << EPI.EpilogueVF
6234 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6235 });
6236}
6237
// Print the parent of the original loop's header, labelled "intermediate fn".
6240 dbgs() << "intermediate fn:\n"
6241 << *OrigLoop->getHeader()->getParent() << "\n";
6242 });
6243}
6244
6245//===--------------------------------------------------------------------===//
6246// EpilogueVectorizerEpilogueLoop
6247//===--------------------------------------------------------------------===//
6248
6249/// This function creates a new scalar preheader, using the previous one as
6250/// entry block to the epilogue VPlan. The minimum iteration check is being
6251/// represented in VPlan.
/// \returns the previous scalar preheader, renamed to "vec.epilog.iter.check",
/// which is wrapped in a VPIRBasicBlock and installed as the plan's entry.
6253 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
6254 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
6255 OriginalScalarPH->setName("vec.epilog.iter.check");
6256 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
6257 VPBasicBlock *OldEntry = Plan.getEntry();
// Early-increment iteration: recipes are moved out of OldEntry while it is
// being traversed.
6258 for (auto &R : make_early_inc_range(*OldEntry)) {
6259 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
6260 // definition.
6261 if (isa<VPIRInstruction>(&R))
6262 continue;
6263 R.moveBefore(*NewEntry, NewEntry->end());
6264 }
6265
6266 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
6267 Plan.setEntry(NewEntry);
6268 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
6269
6270 return OriginalScalarPH;
6271}
6272
// Debug-only trace: print the VF and UF selected for the epilogue loop
// (second pass).
6274 LLVM_DEBUG({
6275 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
6276 << "Epilogue Loop VF:" << EPI.EpilogueVF
6277 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6278 });
6279}
6280
// Print the parent of the original loop's header, labelled "final fn".
6283 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
6284 });
6285}
6286
6288 VFRange &Range) {
6289 assert((VPI->getOpcode() == Instruction::Load ||
6290 VPI->getOpcode() == Instruction::Store) &&
6291 "Must be called with either a load or store");
6293
6294 auto WillWiden = [&](ElementCount VF) -> bool {
6296 CM.getWideningDecision(I, VF);
6298 "CM decision should be taken at this point.");
6300 return true;
6301 if (CM.isScalarAfterVectorization(I, VF) ||
6302 CM.isProfitableToScalarize(I, VF))
6303 return false;
6305 };
6306
6308 return nullptr;
6309
6310 // If a mask is not required, drop it - use unmasked version for safe loads.
6311 // TODO: Determine if mask is needed in VPlan.
6312 VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
6313
6314 // Determine if the pointer operand of the access is either consecutive or
6315 // reverse consecutive.
6317 CM.getWideningDecision(I, Range.Start);
6319 bool Consecutive =
6321
// The address is operand 0 of a load and operand 1 of a store.
6322 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
6323 : VPI->getOperand(1);
6324 if (Consecutive) {
6326 VPSingleDefRecipe *VectorPtr;
6327 if (Reverse) {
6328 // When folding the tail, we may compute an address that we don't in the
6329 // original scalar loop: drop the GEP no-wrap flags in this case.
6330 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
6331 // emit negative indices.
6332 GEPNoWrapFlags ReverseFlags = CM.foldTailByMasking()
6334 : Flags.withoutNoUnsignedWrap();
6335 VectorPtr = new VPVectorEndPointerRecipe(
6336 Ptr, &Plan.getVF(), getLoadStoreType(I),
6337 /*Stride*/ -1, ReverseFlags, VPI->getDebugLoc());
6338 } else {
// Forward consecutive access: vector pointer with a constant stride of 1 in
// the index type of the pointer's underlying value.
6339 const DataLayout &DL = I->getDataLayout();
6340 auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
6341 VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
6342 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
6343 Flags, VPI->getDebugLoc());
6344 }
// Insert the new pointer recipe right before VPI and use it as the address.
6345 Builder.setInsertPoint(VPI);
6346 Builder.insert(VectorPtr);
6347 Ptr = VectorPtr;
6348 }
6349
// A reversed access with a mask needs the mask reversed as well.
6350 if (Reverse && Mask)
6351 Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());
6352
6353 if (VPI->getOpcode() == Instruction::Load) {
6354 auto *Load = cast<LoadInst>(I);
6355 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
6356 Load->getDebugLoc());
// For a reversed load, insert the wide load here and return a Reverse of its
// result instead of the load recipe itself.
6357 if (Reverse) {
6358 Builder.insert(LoadR);
6359 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
6360 LoadR->getDebugLoc());
6361 }
6362 return LoadR;
6363 }
6364
// Store: operand 0 is the stored value; it is reversed first for a reversed
// access.
6365 StoreInst *Store = cast<StoreInst>(I);
6366 VPValue *StoredVal = VPI->getOperand(0);
6367 if (Reverse)
6368 StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
6369 Store->getDebugLoc());
6370 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
6371 Store->getDebugLoc());
6372}
6373
6375VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
6376 VFRange &Range) {
6377 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
6378 // Optimize the special case where the source is a constant integer
6379 // induction variable. Notice that we can only optimize the 'trunc' case
6380 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6381 // (c) other casts depend on pointer size.
6382
6383 // Determine whether \p K is a truncation based on an induction variable that
6384 // can be optimized.
6387 I),
6388 Range))
6389 return nullptr;
6390
6392 VPI->getOperand(0)->getDefiningRecipe());
// Reuse the phi, start value and induction descriptor of WidenIV.
6393 PHINode *Phi = WidenIV->getPHINode();
6394 VPIRValue *Start = WidenIV->getStartValue();
6395 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
6396
6397 // Wrap flags from the original induction do not apply to the truncated type,
6398 // so do not propagate them.
6399 VPIRFlags Flags = VPIRFlags::WrapFlagsTy(false, false);
6400 VPValue *Step =
6403 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
6404}
6405
/// Widening query for \p I over the VFs in \p Range. The WillScalarize
/// predicate below reports, per VF, whether the cost model keeps \p I scalar
/// after vectorization, considers scalarization profitable, or requires
/// scalarization with predication.
6406bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6408 "Instruction should have been handled earlier");
6409 // Instruction should be widened, unless it is scalar after vectorization,
6410 // scalarization is profitable or it is predicated.
6411 auto WillScalarize = [this, I](ElementCount VF) -> bool {
6412 return CM.isScalarAfterVectorization(I, VF) ||
6413 CM.isProfitableToScalarize(I, VF) ||
6414 CM.isScalarWithPredication(I, VF);
6415 };
6417 Range);
6418}
6419
/// Create a widened recipe for \p VPI based on its opcode. Returns nullptr for
/// any opcode that is not listed in the switch below.
6420VPRecipeWithIRFlags *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
6421 auto *I = VPI->getUnderlyingInstr();
6422 switch (VPI->getOpcode()) {
6423 default:
6424 return nullptr;
6425 case Instruction::SDiv:
6426 case Instruction::UDiv:
6427 case Instruction::SRem:
6428 case Instruction::URem:
6429 // If not provably safe, use a masked intrinsic.
6430 if (CM.isPredicatedInst(I))
6431 return new VPWidenIntrinsicRecipe(
6433 I->getType(), {}, {}, VPI->getDebugLoc());
// Non-predicated div/rem is widened like the plain operations below.
6434 [[fallthrough]];
6435 case Instruction::Add:
6436 case Instruction::And:
6437 case Instruction::AShr:
6438 case Instruction::FAdd:
6439 case Instruction::FCmp:
6440 case Instruction::FDiv:
6441 case Instruction::FMul:
6442 case Instruction::FNeg:
6443 case Instruction::FRem:
6444 case Instruction::FSub:
6445 case Instruction::ICmp:
6446 case Instruction::LShr:
6447 case Instruction::Mul:
6448 case Instruction::Or:
6449 case Instruction::Select:
6450 case Instruction::Shl:
6451 case Instruction::Sub:
6452 case Instruction::Xor:
6453 case Instruction::Freeze:
6454 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
6455 VPI->getDebugLoc());
6456 case Instruction::ExtractValue: {
// The single extractvalue index is appended to NewOps as a constant operand.
6458 auto *EVI = cast<ExtractValueInst>(I);
6459 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
6460 unsigned Idx = EVI->getIndices()[0];
6461 NewOps.push_back(Plan.getConstantInt(32, Idx));
6462 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
6463 }
6464 };
6465}
6466
// Only stores are considered; legality supplies the histogram info for the
// underlying store, and nullptr is returned when there is none.
6468 if (VPI->getOpcode() != Instruction::Store)
6469 return nullptr;
6470
6471 auto HistInfo =
6472 Legal->getHistogramInfo(cast<StoreInst>(VPI->getUnderlyingInstr()));
6473 if (!HistInfo)
6474 return nullptr;
6475
6476 const HistogramInfo *HI = *HistInfo;
6477 // FIXME: Support other operations.
6478 unsigned Opcode = HI->Update->getOpcode();
6479 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
6480 "Histogram update operation must be an Add or Sub");
6481
// Operands are pushed in the order: bucket address, increment value, and
// (only when required) the mask.
6483 // Bucket address.
6484 HGramOps.push_back(VPI->getOperand(1));
6485 // Increment value.
6486 HGramOps.push_back(Plan.getOrAddLiveIn(HI->Update->getOperand(1)));
6487
6488 // In case of predicated execution (due to tail-folding, or conditional
6489 // execution, or both), pass the relevant mask.
6490 if (CM.isMaskRequired(HI->Store))
6491 HGramOps.push_back(VPI->getMask());
6492
6493 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
6494}
6495
6497 VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder) {
6498 StoreInst *SI;
// Only stores whose pointer operand is an invariant address of a reduction
// are handled here.
6499 if ((SI = dyn_cast<StoreInst>(VPI->getUnderlyingInstr())) &&
6500 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6501 // Only create recipe for the final invariant store of the reduction.
6502 if (Legal->isInvariantStoreOfReduction(SI)) {
6503 VPValue *Val = VPI->getOperand(0);
6504 VPValue *Addr = VPI->getOperand(1);
6505 // We need to store the exiting value of the reduction, so use the blend
6506 // if tail folded.
6507 if (auto *Blend = vputils::findUserOf<VPBlendRecipe>(Val))
6508 Val = Blend;
6509 assert(
6510 vputils::findUserOf<VPReductionPHIRecipe>(Val)->getBackedgeValue() ==
6511 Val &&
6512 "Store isn't backedge value?");
// The replacement is a uniform, unmasked replicate recipe inserted through
// FinalRedStoresBuilder.
6513 auto *Recipe = new VPReplicateRecipe(
6514 SI, {Val, Addr}, true /* IsUniform */, nullptr /*Mask*/, *VPI, *VPI,
6515 VPI->getDebugLoc());
6516 FinalRedStoresBuilder.insert(Recipe);
6517 }
// VPI is erased for every store to such an address, whether or not a final
// store recipe was created above.
6518 VPI->eraseFromParent();
6519 return true;
6520 }
6521
// Anything else is left untouched.
6522 return false;
6523}
6524
6526 VFRange &Range) {
6527 auto *I = VPI->getUnderlyingInstr();
6529 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
6530 Range);
6531
6532 bool IsPredicated = CM.isPredicatedInst(I);
6533
6534 // Even if the instruction is not marked as uniform, there are certain
6535 // intrinsic calls that can be effectively treated as such, so we check for
6536 // them here. Conservatively, we only do this for scalable vectors, since
6537 // for fixed-width VFs we can always fall back on full scalarization.
6538 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
6539 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
6540 case Intrinsic::assume:
6541 case Intrinsic::lifetime_start:
6542 case Intrinsic::lifetime_end:
6543 // For scalable vectors if one of the operands is variant then we still
6544 // want to mark as uniform, which will generate one instruction for just
6545 // the first lane of the vector. We can't scalarize the call in the same
6546 // way as for fixed-width vectors because we don't know how many lanes
6547 // there are.
6548 //
6549 // The reasons for doing it this way for scalable vectors are:
6550 // 1. For the assume intrinsic generating the instruction for the first
6551 // lane is still better than not generating any at all. For
6552 // example, the input may be a splat across all lanes.
6553 // 2. For the lifetime start/end intrinsics the pointer operand only
6554 // does anything useful when the input comes from a stack object,
6555 // which suggests it should always be uniform. For non-stack objects
6556 // the effect is to poison the object, which still allows us to
6557 // remove the call.
6558 IsUniform = true;
6559 break;
6560 default:
6561 break;
6562 }
6563 }
// BlockInMask stays null unless the instruction is predicated.
6564 VPValue *BlockInMask = nullptr;
6565 if (!IsPredicated) {
6566 // Finalize the recipe for Instr, first if it is not predicated.
6567 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6568 } else {
6569 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6570 // Instructions marked for predication are replicated and a mask operand is
6571 // added initially. Masked replicate recipes will later be placed under an
6572 // if-then construct to prevent side-effects. Generate recipes to compute
6573 // the block mask for this region.
6574 BlockInMask = VPI->getMask();
6575 }
6576
6577 // Note that there is some custom logic to mark some intrinsics as uniform
6578 // manually above for scalable vectors, which this assert needs to account for
6579 // as well.
6580 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
6581 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
6582 "Should not predicate a uniform recipe");
6583 auto *Recipe =
6584 new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
6585 BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
6586 return Recipe;
6587}
6588
6591 VFRange &Range) {
6592 assert(!R->isPhi() && "phis must be handled earlier");
6593 // First, check for specific widening recipes that deal with optimizing
6594 // truncates and memory operations.
6595 auto *VPI = cast<VPInstruction>(R);
6596 assert(VPI->getOpcode() != Instruction::Call &&
6597 "Call should have been handled by makeCallWideningDecisions");
6598
// Truncates get a first chance at the induction-truncate optimization; a null
// result falls through to the generic handling below.
6599 VPRecipeBase *Recipe;
6600 if (VPI->getOpcode() == Instruction::Trunc &&
6601 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
6602 return Recipe;
6603
6604 // All widen recipes below deal only with VF > 1.
6606 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6607 return nullptr;
6608
6609 Instruction *Instr = R->getUnderlyingInstr();
6610 assert(!is_contained({Instruction::Load, Instruction::Store},
6611 VPI->getOpcode()) &&
6612 "Should have been handled prior to this!");
6613
6614 if (!shouldWiden(Instr, Range))
6615 return nullptr;
6616
// GEPs and casts get dedicated widen recipes; every other opcode is handed to
// tryToWiden().
6617 if (VPI->getOpcode() == Instruction::GetElementPtr)
6618 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr),
6619 VPI->operandsWithoutMask(), *VPI,
6620 VPI->getDebugLoc());
6621
6622 if (Instruction::isCast(VPI->getOpcode())) {
6623 auto *CI = cast<CastInst>(Instr);
6624 auto *CastR = cast<VPInstructionWithType>(VPI);
6625 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
6626 CastR->getResultType(), CI, *VPI, *VPI,
6627 VPI->getDebugLoc());
6628 }
6629
6630 return tryToWiden(VPI);
6631}
6632
6633// To allow RUN_VPLAN_PASS to print the VPlan after VF/UF independent
6634// optimizations.
6636
/// Build candidate VPlans for the VFs starting at \p MinVF and below
/// 2 * \p MaxVF, appending every successfully built plan to VPlans. Does
/// nothing when \p MinVF is known to be greater than \p MaxVF, and returns
/// without building plans when one of the VPlan0 preparation steps bails out.
6637void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
6638 ElementCount MaxVF) {
6639 if (ElementCount::isKnownGT(MinVF, MaxVF))
6640 return;
6641
6642 bool IsInnerLoop = OrigLoop->isInnermost();
6643
6644 // Set up loop versioning for inner loops with memory runtime checks.
6645 // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not
6646 // called for them.
6647 std::optional<LoopVersioning> LVer;
6648 if (IsInnerLoop) {
6649 const LoopAccessInfo *LAI = Legal->getLAI();
6650 LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop,
6651 LI, DT, PSE.getSE());
6652 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
6654 // Only use noalias metadata when using memory checks guaranteeing no
6655 // overlap across all iterations.
6656 LVer->prepareNoAliasMetadata();
6657 }
6658 }
6659
6660 // Create initial base VPlan0, to serve as common starting point for all
6661 // candidates built later for specific VF ranges.
6662 auto VPlan0 = VPlanTransforms::buildVPlan0(OrigLoop, *LI,
6663 Legal->getWidestInductionType(),
6664 PSE, LVer ? &*LVer : nullptr);
6665
6666 // Create recipes for header phis. For outer loops, reductions, recurrences
6667 // and in-loop reductions are empty since legality doesn't detect them.
6669 *OrigLoop, Legal->getInductionVars(),
6670 Legal->getReductionVars(),
6671 Legal->getFixedOrderRecurrences(),
6672 Config.getInLoopReductions(), Hints.allowReordering()))
6673 return;
6674
6675 if (const LoopAccessInfo *LAI = Legal->getLAI())
6677 LAI->getSymbolicStrides());
6681 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()));
6682 // If we're vectorizing a loop with an uncountable exit, make sure that the
6683 // recipes are safe to handle.
6684 // TODO: Remove this once we can properly check the VPlan itself for both
6685 // the presence of an uncountable exit and the presence of stores in
6686 // the loop inside handleEarlyExits itself.
6688 if (Legal->hasUncountableEarlyExit())
6689 EEStyle = Legal->hasUncountableExitWithSideEffects()
6692
6694 OrigLoop, PSE, *DT, Legal->getAssumptionCache()))
6695 return;
6696
6698 CM.foldTailByMasking());
6700 if (CM.foldTailByMasking())
6703
// Each iteration builds one plan from a fresh copy of VPlan0. SubRange is
// passed by reference, and the next iteration resumes at its End.
6704 auto MaxVFTimes2 = MaxVF * 2;
6705 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
6706 VFRange SubRange = {VF, MaxVFTimes2};
6707 auto Plan =
6708 tryToBuildVPlan(std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
6709 VF = SubRange.End;
6710
6711 if (!Plan)
6712 continue;
6713
6714 // Now optimize the initial VPlan.
6718 Config.getMinimalBitwidths());
6720 // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
6721 if (CM.foldTailWithEVL()) {
6723 Config.getMaxSafeElements());
6725 }
6726
// narrowInterleaveGroups may yield an additional plan, which is recorded
// ahead of Plan itself.
6727 if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
6728 VPlans.push_back(std::move(P));
6729
6731 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6732 VPlans.push_back(std::move(Plan));
6733 }
6734}
6735
/// Turn \p Plan into a plan covering the VFs of \p Range, which may be clamped
/// while decisions are taken. Returns the finished plan, or nullptr when one
/// of the mandatory transforms below bails out.
6736VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
6737 VFRange &Range) {
6738
6739 // For outer loops, the plan only needs basic recipe conversion and induction
6740 // live-out optimization; the full inner-loop recipe building below does not
6741 // apply (no widening decisions, interleave groups, reductions, etc.).
6742 if (Plan->isOuterLoop()) {
6743 for (ElementCount VF : Range)
6744 Plan->addVF(VF);
6746 return nullptr;
6748 /*FoldTail=*/false);
6749 return Plan;
6750 }
6751
6752 using namespace llvm::VPlanPatternMatch;
6753 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
6754
6755 // ---------------------------------------------------------------------------
6756 // Build initial VPlan: Scan the body of the loop in a topological order to
6757 // visit each basic block after having visited its predecessor basic blocks.
6758 // ---------------------------------------------------------------------------
6759
6760 bool RequiresScalarEpilogueCheck =
6762 [this](ElementCount VF) {
6763 return !CM.requiresScalarEpilogue(VF.isVector());
6764 },
6765 Range);
6766 // Update the branch in the middle block if a scalar epilogue is required.
6767 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6768 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
6769 auto *BranchOnCond = cast<VPInstruction>(MiddleVPBB->getTerminator());
6770 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
6771 "second successor must be scalar preheader");
6772 BranchOnCond->setOperand(0, Plan->getFalse());
6773 }
6774
6775 // Don't use getDecisionAndClampRange here, because we don't know the UF
6776 // so this function is better to be conservative, rather than to split
6777 // it up into different VPlans.
6778 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
6779 bool IVUpdateMayOverflow = false;
6780 for (ElementCount VF : Range)
6781 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
6782
6783 TailFoldingStyle Style = CM.getTailFoldingStyle();
6784 // Use NUW for the induction increment if we proved that it won't overflow in
6785 // the vector loop or when not folding the tail. In the latter case, we know
6786 // that the canonical induction increment will not overflow as the vector trip
6787 // count is >= increment and a multiple of the increment.
6788 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
6789 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
6790 if (!HasNUW) {
6791 auto *IVInc =
6792 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
6793 assert(match(IVInc,
6794 m_VPInstruction<Instruction::Add>(
6795 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
6796 "Did not find the canonical IV increment");
6797 LoopRegion->clearCanonicalIVNUW(cast<VPInstruction>(IVInc));
6798 }
6799
6800 // ---------------------------------------------------------------------------
6801 // Pre-construction: record ingredients whose recipes we'll need to further
6802 // process after constructing the initial VPlan.
6803 // ---------------------------------------------------------------------------
6804
6805 // For each interleave group which is relevant for this (possibly trimmed)
6806 // Range, add it to the set of groups to be later applied to the VPlan and add
6807 // placeholders for its members' Recipes which we'll be replacing with a
6808 // single VPInterleaveRecipe.
6809 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
6810 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
6811 bool Result = (VF.isVector() && // Query is illegal for VF == 1
6812 CM.getWideningDecision(IG->getInsertPos(), VF) ==
6814 // For scalable vectors, the interleave factors must be <= 8 since we
6815 // require the (de)interleaveN intrinsics instead of shufflevectors.
6816 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
6817 "Unsupported interleave factor for scalable vectors");
6818 return Result;
6819 };
6820 if (!getDecisionAndClampRange(ApplyIG, Range))
6821 continue;
6822 InterleaveGroups.insert(IG);
6823 }
6824
6825 // ---------------------------------------------------------------------------
6826 // Construct wide recipes and apply predication for original scalar
6827 // VPInstructions in the loop.
6828 // ---------------------------------------------------------------------------
6829 VPRecipeBuilder RecipeBuilder(*Plan, Legal, CM, Builder);
6830
6831 // Scan the body of the loop in a topological order to visit each basic block
6832 // after having visited its predecessor basic blocks.
6833 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
6834 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
6835 HeaderVPBB);
6836
6838 Range.Start);
6839
6840 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6841 OrigLoop);
6842
6844 RecipeBuilder);
6845
6847
6849 RecipeBuilder, CostCtx);
6850
6851 // Now process all other blocks and instructions.
6852 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
6853 // Convert input VPInstructions to widened recipes.
6854 for (VPRecipeBase &R : make_early_inc_range(
6855 make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
6856 // Skip recipes that do not need transforming or have already been
6857 // transformed.
6858 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
6859 VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
6860 VPWidenCallRecipe, VPWidenIntrinsicRecipe, VPVectorPointerRecipe,
6861 VPVectorEndPointerRecipe, VPHistogramRecipe>(&R))
6862 continue;
6863 auto *VPI = cast<VPInstruction>(&R);
// VPInstructions without an underlying value are left as they are.
6864 if (!VPI->getUnderlyingValue())
6865 continue;
6866
6867 // TODO: Gradually replace uses of underlying instruction by analyses on
6868 // VPlan. Migrate code relying on the underlying instruction from VPlan0
6869 // to construct recipes below to not use the underlying instruction.
6871 Builder.setInsertPoint(VPI);
6872
// Try a widened recipe first; fall back to replication when none is created.
// NOTE(review): VPI is already a VPInstruction* (see the cast above), so the
// cast in the handleReplication call is redundant.
6873 VPRecipeBase *Recipe =
6874 RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
6875 if (!Recipe)
6876 Recipe =
6877 RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
6878
6879 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
6880 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
6881 // moved to the phi section in the header.
6882 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
6883 } else {
6884 Builder.insert(Recipe);
6885 }
// Redirect users of the original VPInstruction to the new recipe's value (if
// it defines one), then erase the original.
6886 if (Recipe->getNumDefinedValues() == 1) {
6887 VPI->replaceAllUsesWith(Recipe->getVPSingleValue());
6888 } else {
6889 assert(Recipe->getNumDefinedValues() == 0 &&
6890 "Unexpected multidef recipe");
6891 }
6892 R.eraseFromParent();
6893 }
6894 }
6895
6896 assert(isa<VPRegionBlock>(LoopRegion) &&
6897 !LoopRegion->getEntryBasicBlock()->empty() &&
6898 "entry block must be set to a VPRegionBlock having a non-empty entry "
6899 "VPBasicBlock");
6900
6902 Range);
6903
6904 // ---------------------------------------------------------------------------
6905 // Transform initial VPlan: Apply previously taken decisions, in order, to
6906 // bring the VPlan to its final state.
6907 // ---------------------------------------------------------------------------
6908
6909 addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
6910
6911 // Optimize FindIV reductions to use sentinel-based approach when possible.
6913 *OrigLoop);
6915 CM.foldTailByMasking());
6916
6917 // Apply mandatory transformation to handle reductions with multiple in-loop
6918 // uses if possible, bail out otherwise.
6920 OrigLoop))
6921 return nullptr;
6922 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
6923 // NaNs if possible, bail out otherwise.
6925 return nullptr;
6926
6927 // Create whole-vector selects for find-last recurrences.
6929 return nullptr;
6930
6932
6933 // Create partial reduction recipes for scaled reductions and transform
6934 // recipes to abstract recipes if it is legal and beneficial and clamp the
6935 // range for better cost estimation.
6936 // TODO: Enable following transform when the EVL-version of extended-reduction
6937 // and mulacc-reduction are implemented.
6938 if (!CM.foldTailWithEVL()) {
6939 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6940 OrigLoop);
6942 Range);
6944 Range);
6945 }
6946
6947 // Interleave memory: for each Interleave Group we marked earlier as relevant
6948 // for this VPlan, replace the Recipes widening its memory instructions with a
6949 // single VPInterleaveRecipe at its insertion point.
6951 InterleaveGroups, CM.isEpilogueAllowed());
6952
6953 // Convert memory recipes to strided access recipes if the strided access is
6954 // legal and profitable. Use a new VPCostContext to ensure type inference
6955 // reflects the current plan state.
6956 // TODO: Remove this VPCostContext scope once VPTypeAnalysis is removed.
6957 {
6958 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6959 OrigLoop);
6961 *OrigLoop, CostCtx, Range);
6962 }
6963
6964 // Ensure scalar VF plans only contain VF=1, as required by hasScalarVFOnly.
6965 if (Range.Start.isScalar())
6966 Range.End = Range.Start * 2;
6967
// Register every VF of the (possibly clamped) Range with the plan.
6968 for (ElementCount VF : Range)
6969 Plan->addVF(VF);
6970 Plan->setName("Initial VPlan");
6971
6973
6974 if (useActiveLaneMask(Style)) {
6975 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
6976 // TailFoldingStyle is visible there.
6977 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
6978 RUN_VPLAN_PASS(VPlanTransforms::addActiveLaneMask, *Plan, ForControlFlow);
6979 }
6980
6981 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6982 return Plan;
6983}
6984
/// Add the recipes computing the final value of each reduction in \p Plan.
///
/// For every VPReductionPHIRecipe among the phis of the vector loop region's
/// entry block, this creates the final reduction result in the middle block
/// and redirects users of the exiting value that live outside the vector
/// region to that result.
///
/// \param Plan          The plan being transformed.
/// \param RecipeBuilder Not referenced on any line visible in this listing.
/// \param MinVF         The narrowing trunc/extend of the exit value is only
///                      considered when this is a vector VF.
///
/// NOTE(review): the embedded listing numbers skip in several places (e.g.
/// 6990 -> 6992, 7003 -> 7005, 7108 -> 7110, 7160 -> 7165), so some
/// statements -- including the declarations of Builder and ToDelete -- are
/// not visible here. Confirm against the full source before editing.
6985void LoopVectorizationPlanner::addReductionResultComputation(
6986 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
6987 using namespace VPlanPatternMatch;
6988 VPTypeAnalysis TypeInfo(*Plan);
6989 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
6990 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6992 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
// Point the builder at the second-to-last recipe of the latch block.
6993 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
// Insertion point used for everything created in the middle block below.
6994 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
6995 VPValue *HeaderMask = vputils::findHeaderMask(*Plan);
6996 for (VPRecipeBase &R :
6997 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
6998 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6999 if (!PhiR)
7000 continue;
7001
7002 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
7003 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
7005 Type *PhiTy = TypeInfo.inferScalarType(PhiR);
7006
7007 // Convert a VPBlendRecipe backedge to a select. This is only done for a
// two-input blend whose first mask is the header mask.
7008 if (auto *Blend = dyn_cast<VPBlendRecipe>(PhiR->getBackedgeValue())) {
7009 if (Blend->getNumIncomingValues() == 2 &&
7010 Blend->getMask(0) == HeaderMask) {
7011 auto *Sel = VPBuilder(Blend).createSelect(
7012 Blend->getMask(0), Blend->getIncomingValue(0),
7013 Blend->getIncomingValue(1), {}, "", *Blend);
7014 Blend->replaceAllUsesWith(Sel);
7015 Blend->eraseFromParent();
7016 }
7017 }
7018
// Both names start out as the current backedge value. OrigExitingVPV is used
// further down to find users outside the region; NewExitingVPV is what feeds
// the final reduction result and may be redirected (see the AnyOf case).
7019 auto *OrigExitingVPV = PhiR->getBackedgeValue();
7020 auto *NewExitingVPV = PhiR->getBackedgeValue();
7021
7022 // Remove the predicated select if the target doesn't want it.
7023 VPValue *V;
7024 if (!CM.usePredicatedReductionSelect(RecurrenceKind) &&
7025 match(PhiR->getBackedgeValue(),
7026 m_Select(m_Specific(HeaderMask), m_VPValue(V), m_Specific(PhiR))))
7027 PhiR->setBackedgeValue(V);
7028
7029 // We want code in the middle block to appear to execute on the location of
7030 // the scalar loop's latch terminator because: (a) it is all compiler
7031 // generated, (b) these instructions are always executed after evaluating
7032 // the latch conditional branch, and (c) other passes may add new
7033 // predecessors which terminate on this line. This is the easiest way to
7034 // ensure we don't accidentally cause an extra step back into the loop while
7035 // debugging.
7036 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
7037
7038 // TODO: At the moment ComputeReductionResult also drives creation of the
7039 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
7040 // even for in-loop reductions, until the reduction resume value handling is
7041 // also modeled in VPlan.
7042 VPInstruction *FinalReductionResult;
7043 VPBuilder::InsertPointGuard Guard(Builder);
7044 Builder.setInsertPoint(MiddleVPBB, IP);
7045 // For AnyOf reductions, find the select among PhiR's users and convert
7046 // the reduction phi to operate on bools before creating the final
7047 // reduction result.
7048 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
7049 auto *AnyOfSelect =
7050 cast<VPSingleDefRecipe>(*find_if(PhiR->users(), [](VPUser *U) {
7051 return match(U, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
7052 }));
7053 VPValue *Start = PhiR->getStartValue();
7054 bool TrueValIsPhi = AnyOfSelect->getOperand(1) == PhiR;
7055 // NewVal is the non-phi operand of the select.
7056 VPValue *NewVal = TrueValIsPhi ? AnyOfSelect->getOperand(2)
7057 : AnyOfSelect->getOperand(1);
7058
7059 // Adjust AnyOf reductions; replace the reduction phi for the selected
7060 // value with a boolean reduction phi node to check if the condition is
7061 // true in any iteration. The final value is selected by the final
7062 // ComputeReductionResult.
7063 VPValue *Cmp = AnyOfSelect->getOperand(0);
7064 // If the compare is checking the reduction PHI node, adjust it to check
7065 // the start value.
7066 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
7067 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
7068 Builder.setInsertPoint(AnyOfSelect);
7069
7070 // If the true value of the select is the reduction phi, the new value
7071 // is selected if the negated condition is true in any iteration.
7072 if (TrueValIsPhi)
7073 Cmp = Builder.createNot(Cmp);
7074
7075 // Convert the reduction phi to operate on bools.
7076 auto *NewPhiR =
7077 PhiR->cloneWithOperands(Plan->getFalse(), PhiR->getBackedgeValue());
7078 NewPhiR->insertBefore(PhiR);
7079 PhiR->replaceAllUsesWith(NewPhiR);
7080
7081 VPValue *Or = Builder.createOr(NewPhiR, Cmp);
7082 // Only replace uses inside the vector region with Or. External uses
7083 // (e.g. scalar preheader resume phis) must be replaced by the user
7084 // update loop below with FinalReductionResult.
7085 AnyOfSelect->replaceUsesWithIf(Or, [](VPUser &U, unsigned) {
7086 return cast<VPRecipeBase>(&U)->getRegion();
7087 });
// Deletion is deferred: ToDelete is erased after the loop over the phis.
7088 ToDelete.push_back(AnyOfSelect);
7089
7090 // Update NewExitingVPV if it was pointing to the now-replaced select.
7091 if (NewExitingVPV == AnyOfSelect)
7092 NewExitingVPV = Or;
7093
// Return to the middle block before creating the final reduction result.
7094 Builder.setInsertPoint(MiddleVPBB, IP);
7095
7096 FinalReductionResult =
7097 Builder.createAnyOfReduction(NewExitingVPV, NewVal, Start, ExitDL);
7098 } else {
7099 VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
7100 PhiR->getFastMathFlags());
7101 FinalReductionResult =
7102 Builder.createNaryOp(VPInstruction::ComputeReductionResult,
7103 {NewExitingVPV}, Flags, ExitDL);
7104 }
7105 // If the vector reduction can be performed in a smaller type, we truncate
7106 // then extend the loop exit value to enable InstCombine to evaluate the
7107 // entire expression in the smaller type.
7108 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
7110 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
7112 "Unexpected truncated min-max recurrence!");
7113 Type *RdxTy = RdxDesc.getRecurrenceType();
7114 VPWidenCastRecipe *Trunc;
7115 Instruction::CastOps ExtendOpc =
7116 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
7117 VPWidenCastRecipe *Extnd;
7118 {
// The trunc/extend pair is created directly after the recipe defining the
// exiting value, not at the builder's current (middle block) position.
7119 VPBuilder::InsertPointGuard Guard(Builder);
7120 Builder.setInsertPoint(
7121 NewExitingVPV->getDefiningRecipe()->getParent(),
7122 std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
7123 Trunc =
7124 Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
7125 Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy);
7126 }
7127 if (PhiR->getOperand(1) == NewExitingVPV)
7128 PhiR->setOperand(1, Extnd->getVPSingleValue());
7129
7130 // Update ComputeReductionResult with the truncated exiting value and
7131 // extend its result. Operand 0 provides the values to be reduced.
7132 FinalReductionResult->setOperand(0, Trunc);
7133 FinalReductionResult =
7134 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
7135 }
7136
7137 // Update all users outside the vector region. Also replace redundant
7138 // extracts.
// The user list is copied with to_vector first; presumably because the
// replaceUsesOfWith call in the loop body changes OrigExitingVPV's users.
7139 for (auto *U : to_vector(OrigExitingVPV->users())) {
7140 auto *Parent = cast<VPRecipeBase>(U)->getParent();
7141 if (FinalReductionResult == U || Parent->getParent())
7142 continue;
7143 // Skip ComputeReductionResult and FindIV reductions when they are not the
7144 // final result.
7145 if (match(U, m_VPInstruction<VPInstruction::ComputeReductionResult>()) ||
7147 match(U, m_VPInstruction<Instruction::ICmp>())))
7148 continue;
7149 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
7150
7151 // Look through ExtractLastPart.
7153 U = cast<VPInstruction>(U)->getSingleUser();
7154
7157 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
7158 }
7159
7160 RecurKind RK = PhiR->getRecurrenceKind();
// NOTE(review): listing lines 7161-7164 are missing, so the condition guarding
// the start-value rewrite below is not visible here.
7165 VPBuilder PHBuilder(Plan->getVectorPreheader());
7166 VPValue *Iden = Plan->getOrAddLiveIn(
7167 getRecurrenceIdentity(RK, PhiTy, PhiR->getFastMathFlags()));
7168 auto *ScaleFactorVPV = Plan->getConstantInt(32, 1);
7169 VPValue *StartV = PHBuilder.createNaryOp(
7171 {PhiR->getStartValue(), Iden, ScaleFactorVPV}, *PhiR);
7172 PhiR->setOperand(0, StartV);
7173 }
7174 }
// Erase the recipes whose removal was deferred while iterating over the phis.
7175 for (VPRecipeBase *R : ToDelete)
7176 R->eraseFromParent();
7177
7179}
7180
// Handles the SCEV and memory runtime-check blocks obtained from \p RTChecks.
// Each block is only processed when it exists and has no predecessors.
// NOTE(review): the embedded listing numbers skip 7181, 7188, 7195 and 7213,
// so the line naming this definition and the lines opening the statements
// that consume (condition, block, HasBranchWeights) are not visible here --
// presumably those statements attach the check to Plan; confirm against the
// full source.
7182 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
7183 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
7184 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
// SCEV checks are only expected when not optimizing for size, unless
// vectorization was explicitly forced.
7185 assert((!Config.OptForSize ||
7186 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
7187 "Cannot SCEV check stride or overflow when optimizing for size");
7189 SCEVCheckBlock, HasBranchWeights);
7190 }
7191 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
7192 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
7193 // VPlan-native path does not do any analysis for runtime checks
7194 // currently.
7196 "Runtime checks are not supported for outer loops yet");
7197
// When optimizing for size, memory checks are only expected under forced
// vectorization; a remark tells the user how the code size could be reduced.
7198 if (Config.OptForSize) {
7199 assert(
7200 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
7201 "Cannot emit memory checks when optimizing for size, unless forced "
7202 "to vectorize.");
7203 ORE->emit([&]() {
7204 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
7205 OrigLoop->getStartLoc(),
7206 OrigLoop->getHeader())
7207 << "Code-size may be reduced by not forcing "
7208 "vectorization, or by source-code modifications "
7209 "eliminating the need for runtime checks "
7210 "(e.g., adding 'restrict').";
7211 });
7212 }
7214 MemCheckBlock, HasBranchWeights);
7215 }
7216}
7217
// Computes optional branch weights -- non-null only when the original loop's
// latch terminator carries branch-weight metadata -- and forwards them,
// together with the trip-count, scalar-epilogue and tail-folding information
// below, to a call whose opening line is not visible here.
// NOTE(review): the embedded listing numbers skip 7218, 7223 and 7225, so the
// line naming this definition, the non-null branch-weight value and the name
// of the callee are missing from this listing; confirm against the full
// source.
7219 VPlan &Plan, ElementCount VF, unsigned UF,
7220 ElementCount MinProfitableTripCount) const {
7221 const uint32_t *BranchWeights =
7222 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
7224 : nullptr;
7226 MinProfitableTripCount,
7227 CM.requiresScalarEpilogue(VF.isVector()),
7228 CM.foldTailByMasking(), OrigLoop, BranchWeights,
7229 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
7230 PSE, Plan.getEntry());
7231}
7232
7233// Determine how to lower the epilogue, which depends on 1) optimising
7234// for minimum code-size, 2) tail-folding compiler options, 3) loop
7235// hints forcing tail-folding, and 4) a TTI hook that analyses whether the loop
7236// is suitable for tail-folding.
7237// This function determines epilogue lowering for the main vector loop while
7238// epilogue lowering for the tail-folded epilogue path will be handled
7239// separately in getEpilogueTailLowering.
// The four criteria are checked in the order listed; the first one that
// applies decides the result, and CM_EpilogueAllowed is the final fallback.
// NOTE(review): the embedded listing numbers skip 7241, 7243, 7249, 7254,
// 7256-7259, 7265-7267 and 7274, so the function name, part of the parameter
// list, most switch cases and several return statements are not visible here.
7240static EpilogueLowering
7242 bool OptForSize, TargetTransformInfo *TTI,
7244 InterleavedAccessInfo *IAI) {
7245 // 1) OptSize takes precedence over all other options, i.e. if this is set,
7246 // don't look at hints or options, and don't request an epilogue.
7247 if (F->hasOptSize() ||
7248 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
7250
7251 // 2) If set, obey the directives
// Only consulted when the TailFoldingPolicy option was given explicitly.
7252 if (TailFoldingPolicy.getNumOccurrences()) {
7253 switch (TailFoldingPolicy) {
7255 return CM_EpilogueAllowed;
7260 };
7261 }
7262
7263 // 3) If set, obey the hints
7264 switch (Hints.getPredicate()) {
7268 return CM_EpilogueAllowed;
7269 };
7270
7271 // 4) if the TTI hook indicates this is profitable, request tail-folding.
7272 TailFoldingInfo TFI(TLI, &LVL, IAI);
7273 if (TTI->preferTailFoldingOverEpilogue(&TFI))
7275
7276 return CM_EpilogueAllowed;
7277}
7278
7279/// Determine how to lower the epilogue for the vector epilogue loop.
7280/// Check if there are any conflicts that prevent tail-folding the epilogue.
7281/// \return CM_EpilogueNotNeededFoldTail if epilogue tail-folding is possible,
7282/// otherwise CM_EpilogueAllowed.
/// Every early exit visible below falls back to CM_EpilogueAllowed.
/// NOTE(review): the embedded listing numbers skip 7284-7285, 7288, 7290-7292
/// and 7315, so the signature, parts of two conditions and the final return
/// statement are not visible in this listing.
7283static EpilogueLowering
7286 // Epilogue TF is only enabled when explicitly requested via command line.
7287 if (!EpilogueTailFoldingPolicy.getNumOccurrences() ||
7289 return CM_EpilogueAllowed;
7290
7293 "Options conflict, epilogue vectorization is disallowed while "
7294 "epilogue tail-folding allowed!\n",
7295 "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
7296 return CM_EpilogueAllowed;
7297 }
7298
7299 // If scalar epilogue is explicitly required, we can't apply TF.
7300 if (MainCM.requiresScalarEpilogue(/*IsVectorizing*/ true)) {
7301 LLVM_DEBUG(dbgs() << "LV: Epilogue tail-folding can't be applied because "
7302 "scalar epilogue is required\n"
7303 "LV: Fall back to a normal epilogue\n");
7304 return CM_EpilogueAllowed;
7305 }
7306
7307 // If having epilogue is NOT allowed, then no epilogue to apply TF for.
7308 if (!MainCM.isEpilogueAllowed()) {
7309 LLVM_DEBUG(dbgs() << "LV: No epilogue to apply tail-folding for.\n"
7310 "LV: Fall back to a normal epilogue\n");
7311 return CM_EpilogueAllowed;
7312 }
7313
7314 // We can apply tail-folding on the vectorized epilogue loop.
7316}
7317
7318// Emit a remark if there are stores to floats that required a floating point
7319// extension. If the vectorized loop was generated with floating point there
7320// will be a performance penalty from the conversion overhead and the change in
7321// the vector width.
// NOTE(review): the embedded listing numbers skip 7322-7323 and 7335-7336, so
// the signature and the declarations of Worklist, Visited and EmittedRemark
// are not visible in this listing.
// Seed the worklist with every store in the loop whose stored value has type
// float.
7324 for (BasicBlock *BB : L->getBlocks()) {
7325 for (Instruction &Inst : *BB) {
7326 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
7327 if (S->getValueOperand()->getType()->isFloatTy())
7328 Worklist.push_back(S);
7329 }
7330 }
7331 }
7332
7333 // Traverse upwards from the floating point stores, searching for floating
7334 // point conversions.
7337 while (!Worklist.empty()) {
7338 auto *I = Worklist.pop_back_val();
// Instructions outside the loop are neither reported nor traversed further.
7339 if (!L->contains(I))
7340 continue;
// Each instruction is processed at most once.
7341 if (!Visited.insert(I).second)
7342 continue;
7343
7344 // Emit a remark if the floating point store required a floating
7345 // point conversion.
7346 // TODO: More work could be done to identify the root cause such as a
7347 // constant or a function return type and point the user to it.
7348 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
7349 ORE->emit([&]() {
7350 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
7351 I->getDebugLoc(), L->getHeader())
7352 << "floating point conversion changes vector width. "
7353 << "Mixed floating point precision requires an up/down "
7354 << "cast that will negatively impact performance.";
7355 });
7356
// Continue the upward walk through every operand that is an instruction.
7357 for (Use &Op : I->operands())
7358 if (auto *OpI = dyn_cast<Instruction>(Op))
7359 Worklist.push_back(OpI);
7360 }
7361}
7362
7363/// For loops with uncountable early exits, find the cost of doing work when
7364/// exiting the loop early, such as calculating the final exit values of
7365/// variables used outside the loop.
7366/// TODO: This is currently overly pessimistic because the loop may not take
7367/// the early exit, but better to keep this conservative for now. In future,
7368/// it might be possible to relax this by using branch probabilities.
/// \return the summed cost, for \p VF, of every predecessor of an exit block
/// of \p Plan that is not the middle block; 0 if there is none.
/// NOTE(review): the embedded listing numbers skip 7369, so the first line of
/// the signature (return type, name, leading parameters) is not visible in
/// this listing.
7370 VPlan &Plan, ElementCount VF) {
7371 InstructionCost Cost = 0;
7372 for (auto *ExitVPBB : Plan.getExitBlocks()) {
7373 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
7374 // If the predecessor is not the middle.block, then it must be the
7375 // vector.early.exit block, which may contain work to calculate the exit
7376 // values of variables used outside the loop.
7377 if (PredVPBB != Plan.getMiddleBlock()) {
7378 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
7379 << PredVPBB->getName() << ":\n");
7380 Cost += PredVPBB->cost(VF, CostCtx);
7381 }
7382 }
7383 }
7384 return Cost;
7385}
7386
7387/// This function determines whether or not it's still profitable to vectorize
7388/// the loop given the extra work we have to do outside of the loop:
7389/// 1. Perform the runtime checks before entering the loop to ensure it's safe
7390/// to vectorize.
7391/// 2. In the case of loops with uncountable early exits, we may have to do
7392/// extra work when exiting the loop early, such as calculating the final
7393/// exit values of variables used outside the loop.
7394/// 3. The middle block.
/// \return false when the runtime-check cost is invalid, when the
/// interleave-only threshold check fails, or when the expected trip count is
/// known to be below VF.MinProfitableTripCount; true otherwise.
/// NOTE(review): the embedded listing numbers skip 7397, 7409 and 7482, so one
/// parameter line, the interleave-only threshold condition and (presumably)
/// the assignment of VF.MinProfitableTripCount from MinTC are not visible in
/// this listing.
7395static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
7396 VectorizationFactor &VF, Loop *L,
7398 VPCostContext &CostCtx, VPlan &Plan,
7399 EpilogueLowering SEL,
7400 std::optional<unsigned> VScale) {
7401 InstructionCost RtC = Checks.getCost();
7402 if (!RtC.isValid())
7403 return false;
7404
7405 // When interleaving only scalar and vector cost will be equal, which in turn
7406 // would lead to a divide by 0. Fall back to hard threshold.
7407 if (VF.Width.isScalar()) {
7408 // TODO: Should we rename VectorizeMemoryCheckThreshold?
7410 LLVM_DEBUG(
7411 dbgs()
7412 << "LV: Interleaving only is not profitable due to runtime checks\n");
7413 return false;
7414 }
7415 return true;
7416 }
7417
7418 // The scalar cost should only be 0 when vectorizing with a user specified
7419 // VF/IC. In those cases, runtime checks should always be generated.
7420 uint64_t ScalarC = VF.ScalarCost.getValue();
7421 if (ScalarC == 0)
7422 return true;
7423
7424 InstructionCost TotalCost = RtC;
7425 // Add on the cost of any work required in the vector early exit block, if
7426 // one exists.
7427 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
7428 TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
7429
7430 // First, compute the minimum iteration count required so that the vector
7431 // loop outperforms the scalar loop.
7432 // The total cost of the scalar loop is
7433 // ScalarC * TC
7434 // where
7435 // * TC is the actual trip count of the loop.
7436 // * ScalarC is the cost of a single scalar iteration.
7437 //
7438 // The total cost of the vector loop is
7439 // TotalCost + VecC * (TC / VF) + EpiC
7440 // where
7441 // * TotalCost is the sum of the costs of
7442 // - the generated runtime checks, i.e. RtC
7443 // - performing any additional work in the vector.early.exit block for
7444 // loops with uncountable early exits.
7445 // - the middle block, if ExpectedTC <= VF.Width.
// NOTE(review): the code above adds the middle block cost unconditionally;
// the "if ExpectedTC <= VF.Width" qualifier may be stale -- confirm.
7446 // * VecC is the cost of a single vector iteration.
7447 // * TC is the actual trip count of the loop
7448 // * VF is the vectorization factor
7449 // * EpiC is the cost of the generated epilogue, including the cost
7450 // of the remaining scalar operations.
7451 //
7452 // Vectorization is profitable once the total vector cost is less than the
7453 // total scalar cost:
7454 // TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
7455 //
7456 // Now we can compute the minimum required trip count TC as
7457 // VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
7458 //
7459 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
7460 // the computations are performed on doubles, not integers and the result
7461 // is rounded up, hence we get an upper estimate of the TC.
// NOTE(review): the statements below use uint64_t arithmetic with divideCeil,
// not doubles; the remark about doubles may be stale -- confirm.
7462 unsigned IntVF = estimateElementCount(VF.Width, VScale);
// NOTE(review): Div is unsigned, so the subtraction wraps if VF.Cost exceeds
// ScalarC * IntVF -- presumably not reachable for a VF selected as
// profitable; confirm against callers.
7463 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
// A zero divisor yields MinTC1 = 0 rather than dividing.
7464 uint64_t MinTC1 =
7465 Div == 0 ? 0 : divideCeil(TotalCost.getValue() * IntVF, Div);
7466
7467 // Second, compute a minimum iteration count so that the cost of the
7468 // runtime checks is only a fraction of the total scalar loop cost. This
7469 // adds a loop-dependent bound on the overhead incurred if the runtime
7470 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
7471 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
7472 // cost, compute
7473 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
// X is fixed to 10 in the computation below.
7474 uint64_t MinTC2 = divideCeil(RtC.getValue() * 10, ScalarC);
7475
7476 // Now pick the larger minimum. If it is not a multiple of VF and an epilogue
7477 // is allowed, choose the next closest multiple of VF. This should partly
7478 // compensate for ignoring the epilogue cost.
7479 uint64_t MinTC = std::max(MinTC1, MinTC2);
7480 if (SEL == CM_EpilogueAllowed)
7481 MinTC = alignTo(MinTC, IntVF);
7483
7484 LLVM_DEBUG(
7485 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
7486 << VF.MinProfitableTripCount << "\n");
7487
7488 // Skip vectorization if the expected trip count is less than the minimum
7489 // required trip count.
7490 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
7491 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
7492 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
7493 "trip count < minimum profitable VF ("
7494 << *ExpectedTC << " < " << VF.MinProfitableTripCount
7495 << ")\n");
7496
7497 return false;
7498 }
7499 }
7500 return true;
7501}
7502
// Member initializers: each flag is taken from the corresponding field of Opts
// OR-ed with a second operand.
// NOTE(review): the embedded listing numbers skip 7503, 7505 and 7507, so the
// constructor's name line and the right-hand operands of both '||'
// expressions are not visible in this listing; confirm against the full
// source.
7504 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7506 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7508
7509/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
7510/// vectorization.
/// \return the ResumeForEpilogue instructions created for operand 0 of each
/// phi in the scalar header of \p MainPlan, in phi order.
/// NOTE(review): the embedded listing numbers skip 7511-7512, 7530 and 7546,
/// so the signature, one condition inside the freeze helper and the
/// initializer of Term are not visible in this listing.
7513 using namespace VPlanPatternMatch;
7514 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
7515 // introduce multiple uses of undef/poison. If the reduction start value may
7516 // be undef or poison it needs to be frozen and the frozen start has to be
7517 // used when computing the reduction result. We also need to use the frozen
7518 // value in the resume phi generated by the main vector loop, as this is also
7519 // used to compute the reduction result after the epilogue vector loop.
7520 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
7521 bool UpdateResumePhis) {
7522 VPBuilder Builder(Plan.getEntry());
7523 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
7524 auto *VPI = dyn_cast<VPInstruction>(&R);
7525 if (!VPI)
7526 continue;
7527 VPValue *OrigStart;
7528 if (!matchFindIVResult(VPI, m_VPValue(), m_VPValue(OrigStart)))
7529 continue;
7531 continue;
7532 VPInstruction *Freeze =
7533 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
// Operand 2 of the matched instruction is replaced by the frozen start.
7534 VPI->setOperand(2, Freeze);
// Optionally redirect the VPPhi users of the original start to the frozen
// value; the Freeze itself is excluded so it keeps its own operand.
7535 if (UpdateResumePhis)
7536 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
7537 return Freeze != &U && isa<VPPhi>(&U);
7538 });
7539 }
7540 };
// Only the main plan also gets its phi users of the start value updated.
7541 AddFreezeForFindLastIVReductions(MainPlan, true);
7542 AddFreezeForFindLastIVReductions(EpiPlan, false);
7543
// Recover the vector trip count as the second operand of the BranchOnCount
// matched on Term.
7544 VPValue *VectorTC = nullptr;
7545 auto *Term =
7547 [[maybe_unused]] bool MatchedTC =
7548 match(Term, m_BranchOnCount(m_VPValue(), m_VPValue(VectorTC)));
7549 assert(MatchedTC && "must match vector trip count");
7550
7551 // If there is a suitable resume value for the canonical induction in the
7552 // scalar (which will become vector) epilogue loop, use it and move it to the
7553 // beginning of the scalar preheader. Otherwise create it below.
7554 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
// A suitable phi is one whose incoming values are (VectorTC, 0).
7555 auto ResumePhiIter =
7556 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
7557 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
7558 m_ZeroInt()));
7559 });
7560 VPPhi *ResumePhi = nullptr;
7561 if (ResumePhiIter == MainScalarPH->phis().end()) {
7562 Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(VectorTC);
7563 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
7564 ResumePhi = ScalarPHBuilder.createScalarPhi(
7565 {VectorTC, MainPlan.getZero(Ty)}, {}, "vec.epilog.resume.val");
7566 } else {
7567 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
7568 ResumePhi->setName("vec.epilog.resume.val");
7569 if (&MainScalarPH->front() != ResumePhi)
7570 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
7571 }
7572
7573 // Create a ResumeForEpilogue for the canonical IV resume as the
7574 // first non-phi, to keep it alive for the epilogue.
7575 VPBuilder ResumeBuilder(MainScalarPH);
7576 ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
7577
7578 // Create ResumeForEpilogue instructions for the resume phis of the
7579 // VPIRPhis in the scalar header of the main plan and return them so they can
7580 // be used as resume values when vectorizing the epilogue.
7581 return to_vector(
7582 map_range(MainPlan.getScalarHeader()->phis(), [&](VPRecipeBase &R) {
7583 assert(isa<VPIRPhi>(R) &&
7584 "only VPIRPhis expected in the scalar header");
7585 return ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue,
7586 R.getOperand(0));
7587 }));
7588}
7589
7590/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
7591/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
7592/// reductions require creating new instructions to compute the resume values.
7593/// They are collected in a vector and returned. They must be moved to the
7594/// preheader of the vector epilogue loop, after created by the execution of \p
7595/// Plan.
7597 VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
7599 VFSelectionContext &Config, ScalarEvolution &SE) {
7600 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7601 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
7602 Header->setName("vec.epilog.vector.body");
7603
7604 VPValue *IV = VectorLoop->getCanonicalIV();
7605 // When vectorizing the epilogue loop, the canonical induction needs to start
7606 // at the resume value from the main vector loop. Find the resume value
7607 // created during execution of the main VPlan. Add this resume value as an
7608 // offset to the canonical IV of the epilogue loop.
7609 using namespace llvm::PatternMatch;
7610 VPInstruction *ResumeForEpilogue =
7612 Value *EPResumeVal = ResumeForEpilogue->getUnderlyingValue();
7613 if (auto *ResumePhi = dyn_cast<PHINode>(EPResumeVal)) {
7614 for (Value *Inc : ResumePhi->incoming_values()) {
7615 if (match(Inc, m_SpecificInt(0)))
7616 continue;
7617 assert(!EPI.VectorTripCount &&
7618 "Must only have a single non-zero incoming value");
7619 EPI.VectorTripCount = Inc;
7620 }
7621 // If we didn't find a non-zero vector trip count, all incoming values
7622 // must be zero, which also means the vector trip count is zero.
7623 if (!EPI.VectorTripCount) {
7624 assert(ResumePhi->getNumIncomingValues() > 0 &&
7625 all_of(ResumePhi->incoming_values(), match_fn(m_SpecificInt(0))) &&
7626 "all incoming values must be 0");
7627 EPI.VectorTripCount = ResumePhi->getIncomingValue(0);
7628 }
7629 } else {
7630 EPI.VectorTripCount = EPResumeVal;
7631 }
7632 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
7633 assert(all_of(IV->users(),
7634 [](const VPUser *U) {
7635 if (isa<VPScalarIVStepsRecipe, VPDerivedIVRecipe>(U))
7636 return true;
7637 unsigned Opc = cast<VPInstruction>(U)->getOpcode();
7638 return Instruction::isCast(Opc) || Opc == Instruction::Add;
7639 }) &&
7640 "the canonical IV should only be used by its increment or "
7641 "ScalarIVSteps when resetting the start value");
7642 VPBuilder Builder(Header, Header->getFirstNonPhi());
7643 VPInstruction *Add = Builder.createAdd(IV, VPV);
7644 // Replace all users of the canonical IV and its increment with the offset
7645 // version, except for the Add itself and the canonical IV increment.
7647 assert(Increment && "Must have a canonical IV increment at this point");
7648 IV->replaceUsesWithIf(Add, [Add, Increment](VPUser &U, unsigned) {
7649 return &U != Add && &U != Increment;
7650 });
7651 VPInstruction *OffsetIVInc =
7653 Increment->replaceAllUsesWith(OffsetIVInc);
7654 OffsetIVInc->setOperand(0, Increment);
7655
7657 SmallVector<Instruction *> InstsToMove;
7658 // Ensure that the start values for all header phi recipes are updated before
7659 // vectorizing the epilogue loop.
7660 for (VPRecipeBase &R : Header->phis()) {
7661 Value *ResumeV = nullptr;
7662 // TODO: Move setting of resume values to prepareToExecute.
7663 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
7664 // Find the reduction result by searching users of the phi or its backedge
7665 // value.
7666 auto IsReductionResult = [](VPRecipeBase *R) {
7667 auto *VPI = dyn_cast<VPInstruction>(R);
7668 return VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult;
7669 };
7670 auto *RdxResult = cast<VPInstruction>(
7671 vputils::findRecipe(ReductionPhi->getBackedgeValue(), IsReductionResult));
7672 assert(RdxResult && "expected to find reduction result");
7673
7674 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
7675 ->getIncomingValueForBlock(L->getLoopPreheader());
7676
7677 // Check for FindIV pattern by looking for icmp user of RdxResult.
7678 // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
7679 using namespace VPlanPatternMatch;
7680 VPValue *SentinelVPV = nullptr;
7681 bool IsFindIV = any_of(RdxResult->users(), [&](VPUser *U) {
7682 return match(U, VPlanPatternMatch::m_SpecificICmp(
7683 ICmpInst::ICMP_NE, m_Specific(RdxResult),
7684 m_VPValue(SentinelVPV)));
7685 });
7686
7687 RecurKind RK = ReductionPhi->getRecurrenceKind();
7688 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || IsFindIV) {
7689 auto *ResumePhi = cast<PHINode>(ResumeV);
7690 Value *StartV = ResumePhi->getIncomingValueForBlock(
7692 IRBuilder<> Builder(ResumePhi->getParent(),
7693 ResumePhi->getParent()->getFirstNonPHIIt());
7694
7696 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
7697 // start value; compare the final value from the main vector loop
7698 // to the start value.
7699 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
7700 if (auto *I = dyn_cast<Instruction>(ResumeV))
7701 InstsToMove.push_back(I);
7702 } else {
7703 assert(SentinelVPV && "expected to find icmp using RdxResult");
7704 if (auto *FreezeI = dyn_cast<FreezeInst>(StartV))
7705 ToFrozen[FreezeI->getOperand(0)] = StartV;
7706
7707 // Adjust resume: select(icmp eq ResumeV, StartV), Sentinel, ResumeV
7708 Value *Cmp = Builder.CreateICmpEQ(ResumeV, StartV);
7709 if (auto *I = dyn_cast<Instruction>(Cmp))
7710 InstsToMove.push_back(I);
7711 ResumeV = Builder.CreateSelect(Cmp, SentinelVPV->getLiveInIRValue(),
7712 ResumeV);
7713 if (auto *I = dyn_cast<Instruction>(ResumeV))
7714 InstsToMove.push_back(I);
7715 }
7716 } else {
7717 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
7718 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7719 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
7721 "unexpected start value");
7722 // Partial sub-reductions always start at 0 and account for the
7723 // reduction start value in a final subtraction. Update it to use the
7724 // resume value from the main vector loop.
7725 if (PhiR->getVFScaleFactor() > 1 &&
7727 PhiR->getRecurrenceKind())) {
7728 auto *Sub = cast<VPInstruction>(RdxResult->getSingleUser());
7729 assert((Sub->getOpcode() == Instruction::Sub ||
7730 Sub->getOpcode() == Instruction::FSub) &&
7731 "Unexpected opcode");
7732 assert(isa<VPIRValue>(Sub->getOperand(0)) &&
7733 "Expected operand to match the original start value of the "
7734 "reduction");
7735 // For integer sub-reductions, verify start value is zero.
7736 // For FP sub-reductions, verify start value is negative zero.
7737 [[maybe_unused]] auto StartValueIsIdentity = [&] {
7738 Value *IdentityValue = getRecurrenceIdentity(
7739 PhiR->getRecurrenceKind(), ResumeV->getType(),
7740 PhiR->getFastMathFlags());
7741 auto *StartValue = dyn_cast<VPIRValue>(VPI->getOperand(0));
7742 return StartValue && StartValue->getValue() == IdentityValue;
7743 };
7744 assert(StartValueIsIdentity() &&
7745 "Expected start value for partial sub-reduction to be zero "
7746 "(or negative zero)");
7747
7748 Sub->setOperand(0, StartVal);
7749 } else
7750 VPI->setOperand(0, StartVal);
7751 continue;
7752 }
7753 }
7754 } else {
7755 // Retrieve the induction resume values for wide inductions from
7756 // their original phi nodes in the scalar loop.
7757 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
7758 // Hook up to the PHINode generated by a ResumePhi recipe of main
7759 // loop VPlan, which feeds the scalar loop.
7760 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
7761 }
7762 assert(ResumeV && "Must have a resume value");
7763 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
7764 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
7765 }
7766
7767 // For some VPValues in the epilogue plan we must re-use the generated IR
7768 // values from the main plan. Replace them with live-in VPValues.
7769 // TODO: This is a workaround needed for epilogue vectorization and it
7770 // should be removed once induction resume value creation is done
7771 // directly in VPlan.
7772 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
7773 // Re-use frozen values from the main plan for Freeze VPInstructions in the
7774 // epilogue plan. This ensures all users use the same frozen value.
7775 auto *VPI = dyn_cast<VPInstruction>(&R);
7776 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
7778 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
7779 continue;
7780 }
7781
7782 // Re-use the trip count and steps expanded for the main loop, as
7783 // skeleton creation needs it as a value that dominates both the scalar
7784 // and vector epilogue loops
7785 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
7786 if (!ExpandR)
7787 continue;
7788 VPValue *ExpandedVal =
7789 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
7790 ExpandR->replaceAllUsesWith(ExpandedVal);
7791 if (Plan.getTripCount() == ExpandR)
7792 Plan.resetTripCount(ExpandedVal);
7793 ExpandR->eraseFromParent();
7794 }
7795
7796 auto VScale = Config.getVScaleForTuning();
7797 unsigned MainLoopStep =
7798 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7799 unsigned EpilogueLoopStep =
7800 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7804 EPI.EpilogueVF, EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
7805
7806 return InstsToMove;
7807}
7808
/// Patch phis feeding the scalar loop so that they carry values for the edges
/// that involve the bypass block.
///
/// Step 1: for every phi in the preheader of \p L and every predecessor of that
/// preheader the phi has no entry for, add an entry whose value is the one the
/// phi already has for BypassBlock.
/// Step 2: only if the scalar preheader of \p BestEpiPlan has predecessors,
/// walk \p ResumeValues in lock-step with the scalar header phis and copy the
/// BypassBlock incoming value of each main-loop resume phi into the matching
/// epilogue resume phi.
// NOTE(review): the declarator line (listing line 7810) is absent from this
// listing. The body reads `BypassBlock` and `L`, and the call site in this
// file passes (block, loop, plan, resume values), so that line presumably
// names the function and declares those two parameters -- confirm.
7809static void
7811 VPlan &BestEpiPlan,
7812 ArrayRef<VPInstruction *> ResumeValues) {
7813 // Fix resume values from the additional bypass block.
7814 BasicBlock *PH = L->getLoopPreheader();
7815 for (auto *Pred : predecessors(PH)) {
7816 for (PHINode &Phi : PH->phis()) {
// A phi that already has an entry for Pred needs nothing.
7817 if (Phi.getBasicBlockIndex(Pred) != -1)
7818 continue;
// Otherwise reuse the value the phi already has for BypassBlock.
7819 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
7820 }
7821 }
7822 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
7823 if (ScalarPH->hasPredecessors()) {
7824 // Fix resume values for inductions and reductions from the additional
7825 // bypass block using the incoming values from the main loop's resume phis.
7826 // ResumeValues correspond 1:1 with the scalar loop header phis.
7827 for (auto [ResumeV, HeaderPhi] :
7828 zip(ResumeValues, BestEpiPlan.getScalarHeader()->phis())) {
7829 auto *HeaderPhiR = cast<VPIRPhi>(&HeaderPhi);
// The header phi's incoming value from PH is cast to a PHINode: this is the
// epilogue's resume phi.
7830 auto *EpiResumePhi =
7831 cast<PHINode>(HeaderPhiR->getIRPhi().getIncomingValueForBlock(PH));
// Resume phis without an entry for BypassBlock are left alone.
7832 if (EpiResumePhi->getBasicBlockIndex(BypassBlock) == -1)
7833 continue;
// cast<> (not dyn_cast<>): the underlying value of each resume value is
// required to be a PHINode.
7834 auto *MainResumePhi = cast<PHINode>(ResumeV->getUnderlyingValue());
7835 EpiResumePhi->setIncomingValueForBlock(
7836 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7837 }
7838 }
7839}
7840
7841/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
7842/// loop, after both plans have executed, updating branches from the iteration
7843/// and runtime checks of the main loop, as well as updating various phis. \p
7844/// InstsToMove contains instructions that need to be moved to the preheader of
7845/// the epilogue vector loop.
/// \p ResumeValues is forwarded unchanged to fixScalarResumeValuesFromBypass.
/// \p DT is kept up to date eagerly for every redirected edge.
7846static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
// NOTE(review): listing line 7847 is absent here; the body uses `EPI`, so that
// line presumably declares it as a parameter -- confirm against the full file.
7848 DominatorTree *DT,
7849 GeneratedRTChecks &Checks,
7850 ArrayRef<Instruction *> InstsToMove,
7851 ArrayRef<VPInstruction *> ResumeValues) {
// The IR block behind the epilogue plan's entry is the epilogue's
// iteration-count check.
7852 BasicBlock *VecEpilogueIterationCountCheck =
7853 cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
7854
// The epilogue preheader is read off as successor 1 of that block's
// conditional branch.
7855 BasicBlock *VecEpiloguePreHeader =
7856 cast<CondBrInst>(VecEpilogueIterationCountCheck->getTerminator())
7857 ->getSuccessor(1);
7858 // Adjust the control flow taking the state info from the main loop
7859 // vectorization into account.
// NOTE(review): listing line 7860 is absent here; it presumably opens the
// assert whose message is on the next line.
7861 "expected this to be saved from the previous pass.");
7862 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
7863
7864 // Helper to redirect an edge from \p BB to \p VecEpilogueIterationCountCheck
7865 // to \p NewSucc instead, updating the DomTree.
7866 auto RedirectEdge = [&](BasicBlock *BB, BasicBlock *NewSucc) {
7867 BB->getTerminator()->replaceUsesOfWith(VecEpilogueIterationCountCheck,
7868 NewSucc);
7869 DTU.applyUpdates(
7870 {{DominatorTree::Delete, BB, VecEpilogueIterationCountCheck},
7871 {DominatorTree::Insert, BB, NewSucc}});
7872 };
7873
// The main loop's iteration-count check now branches to the epilogue
// preheader instead of the epilogue's iteration-count check.
7874 RedirectEdge(EPI.MainLoopIterationCountCheck, VecEpiloguePreHeader);
7875
// The epilogue iteration-count check recorded in EPI now branches to the
// scalar preheader instead.
7876 BasicBlock *ScalarPH =
7877 cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
7878 RedirectEdge(EPI.EpilogueIterationCountCheck, ScalarPH);
7879
7880 // Adjust the terminators of runtime check blocks and phis using them.
// Either check block may be null; only existing blocks are redirected.
7881 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
7882 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
7883 if (SCEVCheckBlock)
7884 RedirectEdge(SCEVCheckBlock, ScalarPH);
7885 if (MemCheckBlock)
7886 RedirectEdge(MemCheckBlock, ScalarPH);
7887
7888 // The vec.epilog.iter.check block may contain Phi nodes from inductions
7889 // or reductions which merge control-flow from the latch block and the
7890 // middle block. Update the incoming values here and move the Phi into the
7891 // preheader.
// The phis are collected up front because the loop below moves each one out
// of this block.
7892 SmallVector<PHINode *, 4> PhisInBlock(
7893 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
7894
7895 for (PHINode *Phi : PhisInBlock) {
7896 Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
// Now that the phi lives in the preheader, the entry that named the check
// block's single predecessor is re-labelled to name the check block itself.
7897 Phi->replaceIncomingBlockWith(
7898 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7899 VecEpilogueIterationCountCheck);
7900
7901 // If the phi doesn't have an incoming value from the
7902 // EpilogueIterationCountCheck, we are done. Otherwise remove the
7903 // incoming value and also those from other check blocks. This is needed
7904 // for reduction phis only.
7905 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7906 return EPI.EpilogueIterationCountCheck == IncB;
7907 }))
7908 continue;
7909 for (BasicBlock *BB :
7910 {EPI.EpilogueIterationCountCheck, SCEVCheckBlock, MemCheckBlock}) {
7911 if (BB)
7912 Phi->removeIncomingValue(BB);
7913 }
7914 }
7915
// Place the caller-supplied instructions in the epilogue preheader, before
// its first non-phi instruction.
7916 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
7917 for (auto *I : InstsToMove)
7918 I->moveBefore(IP);
7919
7920 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
7921 // after executing the main loop. We need to update the resume values of
7922 // inductions and reductions during epilogue vectorization.
7923 fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
7924 ResumeValues);
7925
7926 // Remove dead phis that were moved to the epilogue preheader but are unused
7927 // (e.g., resume phis for inductions not widened in the epilogue vector loop).
// make_early_inc_range keeps the iteration valid while phis are erased.
7928 for (PHINode &Phi : make_early_inc_range(VecEpiloguePreHeader->phis()))
7929 if (Phi.use_empty())
7930 Phi.eraseFromParent();
7931}
7932
// NOTE(review): the declarator (listing line 7933) is absent from this
// listing. This is presumably the body of processLoop, which is called with a
// Loop* later in this file -- confirm.
//
// Decides whether and how loop L is vectorized and/or interleaved and, when it
// is, executes the selected plan(s). Every bail-out below returns false; true
// is returned only after a plan has been executed.
//
// NOTE(review): a number of listing lines inside this body are absent as well
// (gaps in the embedded numbering, e.g. 7946, 7977, 7984, 8017, 8043), so the
// statements that span those gaps are incomplete as shown.
7934 assert((EnableVPlanNativePath || L->isInnermost()) &&
7935 "VPlan-native path is not enabled. Only process inner loops.");
7936
7937 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
7938 << L->getHeader()->getParent()->getName() << "' from "
7939 << L->getLocStr() << "\n");
7940
7941 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
7942
7943 LLVM_DEBUG(
7944 dbgs() << "LV: Loop hints:"
7945 << " force="
7947 ? "disabled"
7949 ? "enabled"
7950 : "?"))
7951 << " width=" << Hints.getWidth()
7952 << " interleave=" << Hints.getInterleave() << "\n");
7953
7954 // Function containing loop
7955 Function *F = L->getHeader()->getParent();
7956
7957 // Looking at the diagnostic output is the only way to determine if a loop
7958 // was vectorized (other than looking at the IR or machine code), so it
7959 // is important to generate an optimization remark for each loop. Most of
7960 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7961 // generated as OptimizationRemark and OptimizationRemarkMissed are
7962 // less verbose reporting vectorized loops and unvectorized loops that may
7963 // benefit from vectorization, respectively.
7964
7965 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7966 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7967 return false;
7968 }
7969
7970 PredicatedScalarEvolution PSE(*SE, *L);
7971
7972 // Query this against the original loop and save it here because the profile
7973 // of the original loop header may change as the transformation happens.
// BFI is only requested when PSI is non-null and has a profile summary.
7974 bool OptForSize = llvm::shouldOptimizeForSize(
7975 L->getHeader(), PSI,
7976 PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
7978
7979 // Check if it is legal to vectorize the loop.
7980 LoopVectorizationRequirements Requirements;
7981 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
7982 &Requirements, &Hints, DB, AC,
7983 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
7985 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7986 Hints.emitRemarkWithHints();
7987 return false;
7988 }
7989
7990 bool IsInnerLoop = L->isInnermost();
7991
7992 // Outer loops require a computable trip count.
7993 if (!IsInnerLoop && isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
7994 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7995 return false;
7996 }
7997
7998 if (LVL.hasUncountableEarlyExit()) {
8000 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
8001 "early exit is not enabled",
8002 "UncountableEarlyExitLoopsDisabled", ORE, L);
8003 return false;
8004 }
8005 }
8006
8007 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8008 bool UseInterleaved =
8009 IsInnerLoop && TTI->enableInterleavedAccessVectorization();
8010
8011 // If an override option has been passed in for interleaved accesses, use it.
8012 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8013 UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses;
8014
8015 // Analyze interleaved memory accesses.
8016 if (UseInterleaved)
8018
8019 if (LVL.hasUncountableEarlyExit()) {
8020 BasicBlock *LoopLatch = L->getLoopLatch();
// Bail out if interleave groups need a scalar epilogue or if any countable
// exiting block is not the latch.
8021 if (IAI.requiresScalarEpilogue() ||
8022 any_of(LVL.getCountableExitingBlocks(), not_equal_to(LoopLatch))) {
8023 reportVectorizationFailure("Auto-vectorization of early exit loops "
8024 "requiring a scalar epilogue is unsupported",
8025 "UncountableEarlyExitUnsupported", ORE, L);
8026 return false;
8027 }
8028 }
8029
8030 // Check the function attributes and profiles to find out if this function
8031 // should be optimized for size.
8032 EpilogueLowering SEL =
8033 getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI);
8034
8035 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8036 // count by optimizing for size, to minimize overheads.
8037 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
8038 if (ExpectedTC && ExpectedTC->isFixed() &&
8039 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
8040 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8041 << "This loop is worth vectorizing only if no scalar "
8042 << "iteration overheads are incurred.");
8044 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8045 else {
8046 LLVM_DEBUG(dbgs() << "\n");
8047 // Tail-folded loops are efficient even when the loop
8048 // iteration count is low. However, setting the epilogue policy to
8049 // `CM_EpilogueNotAllowedLowTripLoop` prevents vectorizing loops
8050 // with runtime checks. It's more effective to let
8051 // `isOutsideLoopWorkProfitable` determine if vectorization is
8052 // beneficial for the loop.
8055 }
8056 }
8057
8058 // Check the function attributes to see if implicit floats or vectors are
8059 // allowed.
8060 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8062 "Can't vectorize when the NoImplicitFloat attribute is used",
8063 "loop not vectorized due to NoImplicitFloat attribute",
8064 "NoImplicitFloat", ORE, L);
8065 Hints.emitRemarkWithHints();
8066 return false;
8067 }
8068
8069 // Check if the target supports potentially unsafe FP vectorization.
8070 // FIXME: Add a check for the type of safety issue (denormal, signaling)
8071 // for the target we're vectorizing for, to make sure none of the
8072 // additional fp-math flags can help.
8073 if (Hints.isPotentiallyUnsafe() &&
8074 TTI->isFPVectorizationPotentiallyUnsafe()) {
8076 "Potentially unsafe FP op prevents vectorization",
8077 "loop not vectorized due to unsafe FP support.",
8078 "UnsafeFP", ORE, L);
8079 Hints.emitRemarkWithHints();
8080 return false;
8081 }
8082
8083 bool AllowOrderedReductions;
8084 // If the flag is set, use that instead and override the TTI behaviour.
8085 if (ForceOrderedReductions.getNumOccurrences() > 0)
8086 AllowOrderedReductions = ForceOrderedReductions;
8087 else
8088 AllowOrderedReductions = TTI->enableOrderedReductions();
8089 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
8090 ORE->emit([&]() {
8091 auto *ExactFPMathInst = Requirements.getExactFPInst();
8092 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
8093 ExactFPMathInst->getDebugLoc(),
8094 ExactFPMathInst->getParent())
8095 << "loop not vectorized: cannot prove it is safe to reorder "
8096 "floating-point operations";
8097 });
8098 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
8099 "reorder floating-point operations\n");
8100 Hints.emitRemarkWithHints();
8101 return false;
8102 }
8103
8104 // Use the cost model.
8105 VFSelectionContext Config(*TTI, &LVL, L, *F, PSE, DB, ORE, &Hints,
8106 OptForSize);
8107 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, AC, ORE,
8108 GetBFI, F, &Hints, IAI, Config);
8109 // Use the planner for vectorization.
8110 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, Config, IAI, PSE,
8111 Hints, ORE);
8112
8113 EpilogueLowering EpilogueTailLoweringStatus =
8115 if (EpilogueTailLoweringStatus ==
8117 // TODO: Apply tail-folding on the vectorized epilogue loop.
8118 LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is not supported yet\n");
8120 "The epilogue-tail-folding policy prefer-fold-tail is not supported "
8121 "yet, fall back to a normal epilogue",
8122 "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
8123 }
8124
8125 // Get user vectorization factor and interleave count.
8126 ElementCount UserVF = Hints.getWidth();
8127 unsigned UserIC = Hints.getInterleave();
8128 // Outer loops don't have LoopAccessInfo, so skip the safety check and reset
8129 // UserIC (interleaving is not supported for outer loops).
// For inner loops, a user count above 1 is dropped to 1 when LVL reports the
// loop is not safe for any vector width; the "InterleavingUnsafe" diagnostic
// further down reports that case.
8130 if (!IsInnerLoop)
8131 UserIC = 0;
8132 else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
8133 UserIC = 1;
8134
8135 // Plan how to best vectorize.
8136 LVP.plan(UserVF, UserIC);
8137 auto [VF, BestPlanPtr] = LVP.computeBestVF();
8138 unsigned IC = 1;
8139
8140 // For VPlan build stress testing of outer loops, bail after plan
8141 // construction.
8142 if (!IsInnerLoop && VPlanBuildOuterloopStressTest)
8143 return false;
8144
8145 if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME))
8147
8148 GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
8149 if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) {
8150 // Select the interleave count.
8151 IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost);
8152
// Runtime checks are created for the larger of the computed and the
// user-requested interleave count.
8153 unsigned SelectedIC = std::max(IC, UserIC);
8154 // Optimistically generate runtime checks if they are needed. Drop them if
8155 // they turn out to not be profitable.
8156 if (VF.Width.isVector() || SelectedIC > 1) {
8157 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
8158 *ORE);
8159
8160 // Bail out early if either the SCEV or memory runtime checks are known to
8161 // fail. In that case, the vector loop would never execute.
8162 using namespace llvm::PatternMatch;
8163 if (Checks.getSCEVChecks().first &&
8164 match(Checks.getSCEVChecks().first, m_One()))
8165 return false;
8166 if (Checks.getMemRuntimeChecks().first &&
8167 match(Checks.getMemRuntimeChecks().first, m_One()))
8168 return false;
8169 }
8170
8171 // Check if it is profitable to vectorize with runtime checks.
8172 bool ForceVectorization =
8174 VPCostContext CostCtx(CM.TTI, *CM.TLI, *BestPlanPtr, CM, Config.CostKind,
8175 CM.PSE, L);
8176 if (!ForceVectorization &&
8177 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, *BestPlanPtr,
8178 SEL, Config.getVScaleForTuning())) {
8179 ORE->emit([&]() {
8181 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
8182 L->getHeader())
8183 << "loop not vectorized: cannot prove it is safe to reorder "
8184 "memory operations";
8185 });
8186 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8187 Hints.emitRemarkWithHints();
8188 return false;
8189 }
8190 }
8191
8192 // Identify the diagnostic messages that should be produced.
// Both flags start out true and are cleared by the cases below.
8193 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8194 bool VectorizeLoop = true, InterleaveLoop = true;
8195 if (VF.Width.isScalar()) {
8196 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8197 VecDiagMsg = {
8198 "VectorizationNotBeneficial",
8199 "the cost-model indicates that vectorization is not beneficial"};
8200 VectorizeLoop = false;
8201 }
8202
8203 if (UserIC == 1 && Hints.getInterleave() > 1) {
8205 "UserIC should only be ignored due to unsafe dependencies");
8206 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
8207 IntDiagMsg = {"InterleavingUnsafe",
8208 "Ignoring user-specified interleave count due to possibly "
8209 "unsafe dependencies in the loop."};
8210 InterleaveLoop = false;
8211 } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
8212 // Tell the user interleaving was avoided up-front, despite being explicitly
8213 // requested.
8214 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8215 "interleaving should be avoided up front\n");
8216 IntDiagMsg = {"InterleavingAvoided",
8217 "Ignoring UserIC, because interleaving was avoided up front"};
8218 InterleaveLoop = false;
8219 } else if (IC == 1 && UserIC <= 1) {
8220 // Tell the user interleaving is not beneficial.
8221 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8222 IntDiagMsg = {
8223 "InterleavingNotBeneficial",
8224 "the cost-model indicates that interleaving is not beneficial"};
8225 InterleaveLoop = false;
8226 if (UserIC == 1) {
8227 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8228 IntDiagMsg.second +=
8229 " and is explicitly disabled or interleave count is set to 1";
8230 }
8231 } else if (IC > 1 && UserIC == 1) {
8232 // Tell the user interleaving is beneficial, but is explicitly disabled.
8233 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
8234 "disabled.\n");
8235 IntDiagMsg = {"InterleavingBeneficialButDisabled",
8236 "the cost-model indicates that interleaving is beneficial "
8237 "but is explicitly disabled or interleave count is set to 1"};
8238 InterleaveLoop = false;
8239 }
8240
8241 // If there is a histogram in the loop, do not just interleave without
8242 // vectorizing. The order of operations will be incorrect without the
8243 // histogram intrinsics, which are only used for recipes with VF > 1.
8244 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
8245 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
8246 << "to histogram operations.\n");
8247 IntDiagMsg = {
8248 "HistogramPreventsScalarInterleaving",
8249 "Unable to interleave without vectorization due to constraints on "
8250 "the order of histogram operations"};
8251 InterleaveLoop = false;
8252 }
8253
8254 // Override IC if user provided an interleave count.
8255 IC = UserIC > 0 ? UserIC : IC;
8256
8257 // Emit diagnostic messages, if any.
8258 if (!VectorizeLoop && !InterleaveLoop) {
8259 // Do not vectorize or interleave the loop.
8260 ORE->emit([&]() {
8261 return OptimizationRemarkMissed(LV_NAME, VecDiagMsg.first,
8262 L->getStartLoc(), L->getHeader())
8263 << VecDiagMsg.second;
8264 });
8265 ORE->emit([&]() {
8266 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8267 L->getStartLoc(), L->getHeader())
8268 << IntDiagMsg.second;
8269 });
8270 return false;
8271 }
8272
8273 if (!VectorizeLoop && InterleaveLoop) {
8274 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8275 ORE->emit([&]() {
8276 return OptimizationRemarkAnalysis(LV_NAME, VecDiagMsg.first,
8277 L->getStartLoc(), L->getHeader())
8278 << VecDiagMsg.second;
8279 });
8280 } else if (VectorizeLoop && !InterleaveLoop) {
8281 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8282 << ") in " << L->getLocStr() << '\n');
8283 ORE->emit([&]() {
8284 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8285 L->getStartLoc(), L->getHeader())
8286 << IntDiagMsg.second;
8287 });
8288 } else if (VectorizeLoop && InterleaveLoop) {
8289 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8290 << ") in " << L->getLocStr() << '\n');
8291 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8292 }
8293
8294 // Report the vectorization decision.
8295 if (VF.Width.isScalar()) {
8296 using namespace ore;
// With a scalar VF the only transformation left is interleaving, so the
// interleave count must exceed 1 here.
8297 assert(IC > 1);
8298 ORE->emit([&]() {
8299 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8300 L->getHeader())
8301 << "interleaved loop (interleaved count: "
8302 << NV("InterleaveCount", IC) << ")";
8303 });
8304 } else {
8305 // Report the vectorization decision.
8306 reportVectorization(ORE, L, VF, IC);
8307 }
8308 if (ORE->allowExtraAnalysis(LV_NAME))
8310
8311 // If we decided that it is *legal* to interleave or vectorize the loop, then
8312 // do it.
8313
8314 VPlan &BestPlan = *BestPlanPtr;
8315 // Consider vectorizing the epilogue too if it's profitable.
// A null EpiPlan selects the single-plan path in the else branch below.
8316 std::unique_ptr<VPlan> EpiPlan =
8317 LVP.selectBestEpiloguePlan(BestPlan, VF.Width, IC);
8318 bool HasBranchWeights =
8319 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
8320 if (EpiPlan) {
8321 VPlan &BestEpiPlan = *EpiPlan;
8322 VPlan &BestMainPlan = BestPlan;
8323 ElementCount EpilogueVF = BestEpiPlan.getSingleVF();
8324
8325 // The first pass vectorizes the main loop and creates a scalar epilogue
8326 // to be vectorized by executing the plan (potentially with a different
8327 // factor) again shortly afterwards.
8328 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
8329 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
8330 SmallVector<VPInstruction *> ResumeValues =
8331 preparePlanForMainVectorLoop(BestMainPlan, BestEpiPlan);
8332 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF, 1, BestEpiPlan);
8333
8334 // Add minimum iteration check for the epilogue plan, followed by runtime
8335 // checks for the main plan.
8336 LVP.addMinimumIterationCheck(BestMainPlan, EPI.EpilogueVF, EPI.EpilogueUF,
8338 LVP.attachRuntimeChecks(BestMainPlan, Checks, HasBranchWeights);
8340 EPI.MainLoopVF, EPI.MainLoopUF,
8342 HasBranchWeights ? MinItersBypassWeights : nullptr,
8343 L->getLoopPredecessor()->getTerminator()->getDebugLoc(),
8344 PSE);
8345
8346 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8347 Checks, BestMainPlan);
8348 auto ExpandedSCEVs = LVP.executePlan(
8349 EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT,
8351 ++LoopsVectorized;
8352
8353 // Derive EPI fields from VPlan-generated IR.
8354 BasicBlock *EntryBB =
8355 cast<VPIRBasicBlock>(BestMainPlan.getEntry())->getIRBasicBlock();
8356 EntryBB->setName("iter.check");
8357 EPI.EpilogueIterationCountCheck = EntryBB;
8358 // The check chain is: Entry -> [SCEV] -> [Mem] -> MainCheck -> VecPH.
8359 // MainCheck is the non-bypass successor of the last runtime check block
8360 // (or Entry if there are no runtime checks).
8361 BasicBlock *LastCheck = EntryBB;
8362 if (BasicBlock *MemBB = Checks.getMemRuntimeChecks().second)
8363 LastCheck = MemBB;
8364 else if (BasicBlock *SCEVBB = Checks.getSCEVChecks().second)
8365 LastCheck = SCEVBB;
8366 BasicBlock *ScalarPH = L->getLoopPreheader();
8367 auto *BI = cast<CondBrInst>(LastCheck->getTerminator());
// The comparison yields the successor index: 1 when successor 0 is ScalarPH,
// otherwise 0 -- i.e. the successor that is not ScalarPH.
// NOTE(review): the left-hand side of this statement (listing line 8368) is
// absent from this listing.
8369 BI->getSuccessor(BI->getSuccessor(0) == ScalarPH);
8370
8371 // Second pass vectorizes the epilogue and adjusts the control flow
8372 // edges from the first pass.
8373 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8374 Checks, BestEpiPlan);
8376 BestMainPlan, BestEpiPlan, L, ExpandedSCEVs, EPI, CM, Config,
8377 *PSE.getSE());
8378 LVP.attachRuntimeChecks(BestEpiPlan, Checks, HasBranchWeights);
8379 LVP.executePlan(
8380 EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
8382 connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
8383 ResumeValues);
8384 ++LoopsEpilogueVectorized;
8385 } else {
8386 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
8387 BestPlan);
8388 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
8389 VF.MinProfitableTripCount);
8390 LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
8391
8392 if (!IsInnerLoop)
8393 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
8394 << "\"\n");
8395 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
8396 ++LoopsVectorized;
8397 }
8398
// Assert-only sanity checks: the dominator tree must verify and
// verifyFunction must return false for the transformed function.
8399 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
8400 "DT not preserved correctly");
8401 assert(!verifyFunction(*F, &dbgs()));
8402
8403 return true;
8404}
8405
8407
// NOTE(review): the declarator (listing line 8406) is absent from this
// listing. The body returns LoopVectorizeResult and this file calls
// runImpl(F) for such a result, so this is presumably the body of runImpl --
// confirm.
//
// Simplifies every top-level loop, collects the supported loops into a
// worklist, and processes them one by one. The result carries two flags:
// whether anything changed and whether the CFG changed.
8408 // Don't attempt if
8409 // 1. the target claims to have no vector registers, and
8410 // 2. interleaving won't help ILP.
8411 //
8412 // The second condition is necessary because, even if the target has no
8413 // vector registers, loop vectorization may still enable scalar
8414 // interleaving.
8415 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8416 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
8417 return LoopVectorizeResult(false, false);
8418
8419 bool Changed = false, CFGChanged = false;
8420
8421 // The vectorizer requires loops to be in simplified form.
8422 // Since simplification may add new inner loops, it has to run before the
8423 // legality and profitability checks. This means running the loop vectorizer
8424 // will simplify all loops, regardless of whether anything ends up being
8425 // vectorized.
// Compound assignment groups right to left: CFGChanged is updated first and
// its new value is then folded into Changed.
8426 for (const auto &L : *LI)
8427 Changed |= CFGChanged |=
8428 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8429
8430 // Build up a worklist of inner-loops to vectorize. This is necessary as
8431 // the act of vectorizing or partially unrolling a loop creates new loops
8432 // and can invalidate iterators across the loops.
8433 SmallVector<Loop *, 8> Worklist;
8434
8435 for (Loop *L : *LI)
8436 collectSupportedLoops(*L, LI, ORE, Worklist);
8437
8438 LoopsAnalyzed += Worklist.size();
8439
8440 // Now walk the identified inner loops.
8441 while (!Worklist.empty()) {
8442 Loop *L = Worklist.pop_back_val();
8443
8444 // For the inner loops we actually process, form LCSSA to simplify the
8445 // transform.
8446 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8447
8448 Changed |= CFGChanged |= processLoop(L);
8449
// Changed is cumulative and never reset, so once any step has changed the IR
// this branch is taken after every remaining loop as well.
8450 if (Changed) {
8451 LAIs->clear();
8452
8453#ifndef NDEBUG
8454 if (VerifySCEV)
8455 SE->verify();
8456#endif
8457 }
8458 }
8459
8460 // All worklist loops have been processed; report what changed.
8461 return LoopVectorizeResult(Changed, CFGChanged);
8462}
8463
// NOTE(review): the declarator (listing lines 8464-8465) is absent from this
// listing. The body pulls analyses for F out of an analysis manager AM and
// returns PreservedAnalyses, so this is presumably the pass's run() entry
// point -- confirm.
//
// Returns PreservedAnalyses::all() when the function has no loops or when
// runImpl reports no change; otherwise returns the set built up in PA.
//
// NOTE(review): many listing lines inside this body are absent as well
// (8471-8478, 8484, 8489, 8493, 8497-8499, 8506-8507, 8509), including the
// declaration of PA and the bodies of the lambda, the inner for loop and both
// branches of the final if/else.
8466 LI = &AM.getResult<LoopAnalysis>(F);
8467 // There are no loops in the function. Return before computing other
8468 // expensive analyses.
8469 if (LI->empty())
8470 return PreservedAnalyses::all();
8479 AA = &AM.getResult<AAManager>(F);
8480
8481 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
// Only a cached ProfileSummaryAnalysis result is looked up here
// (getCachedResult, not getResult).
8482 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
// GetBFI is a callable returning BlockFrequencyInfo&; it captures AM and F by
// reference.
8483 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
8485 };
8486 LoopVectorizeResult Result = runImpl(F);
8487 if (!Result.MadeAnyChange)
8488 return PreservedAnalyses::all();
8490
8491 if (isAssignmentTrackingEnabled(*F.getParent())) {
8492 for (auto &BB : F)
8494 }
8495
8496 PA.preserve<LoopAnalysis>();
8500
8501 if (Result.MadeCFGChange) {
8502 // Making CFG changes likely means a loop got vectorized. Indicate that
8503 // extra simplification passes should be run.
8504 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
8505 // be run if runtime checks have been added.
8508 } else {
8510 }
8511 return PA;
8512}
8513
// NOTE(review): the first declarator line (listing line 8514) is absent from
// this listing; the delegation below suggests this is the pass's
// printPipeline override -- confirm.
//
// Writes the pass's textual pipeline form to \p OS: first whatever the
// PassInfoMixin implementation prints, then the two options wrapped in '<'
// and '>'. Each option name is prefixed with "no-" when its flag is false and
// is followed by a ';'.
8515 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
// The cast selects the PassInfoMixin implementation of printPipeline.
8516 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
8517 OS, MapClassName2PassName);
8518
8519 OS << '<';
8520 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
8521 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
8522 OS << '>';
8523}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Lower Kernel Arguments
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI)
Definition CostModel.cpp:73
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
static bool hasNoUnsignedWrap(BinaryOperator &I)
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
#define LV_NAME
This file defines the LoopVectorizationLegality class.
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static unsigned getMaxTCFromNonZeroRange(PredicatedScalarEvolution &PSE, Loop *L)
Get the maximum trip count for L from the SCEV unsigned range, excluding zero from the range.
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan)
Returns true if the VPlan contains header phi recipes that are not currently supported for epilogue v...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove, ArrayRef< VPInstruction * > ResumeValues)
Connect the epilogue vector loop generated for EpiPlan to the main vector loop, after both plans have...
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< cl::boolOrDefault > ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden, cl::desc("Override cost based masked intrinsic widening " "for div/rem instructions"))
static void legacyCSE(BasicBlock *BB)
FIXME: This legacy common-subexpression-elimination routine is scheduled for removal,...
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB, VPlan *Plan=nullptr)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static Intrinsic::ID getMaskedDivRemIntrinsic(unsigned Opcode)
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
TailFoldingPolicyTy
Option tail-folding-policy controls the tail-folding strategy and lists all available options.
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< TailFoldingPolicyTy > EpilogueTailFoldingPolicy("epilogue-tail-folding-policy", cl::Hidden, cl::desc("Epilogue-tail-folding preferences over creating an epilogue loop."), cl::values(clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail", "Don't tail-fold loops."), clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail", "prefer tail-folding, otherwise create an epilogue when " "appropriate.")))
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static SmallVector< VPInstruction * > preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, const Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, VFSelectionContext &Config, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static void printOptimizedVPlan(VPlan &)
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true, bool CanExcludeZeroTrips=false)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static EpilogueLowering getEpilogueTailLowering(const LoopVectorizationCostModel &MainCM, const Loop *L, OptimizationRemarkEmitter *ORE)
Determine how to lower the epilogue for the vector epilogue loop.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static bool hasFindLastReductionPhi(VPlan &Plan)
Returns true if the VPlan contains a VPReductionPHIRecipe with FindLast recurrence kind.
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static cl::opt< TailFoldingPolicyTy > TailFoldingPolicy("tail-folding-policy", cl::init(TailFoldingPolicyTy::None), cl::Hidden, cl::desc("Tail-folding preferences over creating an epilogue loop."), cl::values(clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail", "Don't tail-fold loops."), clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail", "prefer tail-folding, otherwise create an epilogue when " "appropriate."), clEnumValN(TailFoldingPolicyTy::MustFoldTail, "must-fold-tail", "always tail-fold, don't attempt vectorization if " "tail-folding fails.")))
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, EpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
cl::opt< bool > VPlanBuildOuterloopStressTest("vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static EpilogueLowering getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, ArrayRef< VPInstruction * > ResumeValues)
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None)
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
Conditional Branch instruction.
BasicBlock * getSuccessor(unsigned i) const
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
LLVM_ABI APInt getUnsignedMax() const
Return the largest unsigned value contained in the ConstantRange.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getTemporary()
Definition DebugLoc.h:160
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:292
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Check, VPlan &Plan)
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
A struct for saving information about induction variables.
const SCEV * getStep() const
ArrayRef< Instruction * > getCastInsts() const
Returns an ArrayRef to the type cast instructions in the induction update chain, that are redundant w...
@ IK_PtrInduction
Pointer induction var. Step = C.
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
friend class LoopVectorizationPlanner
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, GeneratedRTChecks &RTChecks, VPlan &Plan)
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
DominatorTree * DT
Dominator Tree.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:378
The group of interleaved loads/stores sharing the same stride and close to each other.
auto members() const
Return an iterator range over the non-null members of this group, in index order.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool useWideActiveLaneMask() const
Returns true if the use of wide lane masks is requested and the loop is using tail-folding with a lan...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
BlockFrequencyInfo * BFI
The BlockFrequencyInfo returned from GetBFI.
BlockFrequencyInfo & getBFI()
Returns the BlockFrequencyInfo for the function if cached, otherwise fetches it via GetBFI.
bool isForcedScalar(Instruction *I, ElementCount VF) const
Returns true if I has been forced to be scalarized at VF.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool preferTailFoldedLoop() const
Returns true if tail-folding is preferred over an epilogue.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF)
Returns true if an artificially high cost for emulated masked memrefs should be used.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
bool isMaskRequired(Instruction *I) const
Wrapper function for LoopVectorizationLegality::isMaskRequired, that passes the Instruction I and if ...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
uint64_t getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB)
A helper function that returns how much we should divide the cost of a predicated block by.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return true if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, InstructionCost Cost)
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
bool isEpilogueAllowed() const
Returns true if an epilogue is allowed (e.g., not prevented by optsize or a loop hint annotation).
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason,...
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
@ CM_InvalidatedDecision
A widening decision that has been invalidated after replacing the corresponding recipe during VPlan t...
bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
LoopVectorizationCostModel(EpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, VFSelectionContext &Config)
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF)
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool isScalarWithPredication(Instruction *I, ElementCount VF)
Returns true if I is an instruction which requires predication and for which our chosen predication s...
std::function< BlockFrequencyInfo &()> GetBFI
A function to lazily fetch BlockFrequencyInfo.
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost MaskedCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
TailFoldingStyle getTailFoldingStyle() const
Returns the TailFoldingStyle that is best for the current loop.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
bool hasUncountableEarlyExit() const
Returns true if the loop has uncountable early exits, i.e.
bool hasHistograms() const
Returns true if the loop contains any known histogram operations.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, EpilogueVectorizationKind EpilogueVecKind=EpilogueVectorizationKind::None)
EpilogueVectorizationKind
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
@ MainLoop
Vectorizing the main loop of epilogue vectorization.
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1712
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1763
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const
Attach the runtime checks of RTChecks to Plan.
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1698
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1869
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
std::unique_ptr< VPlan > selectBestEpiloguePlan(VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC)
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
std::pair< VectorizationFactor, VPlan * > computeBestVF()
Compute and return the most profitable vectorization factor and the corresponding best VPlan.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
void emitRemarkWithHints() const
Dumps all the hint information.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:73
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:126
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(SCEVUse LHS, SCEVUse RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
iterator_range< op_iterator > op_range
Definition User.h:256
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
Holds state needed to make cost decisions before computing costs per-VF, including the maximum VFs.
const TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
std::optional< unsigned > getVScaleForTuning() const
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4256
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4283
iterator end()
Definition VPlan.h:4293
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4291
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4344
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override
Return the cost of this VPBasicBlock.
Definition VPlan.cpp:778
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
const VPRecipeBase & front() const
Definition VPlan.h:4303
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:661
bool empty() const
Definition VPlan.h:4302
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
void setName(const Twine &newName)
Definition VPlan.h:178
VPlan * getPlan()
Definition VPlan.cpp:211
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:226
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:267
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:295
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
T * insert(T *R)
Insert R at the current insertion point. Returns R unchanged.
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:559
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:532
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2395
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2437
void setBackedgeValue(VPValue *V)
Update the incoming value from the loop backedge.
Definition VPlan.h:2442
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2426
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2133
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4409
Class to record and manage LLVM IR flags.
Definition VPlan.h:696
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1227
iterator_range< operand_iterator > operandsWithoutMask()
Returns an iterator range over the operands excluding the mask operand if present.
Definition VPlan.h:1481
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1319
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1312
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
unsigned getOpcode() const
Definition VPlan.h:1410
void setName(StringRef NewName)
Set the symbolic name for the VPInstruction.
Definition VPlan.h:1509
VPValue * getMask() const
Returns the mask for the VPInstruction.
Definition VPlan.h:1475
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3049
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1639
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:401
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for a non-phi recipe R if one can be created within the given VF R...
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
bool isOrdered() const
Returns true, if the phi is part of an ordered reduction.
Definition VPlan.h:2848
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2827
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2851
VPReductionPHIRecipe * cloneWithOperands(VPValue *Start, VPValue *BackedgeValue)
Definition VPlan.h:2809
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2845
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3142
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4466
const VPBlockBase * getEntry() const
Definition VPlan.h:4510
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4594
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4578
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3296
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:610
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:681
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:455
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:428
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:423
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1511
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1517
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2240
A recipe to compute the pointers for widened memory accesses of SourceElementTy, with the Stride expr...
Definition VPlan.h:2314
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1848
A recipe for handling GEP instructions.
Definition VPlan.h:2175
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2543
A recipe for widened phis.
Definition VPlan.h:2679
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1790
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4614
bool hasVF(ElementCount VF) const
Definition VPlan.h:4837
ElementCount getSingleVF() const
Returns the single VF of the plan, asserting that the plan has exactly one VF.
Definition VPlan.h:4850
VPBasicBlock * getEntry()
Definition VPlan.h:4710
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4773
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4813
bool hasUF(unsigned UF) const
Definition VPlan.h:4862
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4763
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4887
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4913
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1098
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:5010
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:1080
LLVM_ABI_FOR_TEST bool isOuterLoop() const
Returns true if this VPlan is for an outer loop, i.e., its vector loop region contains a nested loop ...
Definition VPlan.cpp:1113
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4787
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4739
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4715
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4810
bool hasScalarVFOnly() const
Definition VPlan.h:4855
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4753
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:950
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4759
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4806
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
bool matchFindIVResult(VPInstruction *VPI, Op0_t ReducedIV, Op1_t Start)
Match FindIV result pattern: select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),...
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:116
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:137
GEPNoWrapFlags getGEPFlagsForPtr(VPValue *Ptr)
Returns the GEP nowrap flags for Ptr, looking through pointer casts mirroring Value::stripPointerCast...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI_FOR_TEST cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
LLVM_ABI bool VerifySCEV
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintAfterAll
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
constexpr auto bind_front(FnT &&Fn, BindArgsT &&...BindArgs)
C++20 bind_front.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:154
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:78
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:83
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:88
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI cl::opt< bool > EnableLoopVectorization
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI_FOR_TEST cl::list< std::string > VPlanPrintAfterPasses
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:422
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1836
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
@ CM_EpilogueNotAllowedLowTripLoop
@ CM_EpilogueNotNeededFoldTail
@ CM_EpilogueNotAllowedFoldTail
@ CM_EpilogueNotAllowedOptSize
@ CM_EpilogueAllowed
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
DWARFExpression::Operation Op
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:73
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintVectorRegionScope
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
std::function< BlockFrequencyInfo &()> GetBFI
TargetTransformInfo * TTI
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:89
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
void invalidateWideningDecision(Instruction *I, ElementCount VF)
Mark the widening decision for I at VF as invalidated since a VPlan transform replaced the original r...
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
uint64_t getPredBlockCostDivisor(BasicBlock *BB) const
TargetTransformInfo::TargetCostKind CostKind
std::optional< CallWideningKind > getLegacyCallKind(CallInst *CI, ElementCount VF) const
Returns the legacy call widening decision for CI at VF, or std::nullopt if none was recorded.
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1118
A struct that represents some properties of the register usage of a loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3664
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3762
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static bool createHeaderPhiRecipes(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop, const MapVector< PHINode *, InductionDescriptor > &Inductions, const MapVector< PHINode *, RecurrenceDescriptor > &Reductions, const SmallPtrSetImpl< const PHINode * > &FixedOrderRecurrences, const SmallPtrSetImpl< PHINode * > &InLoopReductions, bool AllowReordering)
Replace VPPhi recipes in Plan's header with corresponding VPHeaderPHIRecipe subclasses for inductions...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void introduceMasksAndLinearize(VPlan &Plan)
Predicate and linearize the control-flow in the only loop region of Plan.
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void foldTailByMasking(VPlan &Plan)
Adapts the vector loop region for tail folding by introducing a header mask and conditionally executi...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static bool handleMultiUseReductions(VPlan &Plan, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
Try to legalize reductions with multiple in-loop uses.
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static bool handleFindLastReductions(VPlan &Plan)
Check if Plan contains any FindLast reductions.
static void createInLoopReductionRecipes(VPlan &Plan, ElementCount MinVF)
Create VPReductionRecipes for in-loop reductions.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, PredicatedScalarEvolution &PSE, LoopVersioning *LVer=nullptr)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void addCanonicalIVRecipes(VPlan &Plan, DebugLoc DL)
Add a canonical IV and its increment to Plan, using InductionTy and DL.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static LLVM_ABI_FOR_TEST bool handleEarlyExits(VPlan &Plan, UncountableExitStyle Style, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to account for all early exits.
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace replicating VPReplicateRecipe, VPScalarIVStepsRecipe and VPInstruction in Plan with VF single...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static void addIterationCountCheckBlock(VPlan &Plan, ElementCount VF, unsigned UF, bool RequiresScalarEpilogue, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE)
Add a new check block before the vector preheader to Plan to check if the main vector loop should be ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE, VPBasicBlock *CheckBlock)
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replace the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of the LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g. due to runtime checks.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks