Bug Summary

File: build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8133, column 35
Potential leak of memory pointed to by 'BlockMask'
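For orientation, a "potential leak" diagnostic means the analyzer found a path on which the last pointer to a heap allocation goes out of scope before the memory is freed or ownership is handed off. The sketch below is a generic, hypothetical illustration of that pattern only; it is not the code at line 8133, and the real 'BlockMask' there need not be allocated this way.

  void example(bool Cond) {
    int *BlockMask = new int[8];   // heap allocation tracked by the analyzer
    if (Cond)
      return;                      // early return: 'BlockMask' is lost here -> potential leak
    delete[] BlockMask;            // only this path releases the memory
  }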

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Transforms/Vectorize -I /build/source/llvm/lib/Transforms/Vectorize -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1674602410 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-01-25-024556-16494-1 -x c++ /build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
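As the overview above notes, the vectorizer turns consecutive scalar iterations into one "wide" iteration whose index advances by the SIMD width. The following is a minimal editorial illustration (ordinary C++, not LLVM output) of a scalar loop, its width-4 counterpart, and the scalar remainder loop a real transformation would also emit:

  void scale(float *A, float K, int N) {
    for (int i = 0; i < N; ++i)          // scalar loop: index advances by 1
      A[i] *= K;
  }

  void scale_wide4(float *A, float K, int N) {
    int i = 0;
    for (; i + 4 <= N; i += 4) {         // "wide" loop: index advances by the SIMD width
      A[i] *= K; A[i + 1] *= K; A[i + 2] *= K; A[i + 3] *= K;
    }
    for (; i < N; ++i)                   // scalar epilogue for the remaining iterations
      A[i] *= K;
  }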
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/STLExtras.h"
69#include "llvm/ADT/SmallPtrSet.h"
70#include "llvm/ADT/SmallSet.h"
71#include "llvm/ADT/SmallVector.h"
72#include "llvm/ADT/Statistic.h"
73#include "llvm/ADT/StringRef.h"
74#include "llvm/ADT/Twine.h"
75#include "llvm/ADT/iterator_range.h"
76#include "llvm/Analysis/AssumptionCache.h"
77#include "llvm/Analysis/BasicAliasAnalysis.h"
78#include "llvm/Analysis/BlockFrequencyInfo.h"
79#include "llvm/Analysis/CFG.h"
80#include "llvm/Analysis/CodeMetrics.h"
81#include "llvm/Analysis/DemandedBits.h"
82#include "llvm/Analysis/GlobalsModRef.h"
83#include "llvm/Analysis/LoopAccessAnalysis.h"
84#include "llvm/Analysis/LoopAnalysisManager.h"
85#include "llvm/Analysis/LoopInfo.h"
86#include "llvm/Analysis/LoopIterator.h"
87#include "llvm/Analysis/OptimizationRemarkEmitter.h"
88#include "llvm/Analysis/ProfileSummaryInfo.h"
89#include "llvm/Analysis/ScalarEvolution.h"
90#include "llvm/Analysis/ScalarEvolutionExpressions.h"
91#include "llvm/Analysis/TargetLibraryInfo.h"
92#include "llvm/Analysis/TargetTransformInfo.h"
93#include "llvm/Analysis/ValueTracking.h"
94#include "llvm/Analysis/VectorUtils.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/DebugLoc.h"
103#include "llvm/IR/DerivedTypes.h"
104#include "llvm/IR/DiagnosticInfo.h"
105#include "llvm/IR/Dominators.h"
106#include "llvm/IR/Function.h"
107#include "llvm/IR/IRBuilder.h"
108#include "llvm/IR/InstrTypes.h"
109#include "llvm/IR/Instruction.h"
110#include "llvm/IR/Instructions.h"
111#include "llvm/IR/IntrinsicInst.h"
112#include "llvm/IR/Intrinsics.h"
113#include "llvm/IR/Metadata.h"
114#include "llvm/IR/Module.h"
115#include "llvm/IR/Operator.h"
116#include "llvm/IR/PatternMatch.h"
117#include "llvm/IR/Type.h"
118#include "llvm/IR/Use.h"
119#include "llvm/IR/User.h"
120#include "llvm/IR/Value.h"
121#include "llvm/IR/ValueHandle.h"
122#include "llvm/IR/Verifier.h"
123#include "llvm/InitializePasses.h"
124#include "llvm/Pass.h"
125#include "llvm/Support/Casting.h"
126#include "llvm/Support/CommandLine.h"
127#include "llvm/Support/Compiler.h"
128#include "llvm/Support/Debug.h"
129#include "llvm/Support/ErrorHandling.h"
130#include "llvm/Support/InstructionCost.h"
131#include "llvm/Support/MathExtras.h"
132#include "llvm/Support/raw_ostream.h"
133#include "llvm/Transforms/Utils/BasicBlockUtils.h"
134#include "llvm/Transforms/Utils/InjectTLIMappings.h"
135#include "llvm/Transforms/Utils/LoopSimplify.h"
136#include "llvm/Transforms/Utils/LoopUtils.h"
137#include "llvm/Transforms/Utils/LoopVersioning.h"
138#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139#include "llvm/Transforms/Utils/SizeOpts.h"
140#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141#include <algorithm>
142#include <cassert>
143#include <cmath>
144#include <cstdint>
145#include <functional>
146#include <iterator>
147#include <limits>
148#include <map>
149#include <memory>
150#include <string>
151#include <tuple>
152#include <utility>
153
154using namespace llvm;
155
156#define LV_NAME "loop-vectorize"
157#define DEBUG_TYPE LV_NAME
158
159#ifndef NDEBUG
160const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161#endif
162
163/// @{
164/// Metadata attribute names
165const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166const char LLVMLoopVectorizeFollowupVectorized[] =
167 "llvm.loop.vectorize.followup_vectorized";
168const char LLVMLoopVectorizeFollowupEpilogue[] =
169 "llvm.loop.vectorize.followup_epilogue";
170/// @}
171
172STATISTIC(LoopsVectorized, "Number of loops vectorized");
173STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175
176static cl::opt<bool> EnableEpilogueVectorization(
177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178 cl::desc("Enable vectorization of epilogue loops."));
179
180static cl::opt<unsigned> EpilogueVectorizationForceVF(
181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182 cl::desc("When epilogue vectorization is enabled, and a value greater than "
183 "1 is specified, forces the given VF for all applicable epilogue "
184 "loops."));
185
186static cl::opt<unsigned> EpilogueVectorizationMinVF(
187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188 cl::desc("Only loops with vectorization factor equal to or larger than "
189 "the specified value are considered for epilogue vectorization."));
190
191/// Loops with a known constant trip count below this number are vectorized only
192/// if no scalar iteration overheads are incurred.
193static cl::opt<unsigned> TinyTripCountVectorThreshold(
194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195 cl::desc("Loops with a constant trip count that is smaller than this "
196 "value are vectorized only if no scalar iteration overheads "
197 "are incurred."));
198
199static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201 cl::desc("The maximum allowed number of runtime memory checks"));
202
203// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204// that predication is preferred, and this lists all options. I.e., the
205// vectorizer will try to fold the tail-loop (epilogue) into the vector body
206// and predicate the instructions accordingly. If tail-folding fails, there are
207// different fallback strategies depending on these values:
208namespace PreferPredicateTy {
209 enum Option {
210 ScalarEpilogue = 0,
211 PredicateElseScalarEpilogue,
212 PredicateOrDontVectorize
213 };
214} // namespace PreferPredicateTy
215
216static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217 "prefer-predicate-over-epilogue",
218 cl::init(PreferPredicateTy::ScalarEpilogue),
219 cl::Hidden,
220 cl::desc("Tail-folding and predication preferences over creating a scalar "
221 "epilogue loop."),
222 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223 "scalar-epilogue",
224 "Don't tail-predicate loops, create scalar epilogue"),
225 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226 "predicate-else-scalar-epilogue",
227 "prefer tail-folding, create scalar epilogue if tail "
228 "folding fails."),
229 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230 "predicate-dont-vectorize",
231 "prefers tail-folding, don't attempt vectorization if "
232 "tail-folding fails.")));
233
234static cl::opt<bool> MaximizeBandwidth(
235 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236 cl::desc("Maximize bandwidth when selecting vectorization factor which "
237 "will be determined by the smallest type in loop."));
238
239static cl::opt<bool> EnableInterleavedMemAccesses(
240 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242
243/// An interleave-group may need masking if it resides in a block that needs
244/// predication, or in order to mask away gaps.
245static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248
249static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251 cl::desc("We don't interleave loops with an estimated constant trip count "
252 "below this number"));
253
254static cl::opt<unsigned> ForceTargetNumScalarRegs(
255 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256 cl::desc("A flag that overrides the target's number of scalar registers."));
257
258static cl::opt<unsigned> ForceTargetNumVectorRegs(
259 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of vector registers."));
261
262static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's max interleave factor for "
265 "scalar loops."));
266
267static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's max interleave factor for "
270 "vectorized loops."));
271
272static cl::opt<unsigned> ForceTargetInstructionCost(
273 "force-target-instruction-cost", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's expected cost for "
275 "an instruction to a single constant value. Mostly "
276 "useful for getting consistent testing."));
277
278static cl::opt<bool> ForceTargetSupportsScalableVectors(
279 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280 cl::desc(
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
283
284static cl::opt<unsigned> SmallLoopCost(
285 "small-loop-cost", cl::init(20), cl::Hidden,
286 cl::desc(
287 "The cost of a loop that is considered 'small' by the interleaver."));
288
289static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291 cl::desc("Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
294
295// Runtime interleave loops for load/store throughput.
296static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298 cl::desc(
299 "Enable runtime interleaving until load/store ports are saturated"));
300
301/// Interleave small loops with scalar reductions.
302static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304 cl::desc("Enable interleaving for loops with small iteration counts that "
305 "contain scalar reductions to expose ILP."));
306
307/// The number of stores in a loop that are allowed to need predication.
308static cl::opt<unsigned> NumberOfStoresToPredicate(
309 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310 cl::desc("Max number of stores to be predicated behind an if."));
311
312static cl::opt<bool> EnableIndVarRegisterHeur(
313 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314 cl::desc("Count the induction variable only once when interleaving"));
315
316static cl::opt<bool> EnableCondStoresVectorization(
317 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318 cl::desc("Enable if predication of stores during vectorization."));
319
320static cl::opt<unsigned> MaxNestedScalarReductionIC(
321 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322 cl::desc("The maximum interleave count to use when interleaving a scalar "
323 "reduction in a nested loop."));
324
325static cl::opt<bool>
326 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327 cl::Hidden,
328 cl::desc("Prefer in-loop vector reductions, "
329 "overriding the targets preference."));
330
331static cl::opt<bool> ForceOrderedReductions(
332 "force-ordered-reductions", cl::init(false), cl::Hidden,
333 cl::desc("Enable the vectorisation of loops with in-order (strict) "
334 "FP reductions"));
335
336static cl::opt<bool> PreferPredicatedReductionSelect(
337 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338 cl::desc(
339 "Prefer predicating a reduction operation over an after loop select."));
340
341cl::opt<bool> EnableVPlanNativePath(
342 "enable-vplan-native-path", cl::init(false), cl::Hidden,
343 cl::desc("Enable VPlan-native vectorization path with "
344 "support for outer loop vectorization."));
345
346// This flag enables the stress testing of the VPlan H-CFG construction in the
347// VPlan-native vectorization path. It must be used in conjunction with
348// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349// verification of the H-CFGs built.
350static cl::opt<bool> VPlanBuildStressTest(
351 "vplan-build-stress-test", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
356
357cl::opt<bool> llvm::EnableLoopInterleaving(
358 "interleave-loops", cl::init(true), cl::Hidden,
359 cl::desc("Enable loop interleaving in Loop vectorization passes"));
360cl::opt<bool> llvm::EnableLoopVectorization(
361 "vectorize-loops", cl::init(true), cl::Hidden,
362 cl::desc("Run the Loop vectorization passes"));
363
364static cl::opt<bool> PrintVPlansInDotFormat(
365 "vplan-print-in-dot-format", cl::Hidden,
366 cl::desc("Use dot format instead of plain text when dumping VPlans"));
367
368static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369 "force-widen-divrem-via-safe-divisor", cl::Hidden,
370 cl::desc(
371 "Override cost based safe divisor widening for div/rem instructions"));
372
373/// A helper function that returns true if the given type is irregular. The
374/// type is irregular if its allocated size doesn't equal the store size of an
375/// element of the corresponding vector type.
376static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377 // Determine if an array of N elements of type Ty is "bitcast compatible"
378 // with a <N x Ty> vector.
379 // This is only true if there is no padding between the array elements.
380 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381}
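As a concrete editorial example (a fragment assuming the LLVM headers included above and an x86-64 style data-layout string, which is an assumption): x86_fp80 occupies 80 bits but is allocated in 128-bit slots, so the check above reports it as irregular.

  LLVMContext Ctx;
  DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128"); // example layout string
  Type *FP80 = Type::getX86_FP80Ty(Ctx);
  // Alloc size (128 bits) != type size (80 bits) -> hasIrregularType(FP80, DL) is true.
  bool Irregular = DL.getTypeAllocSizeInBits(FP80) != DL.getTypeSizeInBits(FP80);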
382
383/// A helper function that returns the reciprocal of the block probability of
384/// predicated blocks. If we return X, we are assuming the predicated block
385/// will execute once for every X iterations of the loop header.
386///
387/// TODO: We should use actual block probability here, if available. Currently,
388/// we always assume predicated blocks have a 50% chance of executing.
389static unsigned getReciprocalPredBlockProb() { return 2; }
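For example, with the current return value of 2, a predicated block whose scalarized body costs 8 is charged 8 / 2 = 4 by the cost model, i.e. it is assumed to execute on roughly every other iteration of the loop header (an editorial gloss on the comment above, not taken from the code).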
390
391/// A helper function that returns an integer or floating-point constant with
392/// value C.
393static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395 : ConstantFP::get(Ty, C);
396}
397
398/// Returns "best known" trip count for the specified loop \p L as defined by
399/// the following procedure:
400/// 1) Returns exact trip count if it is known.
401/// 2) Returns expected trip count according to profile data if any.
402/// 3) Returns upper bound estimate if it is known.
403/// 4) Returns std::nullopt if all of the above failed.
404static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405 Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return *EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return std::nullopt;
420}
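A hedged usage sketch of the procedure above (editorial; the surrounding names SE and L and the debug output are assumptions, and the snippet relies on the headers included earlier in this file):

  if (std::optional<unsigned> BestTC = getSmallBestKnownTC(SE, L)) {
    // BestTC is the exact count if known, otherwise the profile-based
    // estimate, otherwise the known upper bound.
    if (*BestTC < TinyTripCountVectorThreshold)
      LLVM_DEBUG(dbgs() << "LV: trip count " << *BestTC << " looks tiny\n");
  }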
421
422namespace {
423// Forward declare GeneratedRTChecks.
424class GeneratedRTChecks;
425} // namespace
426
427namespace llvm {
428
429AnalysisKey ShouldRunExtraVectorPasses::Key;
430
431/// InnerLoopVectorizer vectorizes loops which contain only one basic
432/// block to a specified vectorization factor (VF).
433/// This class performs the widening of scalars into vectors, or multiple
434/// scalars. This class also implements the following features:
435/// * It inserts an epilogue loop for handling loops that don't have iteration
436/// counts that are known to be a multiple of the vectorization factor.
437/// * It handles the code generation for reduction variables.
438/// * Scalarization (implementation using scalars) of un-vectorizable
439/// instructions.
440/// InnerLoopVectorizer does not perform any vectorization-legality
441/// checks, and relies on the caller to check for the different legality
442/// aspects. The InnerLoopVectorizer relies on the
443/// LoopVectorizationLegality class to provide information about the induction
444/// and reduction variables that were found to a given vectorization factor.
445class InnerLoopVectorizer {
446public:
447 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448 LoopInfo *LI, DominatorTree *DT,
449 const TargetLibraryInfo *TLI,
450 const TargetTransformInfo *TTI, AssumptionCache *AC,
451 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452 ElementCount MinProfitableTripCount,
453 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459 PSI(PSI), RTChecks(RTChecks) {
460 // Query this against the original loop and save it here because the profile
461 // of the original loop header may change as the transformation happens.
462 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464
465 if (MinProfitableTripCount.isZero())
466 this->MinProfitableTripCount = VecWidth;
467 else
468 this->MinProfitableTripCount = MinProfitableTripCount;
469 }
470
471 virtual ~InnerLoopVectorizer() = default;
472
473 /// Create a new empty loop that will contain vectorized instructions later
474 /// on, while the old loop will be used as the scalar remainder. Control flow
475 /// is generated around the vectorized (and scalar epilogue) loops consisting
476 /// of various checks and bypasses. Return the pre-header block of the new
477 /// loop and the start value for the canonical induction, if it is != 0. The
478 /// latter is the case when vectorizing the epilogue loop. In the case of
479/// epilogue vectorization, this function is overridden to handle the more
480 /// complex control flow around the loops.
481 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
482
483 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
484 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
485
486 // Return true if any runtime check is added.
487 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488
489 /// A type for vectorized values in the new loop. Each value from the
490 /// original loop, when vectorized, is represented by UF vector values in the
491 /// new unrolled loop, where UF is the unroll factor.
492 using VectorParts = SmallVector<Value *, 2>;
493
494 /// A helper function to scalarize a single Instruction in the innermost loop.
495 /// Generates a sequence of scalar instances for each lane between \p MinLane
496 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
497 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
498 /// Instr's operands.
499 void scalarizeInstruction(const Instruction *Instr,
500 VPReplicateRecipe *RepRecipe,
501 const VPIteration &Instance, bool IfPredicateInstr,
502 VPTransformState &State);
503
504 /// Construct the vector value of a scalarized value \p V one lane at a time.
505 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
506 VPTransformState &State);
507
508 /// Try to vectorize interleaved access group \p Group with the base address
509 /// given in \p Addr, optionally masking the vector operations if \p
510 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
511 /// values in the vectorized loop.
512 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
513 ArrayRef<VPValue *> VPDefs,
514 VPTransformState &State, VPValue *Addr,
515 ArrayRef<VPValue *> StoredValues,
516 VPValue *BlockInMask = nullptr);
517
518 /// Fix the non-induction PHIs in \p Plan.
519 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520
521 /// Returns true if the reordering of FP operations is not allowed, but we are
522 /// able to vectorize with strict in-order reductions for the given RdxDesc.
523 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524
525 /// Create a broadcast instruction. This method generates a broadcast
526 /// instruction (shuffle) for loop invariant values and for the induction
527 /// value. If this is the induction variable then we extend it to N, N+1, ...
528 /// this is needed because each iteration in the loop corresponds to a SIMD
529 /// element.
530 virtual Value *getBroadcastInstrs(Value *V);
531
532 // Returns the resume value (bc.merge.rdx) for a reduction as
533 // generated by fixReduction.
534 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
535
536 /// Create a new phi node for the induction variable \p OrigPhi to resume
537 /// iteration count in the scalar epilogue, from where the vectorized loop
538 /// left off. In cases where the loop skeleton is more complicated (eg.
539 /// epilogue vectorization) and the resume values can come from an additional
540 /// bypass block, the \p AdditionalBypass pair provides information about the
541 /// bypass block and the end value on the edge from bypass to this loop.
542 PHINode *createInductionResumeValue(
543 PHINode *OrigPhi, const InductionDescriptor &ID,
544 ArrayRef<BasicBlock *> BypassBlocks,
545 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
546
547protected:
548 friend class LoopVectorizationPlanner;
549
550 /// A small list of PHINodes.
551 using PhiVector = SmallVector<PHINode *, 4>;
552
553 /// A type for scalarized values in the new loop. Each value from the
554 /// original loop, when scalarized, is represented by UF x VF scalar values
555 /// in the new unrolled loop, where UF is the unroll factor and VF is the
556 /// vectorization factor.
557 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558
559 /// Set up the values of the IVs correctly when exiting the vector loop.
560 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561 Value *VectorTripCount, Value *EndValue,
562 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563 VPlan &Plan);
564
565 /// Handle all cross-iteration phis in the header.
566 void fixCrossIterationPHIs(VPTransformState &State);
567
568 /// Create the exit value of first order recurrences in the middle block and
569 /// update their users.
570 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571 VPTransformState &State);
572
573 /// Create code for the loop exit value of the reduction.
574 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575
576 /// Clear NSW/NUW flags from reduction instructions if necessary.
577 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578 VPTransformState &State);
579
580 /// Iteratively sink the scalarized operands of a predicated instruction into
581 /// the block that was created for it.
582 void sinkScalarOperands(Instruction *PredInst);
583
584 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585 /// represented as.
586 void truncateToMinimalBitwidths(VPTransformState &State);
587
588 /// Returns (and creates if needed) the original loop trip count.
589 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590
591 /// Returns (and creates if needed) the trip count of the widened loop.
592 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593
594 /// Returns a bitcasted value to the requested vector type.
595 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597 const DataLayout &DL);
598
599 /// Emit a bypass check to see if the vector trip count is zero, including if
600 /// it overflows.
601 void emitIterationCountCheck(BasicBlock *Bypass);
602
603 /// Emit a bypass check to see if all of the SCEV assumptions we've
604 /// had to make are correct. Returns the block containing the checks or
605 /// nullptr if no checks have been added.
606 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607
608 /// Emit bypass checks to check any memory assumptions we may have made.
609 /// Returns the block containing the checks or nullptr if no checks have been
610 /// added.
611 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612
613 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614 /// vector loop preheader, middle block and scalar preheader.
615 void createVectorLoopSkeleton(StringRef Prefix);
616
617 /// Create new phi nodes for the induction variables to resume iteration count
618 /// in the scalar epilogue, from where the vectorized loop left off.
619 /// In cases where the loop skeleton is more complicated (eg. epilogue
620 /// vectorization) and the resume values can come from an additional bypass
621 /// block, the \p AdditionalBypass pair provides information about the bypass
622 /// block and the end value on the edge from bypass to this loop.
623 void createInductionResumeValues(
624 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625
626 /// Complete the loop skeleton by adding debug MDs, creating appropriate
627 /// conditional branches in the middle block, preparing the builder and
628 /// running the verifier. Return the preheader of the completed vector loop.
629 BasicBlock *completeLoopSkeleton();
630
631 /// Collect poison-generating recipes that may generate a poison value that is
632 /// used after vectorization, even when their operands are not poison. Those
633 /// recipes meet the following conditions:
634 /// * Contribute to the address computation of a recipe generating a widen
635 /// memory load/store (VPWidenMemoryInstructionRecipe or
636 /// VPInterleaveRecipe).
637 /// * Such a widen memory load/store has at least one underlying Instruction
638 /// that is in a basic block that needs predication and after vectorization
639 /// the generated instruction won't be predicated.
640 void collectPoisonGeneratingRecipes(VPTransformState &State);
641
642 /// Allow subclasses to override and print debug traces before/after vplan
643 /// execution, when trace information is requested.
644 virtual void printDebugTracesAtStart(){};
645 virtual void printDebugTracesAtEnd(){};
646
647 /// The original loop.
648 Loop *OrigLoop;
649
650 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
651 /// dynamic knowledge to simplify SCEV expressions and converts them to a
652 /// more usable form.
653 PredicatedScalarEvolution &PSE;
654
655 /// Loop Info.
656 LoopInfo *LI;
657
658 /// Dominator Tree.
659 DominatorTree *DT;
660
661 /// Target Library Info.
662 const TargetLibraryInfo *TLI;
663
664 /// Target Transform Info.
665 const TargetTransformInfo *TTI;
666
667 /// Assumption Cache.
668 AssumptionCache *AC;
669
670 /// Interface to emit optimization remarks.
671 OptimizationRemarkEmitter *ORE;
672
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 ElementCount VF;
676
677 ElementCount MinProfitableTripCount;
678
679 /// The vectorization unroll factor to use. Each scalar is vectorized to this
680 /// many different vector instructions.
681 unsigned UF;
682
683 /// The builder that we use
684 IRBuilder<> Builder;
685
686 // --- Vectorization state ---
687
688 /// The vector-loop preheader.
689 BasicBlock *LoopVectorPreHeader;
690
691 /// The scalar-loop preheader.
692 BasicBlock *LoopScalarPreHeader;
693
694 /// Middle Block between the vector and the scalar.
695 BasicBlock *LoopMiddleBlock;
696
697 /// The unique ExitBlock of the scalar loop if one exists. Note that
698 /// there can be multiple exiting edges reaching this block.
699 BasicBlock *LoopExitBlock;
700
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
703
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706
707 /// Store instructions that were predicated.
708 SmallVector<Instruction *, 4> PredicatedInstructions;
709
710 /// Trip count of the original loop.
711 Value *TripCount = nullptr;
712
713 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
714 Value *VectorTripCount = nullptr;
715
716 /// The legality analysis.
717 LoopVectorizationLegality *Legal;
718
719 /// The profitability analysis.
720 LoopVectorizationCostModel *Cost;
721
722 // Record whether runtime checks are added.
723 bool AddedSafetyChecks = false;
724
725 // Holds the end values for each induction variable. We save the end values
726 // so we can later fix-up the external users of the induction variables.
727 DenseMap<PHINode *, Value *> IVEndValues;
728
729 /// BFI and PSI are used to check for profile guided size optimizations.
730 BlockFrequencyInfo *BFI;
731 ProfileSummaryInfo *PSI;
732
733 // Whether this loop should be optimized for size based on profile guided size
734 // optimizations.
735 bool OptForSizeBasedOnProfile;
736
737 /// Structure to hold information about generated runtime checks, responsible
738 /// for cleaning the checks, if vectorization turns out unprofitable.
739 GeneratedRTChecks &RTChecks;
740
741 // Holds the resume values for reductions in the loops, used to set the
742 // correct start value of reduction PHIs when vectorizing the epilogue.
743 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
744 ReductionResumeValues;
745};
746
747class InnerLoopUnroller : public InnerLoopVectorizer {
748public:
749 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
750 LoopInfo *LI, DominatorTree *DT,
751 const TargetLibraryInfo *TLI,
752 const TargetTransformInfo *TTI, AssumptionCache *AC,
753 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
754 LoopVectorizationLegality *LVL,
755 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
756 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
757 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
758 ElementCount::getFixed(1),
759 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
760 BFI, PSI, Check) {}
761
762private:
763 Value *getBroadcastInstrs(Value *V) override;
764};
765
766/// Encapsulate information regarding vectorization of a loop and its epilogue.
767/// This information is meant to be updated and used across two stages of
768/// epilogue vectorization.
769struct EpilogueLoopVectorizationInfo {
770 ElementCount MainLoopVF = ElementCount::getFixed(0);
771 unsigned MainLoopUF = 0;
772 ElementCount EpilogueVF = ElementCount::getFixed(0);
773 unsigned EpilogueUF = 0;
774 BasicBlock *MainLoopIterationCountCheck = nullptr;
775 BasicBlock *EpilogueIterationCountCheck = nullptr;
776 BasicBlock *SCEVSafetyCheck = nullptr;
777 BasicBlock *MemSafetyCheck = nullptr;
778 Value *TripCount = nullptr;
779 Value *VectorTripCount = nullptr;
780
781 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
782 ElementCount EVF, unsigned EUF)
783 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
784 assert(EUF == 1 &&
785 "A high UF for the epilogue loop is likely not beneficial.");
786 }
787};
788
789/// An extension of the inner loop vectorizer that creates a skeleton for a
790/// vectorized loop that has its epilogue (residual) also vectorized.
791/// The idea is to run the vplan on a given loop twice, firstly to setup the
792/// skeleton and vectorize the main loop, and secondly to complete the skeleton
793/// from the first step and vectorize the epilogue. This is achieved by
794/// deriving two concrete strategy classes from this base class and invoking
795/// them in succession from the loop vectorizer planner.
796class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
797public:
798 InnerLoopAndEpilogueVectorizer(
799 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
800 DominatorTree *DT, const TargetLibraryInfo *TLI,
801 const TargetTransformInfo *TTI, AssumptionCache *AC,
802 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
803 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
804 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
805 GeneratedRTChecks &Checks)
806 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
807 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
808 CM, BFI, PSI, Checks),
809 EPI(EPI) {}
810
811 // Override this function to handle the more complex control flow around the
812 // three loops.
813 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
814 return createEpilogueVectorizedLoopSkeleton();
815 }
816
817 /// The interface for creating a vectorized skeleton using one of two
818 /// different strategies, each corresponding to one execution of the vplan
819 /// as described above.
820 virtual std::pair<BasicBlock *, Value *>
821 createEpilogueVectorizedLoopSkeleton() = 0;
822
823 /// Holds and updates state information required to vectorize the main loop
824 /// and its epilogue in two separate passes. This setup helps us avoid
825 /// regenerating and recomputing runtime safety checks. It also helps us to
826 /// shorten the iteration-count-check path length for the cases where the
827 /// iteration count of the loop is so small that the main vector loop is
828 /// completely skipped.
829 EpilogueLoopVectorizationInfo &EPI;
830};
831
832/// A specialized derived class of inner loop vectorizer that performs
833/// vectorization of *main* loops in the process of vectorizing loops and their
834/// epilogues.
835class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
836public:
837 EpilogueVectorizerMainLoop(
838 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839 DominatorTree *DT, const TargetLibraryInfo *TLI,
840 const TargetTransformInfo *TTI, AssumptionCache *AC,
841 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844 GeneratedRTChecks &Check)
845 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846 EPI, LVL, CM, BFI, PSI, Check) {}
847 /// Implements the interface for creating a vectorized skeleton using the
848 /// *main loop* strategy (ie the first pass of vplan execution).
849 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
850
851protected:
852 /// Emits an iteration count bypass check once for the main loop (when \p
853 /// ForEpilogue is false) and once for the epilogue loop (when \p
854 /// ForEpilogue is true).
855 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
856 void printDebugTracesAtStart() override;
857 void printDebugTracesAtEnd() override;
858};
859
860// A specialized derived class of inner loop vectorizer that performs
861// vectorization of *epilogue* loops in the process of vectorizing loops and
862// their epilogues.
863class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
864public:
865 EpilogueVectorizerEpilogueLoop(
866 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
867 DominatorTree *DT, const TargetLibraryInfo *TLI,
868 const TargetTransformInfo *TTI, AssumptionCache *AC,
869 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
870 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
871 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
872 GeneratedRTChecks &Checks)
873 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
874 EPI, LVL, CM, BFI, PSI, Checks) {
875 TripCount = EPI.TripCount;
876 }
877 /// Implements the interface for creating a vectorized skeleton using the
878 /// *epilogue loop* strategy (ie the second pass of vplan execution).
879 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
880
881protected:
882 /// Emits an iteration count bypass check after the main vector loop has
883 /// finished to see if there are any iterations left to execute by either
884 /// the vector epilogue or the scalar epilogue.
885 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
886 BasicBlock *Bypass,
887 BasicBlock *Insert);
888 void printDebugTracesAtStart() override;
889 void printDebugTracesAtEnd() override;
890};
891} // end namespace llvm
892
893/// Look for a meaningful debug location on the instruction or its
894/// operands.
895static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
896 if (!I)
897 return I;
898
899 DebugLoc Empty;
900 if (I->getDebugLoc() != Empty)
901 return I;
902
903 for (Use &Op : I->operands()) {
904 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
905 if (OpInst->getDebugLoc() != Empty)
906 return OpInst;
907 }
908
909 return I;
910}
911
912/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
913/// is passed, the message relates to that particular instruction.
914#ifndef NDEBUG
915static void debugVectorizationMessage(const StringRef Prefix,
916 const StringRef DebugMsg,
917 Instruction *I) {
918 dbgs() << "LV: " << Prefix << DebugMsg;
919 if (I != nullptr)
920 dbgs() << " " << *I;
921 else
922 dbgs() << '.';
923 dbgs() << '\n';
924}
925#endif
926
927/// Create an analysis remark that explains why vectorization failed
928///
929/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
930/// RemarkName is the identifier for the remark. If \p I is passed it is an
931/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
932/// the location of the remark. \return the remark object that can be
933/// streamed to.
934static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
935 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
936 Value *CodeRegion = TheLoop->getHeader();
937 DebugLoc DL = TheLoop->getStartLoc();
938
939 if (I) {
940 CodeRegion = I->getParent();
941 // If there is no debug location attached to the instruction, revert back to
942 // using the loop's.
943 if (I->getDebugLoc())
944 DL = I->getDebugLoc();
945 }
946
947 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
948}
949
950namespace llvm {
951
952/// Return a value for Step multiplied by VF.
953Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
954 int64_t Step) {
955 assert(Ty->isIntegerTy() && "Expected an integer step");
956 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
957 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
958}
959
960/// Return the runtime value for VF.
961Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
962 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
963 return VF.isScalable() ? B.CreateVScale(EC) : EC;
964}
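A brief usage sketch of the two helpers above (editorial; an IRBuilderBase B with a valid insertion point is assumed). For a fixed VF the result folds to a constant; for a scalable VF it is materialized as a multiple of vscale:

  // VF = 4 (fixed), Step = 2   ->  constant i64 8
  Value *FixedStep = createStepForVF(B, B.getInt64Ty(), ElementCount::getFixed(4), 2);
  // VF = vscale x 4, Step = 2  ->  "vscale * 8", built via B.CreateVScale
  Value *ScalStep = createStepForVF(B, B.getInt64Ty(), ElementCount::getScalable(4), 2);
  // getRuntimeVF is effectively the Step == 1 case: the lane count at runtime.
  Value *Lanes = getRuntimeVF(B, B.getInt64Ty(), ElementCount::getScalable(4));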
965
966const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
967 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
968 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
969
970 ScalarEvolution &SE = *PSE.getSE();
971
972 // The exit count might have the type of i64 while the phi is i32. This can
973 // happen if we have an induction variable that is sign extended before the
974 // compare. The only way that we get a backedge taken count is that the
975 // induction variable was signed and as such will not overflow. In such a case
976 // truncation is legal.
977 if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
978 IdxTy->getPrimitiveSizeInBits())
979 BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
980 BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
981
982 // Get the total trip count from the count by adding 1.
983 return SE.getAddExpr(BackedgeTakenCount,
984 SE.getOne(BackedgeTakenCount->getType()));
985}
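As a worked example of the final step: a loop whose body runs n times takes its backedge n - 1 times, so the returned trip-count SCEV is (n - 1) + 1 = n, truncated or zero-extended to the requested index type as handled above.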
986
987static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988 ElementCount VF) {
989 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
990 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992 return B.CreateUIToFP(RuntimeVF, FTy);
993}
994
995void reportVectorizationFailure(const StringRef DebugMsg,
996 const StringRef OREMsg, const StringRef ORETag,
997 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998 Instruction *I) {
999 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1000 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001 ORE->emit(
1002 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003 << "loop not vectorized: " << OREMsg);
1004}
1005
1006void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008 Instruction *I) {
1009 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1010 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011 ORE->emit(
1012 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013 << Msg);
1014}
1015
1016} // end namespace llvm
1017
1018#ifndef NDEBUG
1019/// \return string containing a file name and a line # for the given loop.
1020static std::string getDebugLocString(const Loop *L) {
1021 std::string Result;
1022 if (L) {
1023 raw_string_ostream OS(Result);
1024 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025 LoopDbgLoc.print(OS);
1026 else
1027 // Just print the module name.
1028 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029 OS.flush();
1030 }
1031 return Result;
1032}
1033#endif
1034
1035void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036 VPTransformState &State) {
1037
1038 // Collect recipes in the backward slice of `Root` that may generate a poison
1039 // value that is used after vectorization.
1040 SmallPtrSet<VPRecipeBase *, 16> Visited;
1041 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042 SmallVector<VPRecipeBase *, 16> Worklist;
1043 Worklist.push_back(Root);
1044
1045 // Traverse the backward slice of Root through its use-def chain.
1046 while (!Worklist.empty()) {
1047 VPRecipeBase *CurRec = Worklist.back();
1048 Worklist.pop_back();
1049
1050 if (!Visited.insert(CurRec).second)
1051 continue;
1052
1053 // Prune search if we find another recipe generating a widen memory
1054 // instruction. Widen memory instructions involved in address computation
1055 // will lead to gather/scatter instructions, which don't need to be
1056 // handled.
1057 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058 isa<VPInterleaveRecipe>(CurRec) ||
1059 isa<VPScalarIVStepsRecipe>(CurRec) ||
1060 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062 continue;
1063
1064 // This recipe contributes to the address computation of a widen
1065 // load/store. Collect recipe if its underlying instruction has
1066 // poison-generating flags.
1067 Instruction *Instr = CurRec->getUnderlyingInstr();
1068 if (Instr && Instr->hasPoisonGeneratingFlags())
1069 State.MayGeneratePoisonRecipes.insert(CurRec);
1070
1071 // Add new definitions to the worklist.
1072 for (VPValue *operand : CurRec->operands())
1073 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074 Worklist.push_back(OpDef);
1075 }
1076 });
1077
1078 // Traverse all the recipes in the VPlan and collect the poison-generating
1079 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1080 // VPInterleaveRecipe.
1081 auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083 for (VPRecipeBase &Recipe : *VPBB) {
1084 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087 if (AddrDef && WidenRec->isConsecutive() &&
1088 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092 if (AddrDef) {
1093 // Check if any member of the interleave group needs predication.
1094 const InterleaveGroup<Instruction> *InterGroup =
1095 InterleaveRec->getInterleaveGroup();
1096 bool NeedPredication = false;
1097 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098 I < NumMembers; ++I) {
1099 Instruction *Member = InterGroup->getMember(I);
1100 if (Member)
1101 NeedPredication |=
1102 Legal->blockNeedsPredication(Member->getParent());
1103 }
1104
1105 if (NeedPredication)
1106 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107 }
1108 }
1109 }
1110 }
1111}
1112
1113PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114 const RecurrenceDescriptor &RdxDesc) {
1115 auto It = ReductionResumeValues.find(&RdxDesc);
1116 assert(It != ReductionResumeValues.end() &&
1117 "Expected to find a resume value for the reduction.");
1118 return It->second;
1119}
1120
1121namespace llvm {
1122
1123// Loop vectorization cost-model hints how the scalar epilogue loop should be
1124// lowered.
1125enum ScalarEpilogueLowering {
1126
1127 // The default: allowing scalar epilogues.
1128 CM_ScalarEpilogueAllowed,
1129
1130 // Vectorization with OptForSize: don't allow epilogues.
1131 CM_ScalarEpilogueNotAllowedOptSize,
1132
1133 // A special case of vectorization with OptForSize: loops with a very small
1134 // trip count are considered for vectorization under OptForSize, thereby
1135 // making sure the cost of their loop body is dominant, free of runtime
1136 // guards and scalar iteration overheads.
1137 CM_ScalarEpilogueNotAllowedLowTripLoop,
1138
1139 // Loop hint predicate indicating an epilogue is undesired.
1140 CM_ScalarEpilogueNotNeededUsePredicate,
1141
1142 // Directive indicating we must either tail-fold or not vectorize.
1143 CM_ScalarEpilogueNotAllowedUsePredicate
1144};
1145
1146/// ElementCountComparator creates a total ordering for ElementCount
1147/// for the purposes of using it in a set structure.
1148struct ElementCountComparator {
1149 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152 }
1153};
1154using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
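// For illustration: under this comparator all fixed-width VFs order before all
// scalable VFs, and within each group VFs are sorted by their known minimum
// element count, e.g. getFixed(2) < getFixed(8) < getScalable(2) < getScalable(4).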
1155
1156/// LoopVectorizationCostModel - estimates the expected speedups due to
1157/// vectorization.
1158/// In many cases vectorization is not profitable. This can happen because of
1159/// a number of reasons. In this class we mainly attempt to predict the
1160/// expected speedup/slowdowns due to the supported instruction set. We use the
1161/// TargetTransformInfo to query the different backends for the cost of
1162/// different operations.
1163class LoopVectorizationCostModel {
1164public:
1165 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167 LoopVectorizationLegality *Legal,
1168 const TargetTransformInfo &TTI,
1169 const TargetLibraryInfo *TLI, DemandedBits *DB,
1170 AssumptionCache *AC,
1171 OptimizationRemarkEmitter *ORE, const Function *F,
1172 const LoopVectorizeHints *Hints,
1173 InterleavedAccessInfo &IAI)
1174 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176 Hints(Hints), InterleaveInfo(IAI) {}
1177
1178 /// \return An upper bound for the vectorization factors (both fixed and
1179 /// scalable). If the factors are 0, vectorization and interleaving should be
1180 /// avoided up front.
1181 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182
1183 /// \return True if runtime checks are required for vectorization, and false
1184 /// otherwise.
1185 bool runtimeChecksRequired();
1186
1187 /// \return The most profitable vectorization factor and the cost of that VF.
1188 /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
1189 /// then that vectorization factor will be selected if vectorization is
1190 /// possible.
1191 VectorizationFactor
1192 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193
1194 VectorizationFactor
1195 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196 const LoopVectorizationPlanner &LVP);
1197
1198 /// Setup cost-based decisions for user vectorization factor.
1199 /// \return true if the UserVF is a feasible VF to be chosen.
1200 bool selectUserVectorizationFactor(ElementCount UserVF) {
1201 collectUniformsAndScalars(UserVF);
1202 collectInstsToScalarize(UserVF);
1203 return expectedCost(UserVF).first.isValid();
1204 }
1205
1206 /// \return The size (in bits) of the smallest and widest types in the code
1207 /// that needs to be vectorized. We ignore values that remain scalar such as
1208 /// 64 bit loop indices.
1209 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210
1211 /// \return The desired interleave count.
1212 /// If interleave count has been specified by metadata it will be returned.
1213 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214 /// are the selected vectorization factor and the cost of the selected VF.
1215 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216
1217 /// A memory access instruction may be vectorized in more than one way.
1218 /// The form of the instruction after vectorization depends on its cost.
1219 /// This function takes cost-based decisions for Load/Store instructions
1220 /// and collects them in a map. This decision map is used for building
1221 /// the lists of loop-uniform and loop-scalar instructions.
1222 /// The calculated cost is saved with the widening decision in order to
1223 /// avoid redundant calculations.
1224 void setCostBasedWideningDecision(ElementCount VF);
1225
1226 /// A struct that represents some properties of the register usage
1227 /// of a loop.
1228 struct RegisterUsage {
1229 /// Holds the number of loop invariant values that are used in the loop.
1230 /// The key is ClassID of target-provided register class.
1231 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232 /// Holds the maximum number of concurrent live intervals in the loop.
1233 /// The key is ClassID of target-provided register class.
1234 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235 };
1236
1237 /// \return Information about the register usage of the loop for the
1238 /// given vectorization factors.
1239 SmallVector<RegisterUsage, 8>
1240 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241
1242 /// Collect values we want to ignore in the cost model.
1243 void collectValuesToIgnore();
1244
1245 /// Collect all element types in the loop for which widening is needed.
1246 void collectElementTypesForWidening();
1247
1248 /// Split reductions into those that happen in the loop, and those that happen
1249 /// outside. In-loop reductions are collected into InLoopReductionChains.
1250 void collectInLoopReductions();
1251
1252 /// Returns true if we should use strict in-order reductions for the given
1253 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255 /// of FP operations.
1256 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257 return !Hints->allowReordering() && RdxDesc.isOrdered();
1258 }
1259
1260 /// \returns The smallest bitwidth each instruction can be represented with.
1261 /// The vector equivalents of these instructions should be truncated to this
1262 /// type.
1263 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264 return MinBWs;
1265 }
1266
1267 /// \returns True if it is more profitable to scalarize instruction \p I for
1268 /// vectorization factor \p VF.
1269 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270 assert(VF.isVector() &&
1271        "Profitable to scalarize relevant only for VF > 1.");
1272
1273 // Cost model is not run in the VPlan-native path - return conservative
1274 // result until this changes.
1275 if (EnableVPlanNativePath)
1276 return false;
1277
1278 auto Scalars = InstsToScalarize.find(VF);
1279 assert(Scalars != InstsToScalarize.end() &&
1280        "VF not yet analyzed for scalarization profitability");
1281 return Scalars->second.find(I) != Scalars->second.end();
1282 }
1283
1284 /// Returns true if \p I is known to be uniform after vectorization.
1285 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286 if (VF.isScalar())
1287 return true;
1288
1289 // Cost model is not run in the VPlan-native path - return conservative
1290 // result until this changes.
1291 if (EnableVPlanNativePath)
1292 return false;
1293
1294 auto UniformsPerVF = Uniforms.find(VF);
1295 assert(UniformsPerVF != Uniforms.end() &&
1296        "VF not yet analyzed for uniformity");
1297 return UniformsPerVF->second.count(I);
1298 }
1299
1300 /// Returns true if \p I is known to be scalar after vectorization.
1301 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302 if (VF.isScalar())
1303 return true;
1304
1305 // Cost model is not run in the VPlan-native path - return conservative
1306 // result until this changes.
1307 if (EnableVPlanNativePath)
1308 return false;
1309
1310 auto ScalarsPerVF = Scalars.find(VF);
1311 assert(ScalarsPerVF != Scalars.end() &&
1312        "Scalar values are not calculated for VF");
1313 return ScalarsPerVF->second.count(I);
1314 }
1315
1316 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317 /// for vectorization factor \p VF.
1318 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320 !isProfitableToScalarize(I, VF) &&
1321 !isScalarAfterVectorization(I, VF);
1322 }
1323
1324 /// Decision that was taken during cost calculation for memory instruction.
1325 enum InstWidening {
1326 CM_Unknown,
1327 CM_Widen, // For consecutive accesses with stride +1.
1328 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329 CM_Interleave,
1330 CM_GatherScatter,
1331 CM_Scalarize
1332 };
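// For example, a consecutive stride +1 access is typically assigned CM_Widen,
// a stride -1 access CM_Widen_Reverse, a member of an interleave group
// CM_Interleave, and an access that must be executed lane-by-lane
// CM_Scalarize; CM_GatherScatter covers masked gather/scatter lowering.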
1333
1334 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335 /// instruction \p I and vector width \p VF.
1336 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337 InstructionCost Cost) {
1338 assert(VF.isVector() && "Expected VF >=2");
1339 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340 }
1341
1342 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343 /// interleaving group \p Grp and vector width \p VF.
1344 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345 ElementCount VF, InstWidening W,
1346 InstructionCost Cost) {
1347 assert(VF.isVector() && "Expected VF >=2");
1348 /// Broadcast this decision to all instructions inside the group,
1349 /// but assign the cost to one instruction only.
1350 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351 if (auto *I = Grp->getMember(i)) {
1352 if (Grp->getInsertPos() == I)
1353 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354 else
1355 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356 }
1357 }
1358 }
1359
1360 /// Return the cost model decision for the given instruction \p I and vector
1361 /// width \p VF. Return CM_Unknown if this instruction did not pass
1362 /// through the cost modeling.
1363 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364 assert(VF.isVector() && "Expected VF to be a vector VF");
1365 // Cost model is not run in the VPlan-native path - return conservative
1366 // result until this changes.
1367 if (EnableVPlanNativePath)
1368 return CM_GatherScatter;
1369
1370 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371 auto Itr = WideningDecisions.find(InstOnVF);
1372 if (Itr == WideningDecisions.end())
1373 return CM_Unknown;
1374 return Itr->second.first;
1375 }
1376
1377 /// Return the vectorization cost for the given instruction \p I and vector
1378 /// width \p VF.
1379 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380 assert(VF.isVector() && "Expected VF >=2");
1381 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383        "The cost is not calculated");
1384 return WideningDecisions[InstOnVF].second;
1385 }
1386
1387 /// Return True if instruction \p I is an optimizable truncate whose operand
1388 /// is an induction variable. Such a truncate will be removed by adding a new
1389 /// induction variable with the destination type.
1390 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391 // If the instruction is not a truncate, return false.
1392 auto *Trunc = dyn_cast<TruncInst>(I);
1393 if (!Trunc)
1394 return false;
1395
1396 // Get the source and destination types of the truncate.
1397 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399
1400 // If the truncate is free for the given types, return false. Replacing a
1401 // free truncate with an induction variable would add an induction variable
1402 // update instruction to each iteration of the loop. We exclude from this
1403 // check the primary induction variable since it will need an update
1404 // instruction regardless.
1405 Value *Op = Trunc->getOperand(0);
1406 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407 return false;
1408
1409 // If the truncated value is not an induction variable, return false.
1410 return Legal->isInductionPhi(Op);
1411 }
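// Illustrative case: an i64 induction variable whose value feeds a
// 'trunc i64 %iv to i32' can instead be modelled by introducing a new i32
// induction with the destination type, making the truncate redundant.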
1412
1413 /// Collects the instructions to scalarize for each predicated instruction in
1414 /// the loop.
1415 void collectInstsToScalarize(ElementCount VF);
1416
1417 /// Collect Uniform and Scalar values for the given \p VF.
1418 /// The sets depend on CM decision for Load/Store instructions
1419 /// that may be vectorized as interleave, gather-scatter or scalarized.
1420 void collectUniformsAndScalars(ElementCount VF) {
1421 // Do the analysis once.
1422 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423 return;
1424 setCostBasedWideningDecision(VF);
1425 collectLoopUniforms(VF);
1426 collectLoopScalars(VF);
1427 }
1428
1429 /// Returns true if the target machine supports masked store operation
1430 /// for the given \p DataType and kind of access to \p Ptr.
1431 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432 return Legal->isConsecutivePtr(DataType, Ptr) &&
1433 TTI.isLegalMaskedStore(DataType, Alignment);
1434 }
1435
1436 /// Returns true if the target machine supports masked load operation
1437 /// for the given \p DataType and kind of access to \p Ptr.
1438 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439 return Legal->isConsecutivePtr(DataType, Ptr) &&
1440 TTI.isLegalMaskedLoad(DataType, Alignment);
1441 }
1442
1443 /// Returns true if the target machine can represent \p V as a masked gather
1444 /// or scatter operation.
1445 bool isLegalGatherOrScatter(Value *V,
1446 ElementCount VF = ElementCount::getFixed(1)) {
1447 bool LI = isa<LoadInst>(V);
1448 bool SI = isa<StoreInst>(V);
1449 if (!LI && !SI)
1450 return false;
1451 auto *Ty = getLoadStoreType(V);
1452 Align Align = getLoadStoreAlignment(V);
1453 if (VF.isVector())
1454 Ty = VectorType::get(Ty, VF);
1455 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457 }
1458
1459 /// Returns true if the target machine supports all of the reduction
1460 /// variables found for the given VF.
1461 bool canVectorizeReductions(ElementCount VF) const {
1462 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465 }));
1466 }
1467
1468 /// Given costs for both strategies, return true if the scalar predication
1469 /// lowering should be used for div/rem. This incorporates an override
1470 /// option so it is not simply a cost comparison.
1471 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472 InstructionCost SafeDivisorCost) const {
1473 switch (ForceSafeDivisor) {
1474 case cl::BOU_UNSET:
1475 return ScalarCost < SafeDivisorCost;
1476 case cl::BOU_TRUE:
1477 return false;
1478 case cl::BOU_FALSE:
1479 return true;
1480 };
1481 llvm_unreachable("impossible case value");
1482 }
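// In other words: with ForceSafeDivisor unset this is a plain cost comparison;
// forcing it on always selects the safe-divisor lowering (returns false), and
// forcing it off always selects scalar predication (returns true).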
1483
1484 /// Returns true if \p I is an instruction which requires predication and
1485 /// for which our chosen predication strategy is scalarization (i.e. we
1486 /// don't have an alternate strategy such as masking available).
1487 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489
1490 /// Returns true if \p I is an instruction that needs to be predicated
1491 /// at runtime. The result is independent of the predication mechanism.
1492 /// Superset of instructions that return true for isScalarWithPredication.
1493 bool isPredicatedInst(Instruction *I) const;
1494
1495 /// Return the costs for our two available strategies for lowering a
1496 /// div/rem operation which requires speculating at least one lane.
1497 /// First result is for scalarization (will be invalid for scalable
1498 /// vectors); second is for the safe-divisor strategy.
1499 std::pair<InstructionCost, InstructionCost>
1500 getDivRemSpeculationCost(Instruction *I,
1501 ElementCount VF) const;
1502
1503 /// Returns true if \p I is a memory instruction with consecutive memory
1504 /// access that can be widened.
1505 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506
1507 /// Returns true if \p I is a memory instruction in an interleaved-group
1508 /// of memory accesses that can be vectorized with wide vector loads/stores
1509 /// and shuffles.
1510 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511
1512 /// Check if \p Instr belongs to any interleaved access group.
1513 bool isAccessInterleaved(Instruction *Instr) {
1514 return InterleaveInfo.isInterleaved(Instr);
1515 }
1516
1517 /// Get the interleaved access group that \p Instr belongs to.
1518 const InterleaveGroup<Instruction> *
1519 getInterleavedAccessGroup(Instruction *Instr) {
1520 return InterleaveInfo.getInterleaveGroup(Instr);
1521 }
1522
1523 /// Returns true if we're required to use a scalar epilogue for at least
1524 /// the final iteration of the original loop.
1525 bool requiresScalarEpilogue(ElementCount VF) const {
1526 if (!isScalarEpilogueAllowed())
1527 return false;
1528 // If we might exit from anywhere but the latch, we must run the exiting
1529 // iteration in scalar form.
1530 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531 return true;
1532 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533 }
1534
1535 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536 /// loop hint annotation.
1537 bool isScalarEpilogueAllowed() const {
1538 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539 }
1540
1541 /// Returns true if all loop blocks should be masked to fold tail loop.
1542 bool foldTailByMasking() const { return FoldTailByMasking; }
1543
1544 /// Returns true if we're tail-folding and want to use the active lane mask
1545 /// for vector loop control flow.
1546 bool useActiveLaneMaskForControlFlow() const {
1547 return FoldTailByMasking &&
1548 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549 }
1550
1551 /// Returns true if the instructions in this block require predication
1552 /// for any reason, e.g. because tail folding now requires a predicate
1553 /// or because the block in the original loop was predicated.
1554 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556 }
1557
1558 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559 /// nodes to the chain of instructions representing the reductions. Uses a
1560 /// MapVector to ensure deterministic iteration order.
1561 using ReductionChainMap =
1562 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563
1564 /// Return the chain of instructions representing an inloop reduction.
1565 const ReductionChainMap &getInLoopReductionChains() const {
1566 return InLoopReductionChains;
1567 }
1568
1569 /// Returns true if the Phi is part of an inloop reduction.
1570 bool isInLoopReduction(PHINode *Phi) const {
1571 return InLoopReductionChains.count(Phi);
1572 }
1573
1574 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575 /// with factor VF. Return the cost of the instruction, including
1576 /// scalarization overhead if it's needed.
1577 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578
1579 /// Estimate cost of a call instruction CI if it were vectorized with factor
1580 /// VF. Return the cost of the instruction, including scalarization overhead
1581 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1582 /// scalarized -
1583 /// i.e. either a vector version isn't available, or it is too expensive.
1584 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585 bool &NeedToScalarize) const;
1586
1587 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588 /// that of B.
1589 bool isMoreProfitable(const VectorizationFactor &A,
1590 const VectorizationFactor &B) const;
1591
1592 /// Invalidates decisions already taken by the cost model.
1593 void invalidateCostModelingDecisions() {
1594 WideningDecisions.clear();
1595 Uniforms.clear();
1596 Scalars.clear();
1597 }
1598
1599 /// Convenience function that returns the value of vscale_range iff
1600 /// vscale_range.min == vscale_range.max or otherwise returns the value
1601 /// returned by the corresponding TLI method.
1602 std::optional<unsigned> getVScaleForTuning() const;
1603
1604private:
1605 unsigned NumPredStores = 0;
1606
1607 /// \return An upper bound for the vectorization factors for both
1608 /// fixed and scalable vectorization, where the minimum-known number of
1609 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610 /// disabled or unsupported, then the scalable part will be equal to
1611 /// ElementCount::getScalable(0).
1612 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613 ElementCount UserVF,
1614 bool FoldTailByMasking);
1615
1616 /// \return the maximized element count based on the target's vector
1617 /// registers and the loop trip-count, but limited to a maximum safe VF.
1618 /// This is a helper function of computeFeasibleMaxVF.
1619 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620 unsigned SmallestType,
1621 unsigned WidestType,
1622 ElementCount MaxSafeVF,
1623 bool FoldTailByMasking);
1624
1625 /// \return the maximum legal scalable VF, based on the safe max number
1626 /// of elements.
1627 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628
1629 /// The vectorization cost is a combination of the cost itself and a boolean
1630 /// indicating whether any of the contributing operations will actually
1631 /// operate on vector values after type legalization in the backend. If this
1632 /// latter value is false, then all operations will be scalarized (i.e. no
1633 /// vectorization has actually taken place).
1634 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635
1636 /// Returns the expected execution cost. The unit of the cost does
1637 /// not matter because we use the 'cost' units to compare different
1638 /// vector widths. The cost that is returned is *not* normalized by
1639 /// the factor width. If \p Invalid is not nullptr, this function
1640 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641 /// each instruction that has an Invalid cost for the given VF.
1642 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643 VectorizationCostTy
1644 expectedCost(ElementCount VF,
1645 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646
1647 /// Returns the execution time cost of an instruction for a given vector
1648 /// width. Vector width of one means scalar.
1649 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost-computation logic from getInstructionCost which provides
1652 /// the vector type as an output parameter.
1653 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654 Type *&VectorTy);
1655
1656 /// Return the cost of instructions in an inloop reduction pattern, if I is
1657 /// part of that pattern.
1658 std::optional<InstructionCost>
1659 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660 TTI::TargetCostKind CostKind);
1661
1662 /// Calculate vectorization cost of memory instruction \p I.
1663 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664
1665 /// The cost computation for scalarized memory instruction.
1666 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667
1668 /// The cost computation for interleaving group of memory instructions.
1669 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670
1671 /// The cost computation for Gather/Scatter instruction.
1672 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673
1674 /// The cost computation for widening instruction \p I with consecutive
1675 /// memory access.
1676 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677
1678 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1679 /// Load: scalar load + broadcast.
1680 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681 /// element)
1682 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683
1684 /// Estimate the overhead of scalarizing an instruction. This is a
1685 /// convenience wrapper for the type-based getScalarizationOverhead API.
1686 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687 TTI::TargetCostKind CostKind) const;
1688
1689 /// Returns true if an artificially high cost for emulated masked memrefs
1690 /// should be used.
1691 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692
1693 /// Map of scalar integer values to the smallest bitwidth they can be legally
1694 /// represented as. The vector equivalents of these values should be truncated
1695 /// to this type.
1696 MapVector<Instruction *, uint64_t> MinBWs;
1697
1698 /// A type representing the costs for instructions if they were to be
1699 /// scalarized rather than vectorized. The entries are Instruction-Cost
1700 /// pairs.
1701 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702
1703 /// Holds, for each vectorization factor, the set of BasicBlocks that are
1704 /// known to be present after vectorization as predicated blocks.
1705 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706 PredicatedBBsAfterVectorization;
1707
1708 /// Records whether it is allowed to have the original scalar loop execute at
1709 /// least once. This may be needed as a fallback loop in case runtime
1710 /// aliasing/dependence checks fail, or to handle the tail/remainder
1711 /// iterations when the trip count is unknown or doesn't divide by the VF,
1712 /// or as a peel-loop to handle gaps in interleave-groups.
1713 /// Under optsize and when the trip count is very small we don't allow any
1714 /// iterations to execute in the scalar loop.
1715 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716
1717 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1718 bool FoldTailByMasking = false;
1719
1720 /// A map holding scalar costs for different vectorization factors. The
1721 /// presence of a cost for an instruction in the mapping indicates that the
1722 /// instruction will be scalarized when vectorizing with the associated
1723 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725
1726 /// Holds the instructions known to be uniform after vectorization.
1727 /// The data is collected per VF.
1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729
1730 /// Holds the instructions known to be scalar after vectorization.
1731 /// The data is collected per VF.
1732 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733
1734 /// Holds the instructions (address computations) that are forced to be
1735 /// scalarized.
1736 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737
1738 /// PHINodes of the reductions that should be expanded in-loop along with
1739 /// their associated chains of reduction operations, in program order from top
1740 /// (PHI) to bottom.
1741 ReductionChainMap InLoopReductionChains;
1742
1743 /// A Map of inloop reduction operations and their immediate chain operand.
1744 /// FIXME: This can be removed once reductions can be costed correctly in
1745 /// vplan. This was added to allow quick lookup to the inloop operations,
1746 /// without having to loop through InLoopReductionChains.
1747 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748
1749 /// Returns the expected difference in cost from scalarizing the expression
1750 /// feeding a predicated instruction \p PredInst. The instructions to
1751 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752 /// non-negative return value implies the expression will be scalarized.
1753 /// Currently, only single-use chains are considered for scalarization.
1754 InstructionCost computePredInstDiscount(Instruction *PredInst,
1755 ScalarCostsTy &ScalarCosts,
1756 ElementCount VF);
1757
1758 /// Collect the instructions that are uniform after vectorization. An
1759 /// instruction is uniform if we represent it with a single scalar value in
1760 /// the vectorized loop corresponding to each vector iteration. Examples of
1761 /// uniform instructions include pointer operands of consecutive or
1762 /// interleaved memory accesses. Note that although uniformity implies an
1763 /// instruction will be scalar, the reverse is not true. In general, a
1764 /// scalarized instruction will be represented by VF scalar values in the
1765 /// vectorized loop, each corresponding to an iteration of the original
1766 /// scalar loop.
1767 void collectLoopUniforms(ElementCount VF);
1768
1769 /// Collect the instructions that are scalar after vectorization. An
1770 /// instruction is scalar if it is known to be uniform or will be scalarized
1771 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772 /// to the list if they are used by a load/store instruction that is marked as
1773 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774 /// VF values in the vectorized loop, each corresponding to an iteration of
1775 /// the original scalar loop.
1776 void collectLoopScalars(ElementCount VF);
1777
1778 /// Keeps cost model vectorization decision and cost for instructions.
1779 /// Right now it is used for memory instructions only.
1780 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781 std::pair<InstWidening, InstructionCost>>;
1782
1783 DecisionList WideningDecisions;
1784
1785 /// Returns true if \p V is expected to be vectorized and it needs to be
1786 /// extracted.
1787 bool needsExtract(Value *V, ElementCount VF) const {
1788 Instruction *I = dyn_cast<Instruction>(V);
1789 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790 TheLoop->isLoopInvariant(I))
1791 return false;
1792
1793 // Assume we can vectorize V (and hence we need extraction) if the
1794 // scalars are not computed yet. This can happen, because it is called
1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796 // the scalars are collected. That should be a safe assumption in most
1797 // cases, because we check if the operands have vectorizable types
1798 // beforehand in LoopVectorizationLegality.
1799 return Scalars.find(VF) == Scalars.end() ||
1800 !isScalarAfterVectorization(I, VF);
1801 };
1802
1803 /// Returns a range containing only operands needing to be extracted.
1804 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805 ElementCount VF) const {
1806 return SmallVector<Value *, 4>(make_filter_range(
1807 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808 }
1809
1810 /// Determines if we have the infrastructure to vectorize loop \p L and its
1811 /// epilogue, assuming the main loop is vectorized by \p VF.
1812 bool isCandidateForEpilogueVectorization(const Loop &L,
1813 const ElementCount VF) const;
1814
1815 /// Returns true if epilogue vectorization is considered profitable, and
1816 /// false otherwise.
1817 /// \p VF is the vectorization factor chosen for the original loop.
1818 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819
1820public:
1821 /// The loop that we evaluate.
1822 Loop *TheLoop;
1823
1824 /// Predicated scalar evolution analysis.
1825 PredicatedScalarEvolution &PSE;
1826
1827 /// Loop Info analysis.
1828 LoopInfo *LI;
1829
1830 /// Vectorization legality.
1831 LoopVectorizationLegality *Legal;
1832
1833 /// Vector target information.
1834 const TargetTransformInfo &TTI;
1835
1836 /// Target Library Info.
1837 const TargetLibraryInfo *TLI;
1838
1839 /// Demanded bits analysis.
1840 DemandedBits *DB;
1841
1842 /// Assumption cache.
1843 AssumptionCache *AC;
1844
1845 /// Interface to emit optimization remarks.
1846 OptimizationRemarkEmitter *ORE;
1847
1848 const Function *TheFunction;
1849
1850 /// Loop Vectorize Hint.
1851 const LoopVectorizeHints *Hints;
1852
1853 /// The interleave access information contains groups of interleaved accesses
1854 /// with the same stride and close to each other.
1855 InterleavedAccessInfo &InterleaveInfo;
1856
1857 /// Values to ignore in the cost model.
1858 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859
1860 /// Values to ignore in the cost model when VF > 1.
1861 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862
1863 /// All element types found in the loop.
1864 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865
1866 /// Profitable vector factors.
1867 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868};
1869} // end namespace llvm
1870
1871namespace {
1872/// Helper struct to manage generating runtime checks for vectorization.
1873///
1874 /// The runtime checks are created up-front in temporary blocks, un-linked from
1875 /// the existing IR, to allow better cost estimation. After deciding to
1876 /// vectorize, the checks are moved back; if deciding not to vectorize, the
1877 /// temporary blocks are completely removed.
1878class GeneratedRTChecks {
1879 /// Basic block which contains the generated SCEV checks, if any.
1880 BasicBlock *SCEVCheckBlock = nullptr;
1881
1882 /// The value representing the result of the generated SCEV checks. If it is
1883 /// nullptr, either no SCEV checks have been generated or they have been used.
1884 Value *SCEVCheckCond = nullptr;
1885
1886 /// Basic block which contains the generated memory runtime checks, if any.
1887 BasicBlock *MemCheckBlock = nullptr;
1888
1889 /// The value representing the result of the generated memory runtime checks.
1890 /// If it is nullptr, either no memory runtime checks have been generated or
1891 /// they have been used.
1892 Value *MemRuntimeCheckCond = nullptr;
1893
1894 DominatorTree *DT;
1895 LoopInfo *LI;
1896 TargetTransformInfo *TTI;
1897
1898 SCEVExpander SCEVExp;
1899 SCEVExpander MemCheckExp;
1900
1901 bool CostTooHigh = false;
1902
1903public:
1904 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905 TargetTransformInfo *TTI, const DataLayout &DL)
1906 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907 MemCheckExp(SE, DL, "scev.check") {}
1908
1909 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910 /// accurately estimate the cost of the runtime checks. The blocks are
1911 /// un-linked from the IR and added back during vector code generation. If
1912 /// there is no vector code generation, the check blocks are removed
1913 /// completely.
1914 void Create(Loop *L, const LoopAccessInfo &LAI,
1915 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916
1917 // Hard cutoff to limit compile-time increase in case a very large number of
1918 // runtime checks needs to be generated.
1919 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920 // profile info.
1921 CostTooHigh =
1922 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923 if (CostTooHigh)
1924 return;
1925
1926 BasicBlock *LoopHeader = L->getHeader();
1927 BasicBlock *Preheader = L->getLoopPreheader();
1928
1929 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931 // may be used by SCEVExpander. The blocks will be un-linked from their
1932 // predecessors and removed from LI & DT at the end of the function.
1933 if (!UnionPred.isAlwaysTrue()) {
1934 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935 nullptr, "vector.scevcheck");
1936
1937 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938 &UnionPred, SCEVCheckBlock->getTerminator());
1939 }
1940
1941 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942 if (RtPtrChecking.Need) {
1943 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945 "vector.memcheck");
1946
1947 auto DiffChecks = RtPtrChecking.getDiffChecks();
1948 if (DiffChecks) {
1949 Value *RuntimeVF = nullptr;
1950 MemRuntimeCheckCond = addDiffRuntimeChecks(
1951 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953 if (!RuntimeVF)
1954 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955 return RuntimeVF;
1956 },
1957 IC);
1958 } else {
1959 MemRuntimeCheckCond =
1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961 RtPtrChecking.getChecks(), MemCheckExp);
1962 }
1963 assert(MemRuntimeCheckCond &&
1964        "no RT checks generated although RtPtrChecking "
1965        "claimed checks are required");
1966 }
1967
1968 if (!MemCheckBlock && !SCEVCheckBlock)
1969 return;
1970
1971 // Unhook the temporary block with the checks, update various places
1972 // accordingly.
1973 if (SCEVCheckBlock)
1974 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975 if (MemCheckBlock)
1976 MemCheckBlock->replaceAllUsesWith(Preheader);
1977
1978 if (SCEVCheckBlock) {
1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981 Preheader->getTerminator()->eraseFromParent();
1982 }
1983 if (MemCheckBlock) {
1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986 Preheader->getTerminator()->eraseFromParent();
1987 }
1988
1989 DT->changeImmediateDominator(LoopHeader, Preheader);
1990 if (MemCheckBlock) {
1991 DT->eraseNode(MemCheckBlock);
1992 LI->removeBlock(MemCheckBlock);
1993 }
1994 if (SCEVCheckBlock) {
1995 DT->eraseNode(SCEVCheckBlock);
1996 LI->removeBlock(SCEVCheckBlock);
1997 }
1998 }
1999
2000 InstructionCost getCost() {
2001 if (SCEVCheckBlock || MemCheckBlock)
2002 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2003
2004 if (CostTooHigh) {
2005 InstructionCost Cost;
2006 Cost.setInvalid();
2007 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2008 return Cost;
2009 }
2010
2011 InstructionCost RTCheckCost = 0;
2012 if (SCEVCheckBlock)
2013 for (Instruction &I : *SCEVCheckBlock) {
2014 if (SCEVCheckBlock->getTerminator() == &I)
2015 continue;
2016 InstructionCost C =
2017 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2019 RTCheckCost += C;
2020 }
2021 if (MemCheckBlock)
2022 for (Instruction &I : *MemCheckBlock) {
2023 if (MemCheckBlock->getTerminator() == &I)
2024 continue;
2025 InstructionCost C =
2026 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2028 RTCheckCost += C;
2029 }
2030
2031 if (SCEVCheckBlock || MemCheckBlock)
2032 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2033                   << "\n");
2035 return RTCheckCost;
2036 }
2037
2038 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039 /// unused.
2040 ~GeneratedRTChecks() {
2041 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043 if (!SCEVCheckCond)
2044 SCEVCleaner.markResultUsed();
2045
2046 if (!MemRuntimeCheckCond)
2047 MemCheckCleaner.markResultUsed();
2048
2049 if (MemRuntimeCheckCond) {
2050 auto &SE = *MemCheckExp.getSE();
2051 // Memory runtime check generation creates compares that use expanded
2052 // values. Remove them before running the SCEVExpanderCleaners.
2053 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054 if (MemCheckExp.isInsertedInstruction(&I))
2055 continue;
2056 SE.forgetValue(&I);
2057 I.eraseFromParent();
2058 }
2059 }
2060 MemCheckCleaner.cleanup();
2061 SCEVCleaner.cleanup();
2062
2063 if (SCEVCheckCond)
2064 SCEVCheckBlock->eraseFromParent();
2065 if (MemRuntimeCheckCond)
2066 MemCheckBlock->eraseFromParent();
2067 }
2068
2069 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071 /// depending on the generated condition.
2072 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073 BasicBlock *LoopVectorPreHeader,
2074 BasicBlock *LoopExitBlock) {
2075 if (!SCEVCheckCond)
2076 return nullptr;
2077
2078 Value *Cond = SCEVCheckCond;
2079 // Mark the check as used, to prevent it from being removed during cleanup.
2080 SCEVCheckCond = nullptr;
2081 if (auto *C = dyn_cast<ConstantInt>(Cond))
2082 if (C->isZero())
2083 return nullptr;
2084
2085 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086
2087 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088 // Create new preheader for vector loop.
2089 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091
2092 SCEVCheckBlock->getTerminator()->eraseFromParent();
2093 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095 SCEVCheckBlock);
2096
2097 DT->addNewBlock(SCEVCheckBlock, Pred);
2098 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099
2100 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102 return SCEVCheckBlock;
2103 }
2104
2105 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106 /// the branches to branch to the vector preheader or \p Bypass, depending on
2107 /// the generated condition.
2108 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109 BasicBlock *LoopVectorPreHeader) {
2110 // Check if we generated code that checks in runtime if arrays overlap.
2111 if (!MemRuntimeCheckCond)
2112 return nullptr;
2113
2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116 MemCheckBlock);
2117
2118 DT->addNewBlock(MemCheckBlock, Pred);
2119 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121
2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124
2125 ReplaceInstWithInst(
2126 MemCheckBlock->getTerminator(),
2127 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128 MemCheckBlock->getTerminator()->setDebugLoc(
2129 Pred->getTerminator()->getDebugLoc());
2130
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 MemRuntimeCheckCond = nullptr;
2133 return MemCheckBlock;
2134 }
2135};
2136} // namespace
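// A rough sketch of the intended lifecycle of GeneratedRTChecks (variable
// names below are illustrative, not taken from the callers):
//   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
//   Checks.Create(L, LAI, UnionPred, VF, IC);   // build checks up-front
//   InstructionCost C = Checks.getCost();       // feed into profitability
//   // ... only if vectorization goes ahead:
//   Checks.emitSCEVChecks(Bypass, VecPreheader, ExitBlock);
//   Checks.emitMemRuntimeChecks(Bypass, VecPreheader);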
2137
2138// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139// vectorization. The loop needs to be annotated with #pragma omp simd
2140 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2141// vector length information is not provided, vectorization is not considered
2142// explicit. Interleave hints are not allowed either. These limitations will be
2143// relaxed in the future.
2144 // Please note that we are currently forced to abuse the pragma 'clang
2145// vectorize' semantics. This pragma provides *auto-vectorization hints*
2146// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147// provides *explicit vectorization hints* (LV can bypass legal checks and
2148// assume that vectorization is legal). However, both hints are implemented
2149// using the same metadata (llvm.loop.vectorize, processed by
2150// LoopVectorizeHints). This will be fixed in the future when the native IR
2151// representation for pragma 'omp simd' is introduced.
2152static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153 OptimizationRemarkEmitter *ORE) {
2154 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156
2157 // Only outer loops with an explicit vectorization hint are supported.
2158 // Unannotated outer loops are ignored.
2159 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160 return false;
2161
2162 Function *Fn = OuterLp->getHeader()->getParent();
2163 if (!Hints.allowVectorization(Fn, OuterLp,
2164 true /*VectorizeOnlyWhenForced*/)) {
2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2167 }
2168
2169 if (Hints.getInterleave() > 1) {
2170 // TODO: Interleave support is future work.
2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172                      "outer loops.\n");
2173 Hints.emitRemarkWithHints();
2174 return false;
2175 }
2176
2177 return true;
2178}
2179
2180static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181 OptimizationRemarkEmitter *ORE,
2182 SmallVectorImpl<Loop *> &V) {
2183 // Collect inner loops and outer loops without irreducible control flow. For
2184 // now, only collect outer loops that have explicit vectorization hints. If we
2185 // are stress testing the VPlan H-CFG construction, we collect the outermost
2186 // loop of every loop nest.
2187 if (L.isInnermost() || VPlanBuildStressTest ||
2188 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189 LoopBlocksRPO RPOT(&L);
2190 RPOT.perform(LI);
2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192 V.push_back(&L);
2193 // TODO: Collect inner loops inside marked outer loops in case
2194 // vectorization fails for the outer loop. Do not invoke
2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196 // already known to be reducible. We can use an inherited attribute for
2197 // that.
2198 return;
2199 }
2200 }
2201 for (Loop *InnerL : L)
2202 collectSupportedLoops(*InnerL, LI, ORE, V);
2203}
2204
2205namespace {
2206
2207/// The LoopVectorize Pass.
2208struct LoopVectorize : public FunctionPass {
2209 /// Pass identification, replacement for typeid
2210 static char ID;
2211
2212 LoopVectorizePass Impl;
2213
2214 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215 bool VectorizeOnlyWhenForced = false)
2216 : FunctionPass(ID),
2217 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219 }
2220
2221 bool runOnFunction(Function &F) override {
2222 if (skipFunction(F))
2223 return false;
2224
2225 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237
2238 return Impl
2239 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240 .MadeAnyChange;
2241 }
2242
2243 void getAnalysisUsage(AnalysisUsage &AU) const override {
2244 AU.addRequired<AssumptionCacheTracker>();
2245 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246 AU.addRequired<DominatorTreeWrapperPass>();
2247 AU.addRequired<LoopInfoWrapperPass>();
2248 AU.addRequired<ScalarEvolutionWrapperPass>();
2249 AU.addRequired<TargetTransformInfoWrapperPass>();
2250 AU.addRequired<LoopAccessLegacyAnalysis>();
2251 AU.addRequired<DemandedBitsWrapperPass>();
2252 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253 AU.addRequired<InjectTLIMappingsLegacy>();
2254
2255 // We currently do not preserve loopinfo/dominator analyses with outer loop
2256 // vectorization. Until this is addressed, mark these analyses as preserved
2257 // only for non-VPlan-native path.
2258 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259 if (!EnableVPlanNativePath) {
2260 AU.addPreserved<LoopInfoWrapperPass>();
2261 AU.addPreserved<DominatorTreeWrapperPass>();
2262 }
2263
2264 AU.addPreserved<BasicAAWrapperPass>();
2265 AU.addPreserved<GlobalsAAWrapperPass>();
2266 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267 }
2268};
2269
2270} // end anonymous namespace
2271
2272//===----------------------------------------------------------------------===//
2273// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274// LoopVectorizationCostModel and LoopVectorizationPlanner.
2275//===----------------------------------------------------------------------===//
2276
2277Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2278 // We need to place the broadcast of invariant variables outside the loop,
2279 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2280 // inside the vector loop body.
2281 Instruction *Instr = dyn_cast<Instruction>(V);
2282 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283 (!Instr ||
2284 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285 // Place the code for broadcasting invariant variables in the new preheader.
2286 IRBuilder<>::InsertPointGuard Guard(Builder);
2287 if (SafeToHoist)
2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289
2290 // Broadcast the scalar into all locations in the vector.
2291 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
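  // A minimal sketch of the IR CreateVectorSplat typically emits for a fixed
  // VF of 4 and an i32 scalar %v (illustrative only; value names are
  // hypothetical):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %broadcast = shufflevector <4 x i32> %broadcast.splatinsert,
  //                              <4 x i32> poison, <4 x i32> zeroinitializer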
2292
2293 return Shuf;
2294}
2295
2296/// This function adds
2297/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2298/// to each vector element of Val. The sequence starts at StartIdx.
2299/// \p BinOp is only relevant for FP induction variables.
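/// For example (illustrative), with StartIdx = 0, Step = 2 and VF = 4, lane i
/// of the result is Val[i] + i * 2, i.e. Val + <0, 2, 4, 6>.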
2300static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301 Instruction::BinaryOps BinOp, ElementCount VF,
2302 IRBuilderBase &Builder) {
2303  assert(VF.isVector() && "only vector VFs are supported");
2304
2305 // Create and check the types.
2306 auto *ValVTy = cast<VectorType>(Val->getType());
2307 ElementCount VLen = ValVTy->getElementCount();
2308
2309 Type *STy = Val->getType()->getScalarType();
2310  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2311         "Induction Step must be an integer or FP");
2312  assert(Step->getType() == STy && "Step has wrong type");
2313
2314 SmallVector<Constant *, 8> Indices;
2315
2316 // Create a vector of consecutive numbers from zero to VF.
2317 VectorType *InitVecValVTy = ValVTy;
2318 if (STy->isFloatingPointTy()) {
2319 Type *InitVecValSTy =
2320 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322 }
2323 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324
2325 // Splat the StartIdx
2326 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327
2328 if (STy->isIntegerTy()) {
2329 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330 Step = Builder.CreateVectorSplat(VLen, Step);
2331    assert(Step->getType() == Val->getType() && "Invalid step vec");
2332 // FIXME: The newly created binary instructions should contain nsw/nuw
2333 // flags, which can be found from the original scalar operations.
2334 Step = Builder.CreateMul(InitVec, Step);
2335 return Builder.CreateAdd(Val, Step, "induction");
2336 }
2337
2338 // Floating point induction.
2339  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2340         "Binary Opcode should be specified for FP induction");
2341 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343
2344 Step = Builder.CreateVectorSplat(VLen, Step);
2345 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347}
2348
2349/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2350/// variable on which to base the steps; \p Step is the size of the step.
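/// For illustration (hypothetical values): with a fixed VF of 4, an integer
/// Step of 2 and unroll part Part = 1, the lanes produced below are
/// ScalarIV + (4 + Lane) * 2 for Lane = 0..3, i.e. ScalarIV + {8, 10, 12, 14}.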
2351static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352 const InductionDescriptor &ID, VPValue *Def,
2353 VPTransformState &State) {
2354 IRBuilderBase &Builder = State.Builder;
2355
2356 // Ensure step has the same type as that of scalar IV.
2357 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358 if (ScalarIVTy != Step->getType()) {
2359 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360 // avoid separate truncate here.
2361    assert(Step->getType()->isIntegerTy() &&
2362           "Truncation requires an integer step");
2363 Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364 }
2365
2366 // We build scalar steps for both integer and floating-point induction
2367 // variables. Here, we determine the kind of arithmetic we will perform.
2368 Instruction::BinaryOps AddOp;
2369 Instruction::BinaryOps MulOp;
2370 if (ScalarIVTy->isIntegerTy()) {
2371 AddOp = Instruction::Add;
2372 MulOp = Instruction::Mul;
2373 } else {
2374 AddOp = ID.getInductionOpcode();
2375 MulOp = Instruction::FMul;
2376 }
2377
2378 // Determine the number of scalars we need to generate for each unroll
2379 // iteration.
2380 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381 // Compute the scalar steps and save the results in State.
2382 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383 ScalarIVTy->getScalarSizeInBits());
2384 Type *VecIVTy = nullptr;
2385 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386 if (!FirstLaneOnly && State.VF.isScalable()) {
2387 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388 UnitStepVec =
2389 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392 }
2393
2394 unsigned StartPart = 0;
2395 unsigned EndPart = State.UF;
2396 unsigned StartLane = 0;
2397 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398 if (State.Instance) {
2399 StartPart = State.Instance->Part;
2400 EndPart = StartPart + 1;
2401 StartLane = State.Instance->Lane.getKnownLane();
2402 EndLane = StartLane + 1;
2403 }
2404 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406
2407 if (!FirstLaneOnly && State.VF.isScalable()) {
2408 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410 if (ScalarIVTy->isFloatingPointTy())
2411 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414 State.set(Def, Add, Part);
2415      // It's also useful to record the lane values for the known minimum number
2416      // of elements, so we do that below. This improves code quality when
2417      // trying to extract the first element, for example.
2418 }
2419
2420 if (ScalarIVTy->isFloatingPointTy())
2421 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422
2423 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424 Value *StartIdx = Builder.CreateBinOp(
2425 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426 // The step returned by `createStepForVF` is a runtime-evaluated value
2427 // when VF is scalable. Otherwise, it should be folded into a Constant.
2428      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2429             "Expected StartIdx to be folded to a constant when VF is not "
2430             "scalable");
2431 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433 State.set(Def, Add, VPIteration(Part, Lane));
2434 }
2435 }
2436}
2437
2438// Generate code for the induction step. Note that induction steps are
2439// required to be loop-invariant.
2440static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441 Instruction *InsertBefore,
2442 Loop *OrigLoop = nullptr) {
2443 const DataLayout &DL = SE.getDataLayout();
2444  assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2445         "Induction step should be loop invariant");
2446 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447 return E->getValue();
2448
2449 SCEVExpander Exp(SE, DL, "induction");
2450 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451}
2452
2453/// Compute the transformed value of Index at offset StartValue using step
2454/// StepValue.
2455/// For integer induction, returns StartValue + Index * StepValue.
2456/// For pointer induction, returns StartValue[Index * StepValue].
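/// For example (illustrative): an integer induction with StartValue = 100,
/// Index = 5 and StepValue = 3 returns 100 + 5 * 3 = 115; a pointer induction
/// returns a GEP of ID.getElementType() from StartValue with an offset of
/// Index * StepValue elements.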
2457/// FIXME: The newly created binary instructions should contain nsw/nuw
2458/// flags, which can be found from the original scalar operations.
2459static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460 Value *StartValue, Value *Step,
2461 const InductionDescriptor &ID) {
2462 Type *StepTy = Step->getType();
2463 Value *CastedIndex = StepTy->isIntegerTy()
2464 ? B.CreateSExtOrTrunc(Index, StepTy)
2465 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466 if (CastedIndex != Index) {
2467 CastedIndex->setName(CastedIndex->getName() + ".cast");
2468 Index = CastedIndex;
2469 }
2470
2471  // Note: the IR at this point is broken. We cannot use SE to create any new
2472  // SCEV and then expand it, hoping that SCEV's simplification will give us
2473  // more optimal code. Unfortunately, attempting to do so on invalid IR may
2474  // lead to various SCEV crashes. So all we can do is use the builder and rely
2475  // on InstCombine for future simplifications. Here we handle only some trivial
2476  // cases.
2477 auto CreateAdd = [&B](Value *X, Value *Y) {
2478    assert(X->getType() == Y->getType() && "Types don't match!");
2479 if (auto *CX = dyn_cast<ConstantInt>(X))
2480 if (CX->isZero())
2481 return Y;
2482 if (auto *CY = dyn_cast<ConstantInt>(Y))
2483 if (CY->isZero())
2484 return X;
2485 return B.CreateAdd(X, Y);
2486 };
2487
2488 // We allow X to be a vector type, in which case Y will potentially be
2489 // splatted into a vector with the same element count.
2490 auto CreateMul = [&B](Value *X, Value *Y) {
2491    assert(X->getType()->getScalarType() == Y->getType() &&
2492           "Types don't match!");
2493 if (auto *CX = dyn_cast<ConstantInt>(X))
2494 if (CX->isOne())
2495 return Y;
2496 if (auto *CY = dyn_cast<ConstantInt>(Y))
2497 if (CY->isOne())
2498 return X;
2499 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500 if (XVTy && !isa<VectorType>(Y->getType()))
2501 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502 return B.CreateMul(X, Y);
2503 };
2504
2505 switch (ID.getKind()) {
2506 case InductionDescriptor::IK_IntInduction: {
2507    assert(!isa<VectorType>(Index->getType()) &&
2508           "Vector indices not supported for integer inductions yet");
2509    assert(Index->getType() == StartValue->getType() &&
2510           "Index type does not match StartValue type");
2511 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512 return B.CreateSub(StartValue, Index);
2513 auto *Offset = CreateMul(Index, Step);
2514 return CreateAdd(StartValue, Offset);
2515 }
2516 case InductionDescriptor::IK_PtrInduction: {
2517    assert(isa<Constant>(Step) &&
2518           "Expected constant step for pointer induction");
2519 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520 }
2521 case InductionDescriptor::IK_FpInduction: {
2522    assert(!isa<VectorType>(Index->getType()) &&
2523           "Vector indices not supported for FP inductions yet");
2524    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2525 auto InductionBinOp = ID.getInductionBinOp();
2526    assert(InductionBinOp &&
2527           (InductionBinOp->getOpcode() == Instruction::FAdd ||
2528            InductionBinOp->getOpcode() == Instruction::FSub) &&
2529           "Original bin op should be defined for FP induction");
2530
2531 Value *MulExp = B.CreateFMul(Step, Index);
2532 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533 "induction");
2534 }
2535 case InductionDescriptor::IK_NoInduction:
2536 return nullptr;
2537 }
2538 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2538)
;
2539}
2540
2541void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542 const VPIteration &Instance,
2543 VPTransformState &State) {
2544 Value *ScalarInst = State.get(Def, Instance);
2545 Value *VectorValue = State.get(Def, Instance.Part);
2546 VectorValue = Builder.CreateInsertElement(
2547 VectorValue, ScalarInst,
2548 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549 State.set(Def, VectorValue, Instance.Part);
2550}
2551
2552// Return whether we allow using masked interleave-groups (for dealing with
2553// strided loads/stores that reside in predicated blocks, or for dealing
2554// with gaps).
2555static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556 // If an override option has been passed in for interleaved accesses, use it.
2557 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558 return EnableMaskedInterleavedMemAccesses;
2559
2560 return TTI.enableMaskedInterleavedAccessVectorization();
2561}
2562
2563// Try to vectorize the interleave group that \p Instr belongs to.
2564//
2565// E.g. Translate following interleaved load group (factor = 3):
2566// for (i = 0; i < N; i+=3) {
2567// R = Pic[i]; // Member of index 0
2568// G = Pic[i+1]; // Member of index 1
2569// B = Pic[i+2]; // Member of index 2
2570// ... // do something to R, G, B
2571// }
2572// To:
2573// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2574// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2575// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2576// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2577//
2578// Or translate following interleaved store group (factor = 3):
2579// for (i = 0; i < N; i+=3) {
2580// ... do something to R, G, B
2581// Pic[i] = R; // Member of index 0
2582// Pic[i+1] = G; // Member of index 1
2583// Pic[i+2] = B; // Member of index 2
2584// }
2585// To:
2586// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2590// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2591void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594 VPValue *BlockInMask) {
2595 Instruction *Instr = Group->getInsertPos();
2596 const DataLayout &DL = Instr->getModule()->getDataLayout();
2597
2598 // Prepare for the vector type of the interleaved load/store.
2599 Type *ScalarTy = getLoadStoreType(Instr);
2600 unsigned InterleaveFactor = Group->getFactor();
2601  assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603
2604 // Prepare for the new pointers.
2605 SmallVector<Value *, 2> AddrParts;
2606 unsigned Index = Group->getIndex(Instr);
2607
2608 // TODO: extend the masked interleaved-group support to reversed access.
2609  assert((!BlockInMask || !Group->isReverse()) &&
2610         "Reversed masked interleave-group not supported.");
2611
2612 // If the group is reverse, adjust the index to refer to the last vector lane
2613 // instead of the first. We adjust the index from the first vector lane,
2614 // rather than directly getting the pointer for lane VF - 1, because the
2615 // pointer operand of the interleaved access is supposed to be uniform. For
2616 // uniform instructions, we're only required to generate a value for the
2617 // first vector lane in each unroll iteration.
2618 if (Group->isReverse())
2619 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623 State.setDebugLocFromInst(AddrPart);
2624
2625    // Note that the current instruction could be at any member index. We need
2626    // to adjust the address to point to the member of index 0.
2627 //
2628 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2629 // b = A[i]; // Member of index 0
2630    // The current pointer points to A[i+1]; adjust it to A[i].
2631 //
2632 // E.g. A[i+1] = a; // Member of index 1
2633 // A[i] = b; // Member of index 0
2634 // A[i+2] = c; // Member of index 2 (Current instruction)
2635    // The current pointer points to A[i+2]; adjust it to A[i].
2636
2637 bool InBounds = false;
2638 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639 InBounds = gep->isInBounds();
2640 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642
2643 // Cast to the vector pointer type.
2644 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647 }
2648
2649 State.setDebugLocFromInst(Instr);
2650 Value *PoisonVec = PoisonValue::get(VecTy);
2651
2652 Value *MaskForGaps = nullptr;
2653 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655    assert(MaskForGaps && "Mask for Gaps is required but it is null");
2656 }
2657
2658 // Vectorize the interleaved load group.
2659 if (isa<LoadInst>(Instr)) {
2660 // For each unroll part, create a wide load for the group.
2661 SmallVector<Value *, 2> NewLoads;
2662 for (unsigned Part = 0; Part < UF; Part++) {
2663 Instruction *NewLoad;
2664 if (BlockInMask || MaskForGaps) {
2665        assert(useMaskedInterleavedAccesses(*TTI) &&
2666               "masked interleaved groups are not allowed.");
2667 Value *GroupMask = MaskForGaps;
2668 if (BlockInMask) {
2669 Value *BlockInMaskPart = State.get(BlockInMask, Part);
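          // Replicate each lane of the block mask across the interleave factor
          // so every element of the wide vector receives its group's mask bit.
          // E.g. (illustrative) for VF = 4 and InterleaveFactor = 3, a mask
          // <m0,m1,m2,m3> becomes <m0,m0,m0, m1,m1,m1, m2,m2,m2, m3,m3,m3>.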
2670 Value *ShuffledMask = Builder.CreateShuffleVector(
2671 BlockInMaskPart,
2672 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673 "interleaved.mask");
2674 GroupMask = MaskForGaps
2675 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676 MaskForGaps)
2677 : ShuffledMask;
2678 }
2679 NewLoad =
2680 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681 GroupMask, PoisonVec, "wide.masked.vec");
2682 }
2683 else
2684 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685 Group->getAlign(), "wide.vec");
2686 Group->addMetadata(NewLoad);
2687 NewLoads.push_back(NewLoad);
2688 }
2689
2690 // For each member in the group, shuffle out the appropriate data from the
2691 // wide loads.
2692 unsigned J = 0;
2693 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694 Instruction *Member = Group->getMember(I);
2695
2696 // Skip the gaps in the group.
2697 if (!Member)
2698 continue;
2699
2700 auto StrideMask =
2701 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702 for (unsigned Part = 0; Part < UF; Part++) {
2703 Value *StridedVec = Builder.CreateShuffleVector(
2704 NewLoads[Part], StrideMask, "strided.vec");
2705
2706        // If this member has a different type, cast the result to the member type.
2707 if (Member->getType() != ScalarTy) {
2708          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2709 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711 }
2712
2713 if (Group->isReverse())
2714 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715
2716 State.set(VPDefs[J], StridedVec, Part);
2717 }
2718 ++J;
2719 }
2720 return;
2721 }
2722
2723  // The subvector type for the current instruction.
2724 auto *SubVT = VectorType::get(ScalarTy, VF);
2725
2726 // Vectorize the interleaved store group.
2727 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2729         "masked interleaved groups are not allowed.");
2730  assert((!MaskForGaps || !VF.isScalable()) &&
2731         "masking gaps for scalable vectors is not yet supported.");
2732 for (unsigned Part = 0; Part < UF; Part++) {
2733 // Collect the stored vector from each member.
2734 SmallVector<Value *, 4> StoredVecs;
2735 unsigned StoredIdx = 0;
2736 for (unsigned i = 0; i < InterleaveFactor; i++) {
2737      assert((Group->getMember(i) || MaskForGaps) &&
2738             "Fail to get a member from an interleaved store group");
2739 Instruction *Member = Group->getMember(i);
2740
2741 // Skip the gaps in the group.
2742 if (!Member) {
2743 Value *Undef = PoisonValue::get(SubVT);
2744 StoredVecs.push_back(Undef);
2745 continue;
2746 }
2747
2748 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749 ++StoredIdx;
2750
2751 if (Group->isReverse())
2752 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753
2754      // If this member has a different type, cast it to a unified type.
2755
2756 if (StoredVec->getType() != SubVT)
2757 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758
2759 StoredVecs.push_back(StoredVec);
2760 }
2761
2762 // Concatenate all vectors into a wide vector.
2763 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764
2765 // Interleave the elements in the wide vector.
2766 Value *IVec = Builder.CreateShuffleVector(
2767 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768 "interleaved.vec");
2769
2770 Instruction *NewStoreInstr;
2771 if (BlockInMask || MaskForGaps) {
2772 Value *GroupMask = MaskForGaps;
2773 if (BlockInMask) {
2774 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775 Value *ShuffledMask = Builder.CreateShuffleVector(
2776 BlockInMaskPart,
2777 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778 "interleaved.mask");
2779 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780 ShuffledMask, MaskForGaps)
2781 : ShuffledMask;
2782 }
2783 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784 Group->getAlign(), GroupMask);
2785 } else
2786 NewStoreInstr =
2787 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788
2789 Group->addMetadata(NewStoreInstr);
2790 }
2791}
2792
2793void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794 VPReplicateRecipe *RepRecipe,
2795 const VPIteration &Instance,
2796 bool IfPredicateInstr,
2797 VPTransformState &State) {
2798  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2799
2800 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801 // the first lane and part.
2802 if (isa<NoAliasScopeDeclInst>(Instr))
2803 if (!Instance.isFirstIteration())
2804 return;
2805
2806  // Does this instruction return a value?
2807 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808
2809 Instruction *Cloned = Instr->clone();
2810 if (!IsVoidRetTy)
2811 Cloned->setName(Instr->getName() + ".cloned");
2812
2813  // If the scalarized instruction contributes to the address computation of a
2814  // widened masked load/store which was in a basic block that needed predication
2815  // and is not predicated after vectorization, we can't propagate
2816  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2817  // instruction could feed a poison value to the base address of the widened
2818  // load/store.
2819 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820 Cloned->dropPoisonGeneratingFlags();
2821
2822 if (Instr->getDebugLoc())
2823 State.setDebugLocFromInst(Instr);
2824
2825 // Replace the operands of the cloned instructions with their scalar
2826 // equivalents in the new loop.
2827 for (const auto &I : enumerate(RepRecipe->operands())) {
2828 auto InputInstance = Instance;
2829 VPValue *Operand = I.value();
2830 if (vputils::isUniformAfterVectorization(Operand))
2831 InputInstance.Lane = VPLane::getFirstLane();
2832 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833 }
2834 State.addNewMetadata(Cloned, Instr);
2835
2836 // Place the cloned scalar in the new loop.
2837 State.Builder.Insert(Cloned);
2838
2839 State.set(RepRecipe, Cloned, Instance);
2840
2841  // If we just cloned a new assumption, add it to the assumption cache.
2842 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843 AC->registerAssumption(II);
2844
2845 // End if-block.
2846 if (IfPredicateInstr)
2847 PredicatedInstructions.push_back(Cloned);
2848}
2849
2850Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851 if (TripCount)
2852 return TripCount;
2853
2854  assert(InsertBlock);
2855 IRBuilder<> Builder(InsertBlock->getTerminator());
2856 // Find the loop boundaries.
2857 Type *IdxTy = Legal->getWidestInductionType();
2858  assert(IdxTy && "No type for induction");
2859 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860
2861 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862
2863 // Expand the trip count and place the new instructions in the preheader.
2864 // Notice that the pre-header does not change, only the loop body.
2865 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866
2867 // Count holds the overall loop count (N).
2868 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869 InsertBlock->getTerminator());
2870
2871 if (TripCount->getType()->isPointerTy())
2872 TripCount =
2873 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874 InsertBlock->getTerminator());
2875
2876 return TripCount;
2877}
2878
2879Value *
2880InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881 if (VectorTripCount)
2882 return VectorTripCount;
2883
2884 Value *TC = getOrCreateTripCount(InsertBlock);
2885 IRBuilder<> Builder(InsertBlock->getTerminator());
2886
2887 Type *Ty = TC->getType();
2888 // This is where we can make the step a runtime constant.
2889 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890
2891 // If the tail is to be folded by masking, round the number of iterations N
2892 // up to a multiple of Step instead of rounding down. This is done by first
2893 // adding Step-1 and then rounding down. Note that it's ok if this addition
2894 // overflows: the vector induction variable will eventually wrap to zero given
2895 // that it starts at zero and its Step is a power of two; the loop will then
2896 // exit, with the last early-exit vector comparison also producing all-true.
2897 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898 // is accounted for in emitIterationCountCheck that adds an overflow check.
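  //
  // Worked example (illustrative): with N = 10 and Step = VF * UF = 4, tail
  // folding yields n.rnd.up = 10 + 3 = 13 and n.vec = 13 - (13 % 4) = 12;
  // without tail folding, n.vec = 10 - (10 % 4) = 8.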
2899 if (Cost->foldTailByMasking()) {
2900    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2901           "VF*UF must be a power of 2 when folding tail by masking");
2902 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903 TC = Builder.CreateAdd(
2904 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905 }
2906
2907 // Now we need to generate the expression for the part of the loop that the
2908 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909 // iterations are not required for correctness, or N - Step, otherwise. Step
2910 // is equal to the vectorization factor (number of SIMD elements) times the
2911 // unroll factor (number of SIMD instructions).
2912 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913
2914 // There are cases where we *must* run at least one iteration in the remainder
2915 // loop. See the cost model for when this can happen. If the step evenly
2916 // divides the trip count, we set the remainder to be equal to the step. If
2917 // the step does not evenly divide the trip count, no adjustment is necessary
2918 // since there will already be scalar iterations. Note that the minimum
2919 // iterations check ensures that N >= Step.
2920 if (Cost->requiresScalarEpilogue(VF)) {
2921 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922 R = Builder.CreateSelect(IsZero, Step, R);
2923 }
2924
2925 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926
2927 return VectorTripCount;
2928}
2929
2930Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931 const DataLayout &DL) {
2932  // Verify that V is a vector type with the same number of elements as DstVTy.
2933 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934 unsigned VF = DstFVTy->getNumElements();
2935 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2936  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2937 Type *SrcElemTy = SrcVecTy->getElementType();
2938 Type *DstElemTy = DstFVTy->getElementType();
2939  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2940         "Vector elements must have same size");
2941
2942 // Do a direct cast if element types are castable.
2943 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945 }
2946  // V cannot be cast directly to the desired vector type. This may happen when
2947  // V is a floating point vector but DstVTy is a vector of pointers, or
2948  // vice-versa. Handle this using a two-step cast through an intermediate
2949  // integer type, i.e. Ptr <-> Int <-> Float.
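  // For example (illustrative, assuming 64-bit pointers): casting <4 x double>
  // to a vector of pointers becomes a bitcast <4 x double> -> <4 x i64>
  // followed by an inttoptr <4 x i64> -> <4 x ptr>.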
2950  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2951         "Only one type should be a pointer type");
2952  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2953         "Only one type should be a floating point type");
2954 Type *IntTy =
2955 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959}
2960
2961void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963 // Reuse existing vector loop preheader for TC checks.
2964  // Note that a new preheader block is generated for the vector loop.
2965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967
2968 // Generate code to check if the loop's trip count is less than VF * UF, or
2969 // equal to it in case a scalar epilogue is required; this implies that the
2970 // vector trip count is zero. This check also covers the case where adding one
2971  // to the backedge-taken count overflowed, leading to an incorrect trip count
2972 // of zero. In this case we will also jump to the scalar loop.
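  // For example (illustrative), with VF * UF = 8: if a scalar epilogue is
  // required we branch to the scalar loop when the trip count is <= 8 (ULE);
  // otherwise only when it is < 8 (ULT).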
2973 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974 : ICmpInst::ICMP_ULT;
2975
2976 // If tail is to be folded, vector loop takes care of all iterations.
2977 Type *CountTy = Count->getType();
2978 Value *CheckMinIters = Builder.getFalse();
2979 auto CreateStep = [&]() -> Value * {
2980 // Create step with max(MinProTripCount, UF * VF).
2981 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982 return createStepForVF(Builder, CountTy, VF, UF);
2983
2984 Value *MinProfTC =
2985 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986 if (!VF.isScalable())
2987 return MinProfTC;
2988 return Builder.CreateBinaryIntrinsic(
2989 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990 };
2991
2992 if (!Cost->foldTailByMasking())
2993 CheckMinIters =
2994 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995 else if (VF.isScalable()) {
2996 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997 // an overflow to zero when updating induction variables and so an
2998 // additional overflow check is required before entering the vector loop.
2999
3000 // Get the maximum unsigned value for the type.
3001 Value *MaxUIntTripCount =
3002 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004
3005 // Don't execute the vector loop if (UMax - n) < (VF * UF).
3006 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007 }
3008
3009 // Create new preheader for vector loop.
3010 LoopVectorPreHeader =
3011 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012 "vector.ph");
3013
3014  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3015                               DT->getNode(Bypass)->getIDom()) &&
3016         "TC check is expected to dominate Bypass");
3017
3018 // Update dominator for Bypass & LoopExit (if needed).
3019 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020 if (!Cost->requiresScalarEpilogue(VF))
3021 // If there is an epilogue which must run, there's no edge from the
3022 // middle block to exit blocks and thus no need to update the immediate
3023 // dominator of the exit blocks.
3024 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025
3026 ReplaceInstWithInst(
3027 TCCheckBlock->getTerminator(),
3028 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029 LoopBypassBlocks.push_back(TCCheckBlock);
3030}
3031
3032BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033 BasicBlock *const SCEVCheckBlock =
3034 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035 if (!SCEVCheckBlock)
3036 return nullptr;
3037
3038  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3039           (OptForSizeBasedOnProfile &&
3040            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3041         "Cannot SCEV check stride or overflow when optimizing for size");
3042
3043
3044 // Update dominator only if this is first RT check.
3045 if (LoopBypassBlocks.empty()) {
3046 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047 if (!Cost->requiresScalarEpilogue(VF))
3048 // If there is an epilogue which must run, there's no edge from the
3049 // middle block to exit blocks and thus no need to update the immediate
3050 // dominator of the exit blocks.
3051 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052 }
3053
3054 LoopBypassBlocks.push_back(SCEVCheckBlock);
3055 AddedSafetyChecks = true;
3056 return SCEVCheckBlock;
3057}
3058
3059BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060 // VPlan-native path does not do any analysis for runtime checks currently.
3061 if (EnableVPlanNativePath)
3062 return nullptr;
3063
3064 BasicBlock *const MemCheckBlock =
3065 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066
3067  // Check if we generated code that checks at runtime whether arrays overlap.
3068  // We put the checks into a separate block to make the more common case of
3069  // few elements faster.
3070 if (!MemCheckBlock)
3071 return nullptr;
3072
3073 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075           "Cannot emit memory checks when optimizing for size, unless forced "
3076           "to vectorize.");
3077 ORE->emit([&]() {
3078 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079 OrigLoop->getStartLoc(),
3080 OrigLoop->getHeader())
3081 << "Code-size may be reduced by not forcing "
3082 "vectorization, or by source-code modifications "
3083 "eliminating the need for runtime checks "
3084 "(e.g., adding 'restrict').";
3085 });
3086 }
3087
3088 LoopBypassBlocks.push_back(MemCheckBlock);
3089
3090 AddedSafetyChecks = true;
3091
3092 return MemCheckBlock;
3093}
3094
3095void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096 LoopScalarBody = OrigLoop->getHeader();
3097 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098 assert(LoopVectorPreHeader && "Invalid loop structure");
3099 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101        "multiple exit loop without required epilogue?");
3102
3103 LoopMiddleBlock =
3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105 LI, nullptr, Twine(Prefix) + "middle.block");
3106 LoopScalarPreHeader =
3107 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108 nullptr, Twine(Prefix) + "scalar.ph");
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Set up the middle block terminator. Two cases:
3113 // 1) If we know that we must execute the scalar epilogue, emit an
3114 // unconditional branch.
3115 // 2) Otherwise, we must have a single unique exit block (due to how we
3116 // implement the multiple exit case). In this case, set up a conditional
3117 // branch from the middle block to the loop scalar preheader, and the
3118 // exit block. completeLoopSkeleton will update the condition to use an
3119 // iteration check, if required to decide whether to execute the remainder.
3120 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121 BranchInst::Create(LoopScalarPreHeader) :
3122 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123 Builder.getTrue());
3124 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126
3127 // Update dominator for loop exit. During skeleton creation, only the vector
3128 // pre-header and the middle block are created. The vector loop is entirely
3129 // created during VPlan execution.
3130 if (!Cost->requiresScalarEpilogue(VF))
3131 // If there is an epilogue which must run, there's no edge from the
3132 // middle block to exit blocks and thus no need to update the immediate
3133 // dominator of the exit blocks.
3134 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135}
3136
3137PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138 PHINode *OrigPhi, const InductionDescriptor &II,
3139 ArrayRef<BasicBlock *> BypassBlocks,
3140 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142 assert(VectorTripCount && "Expected valid arguments");
3143
3144 Instruction *OldInduction = Legal->getPrimaryInduction();
3145 Value *&EndValue = IVEndValues[OrigPhi];
3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147 if (OrigPhi == OldInduction) {
3148 // We know what the end value is.
3149 EndValue = VectorTripCount;
3150 } else {
3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152
3153 // Fast-math-flags propagate from the original induction instruction.
3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156
3157 Value *Step =
3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159 EndValue =
3160 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161 EndValue->setName("ind.end");
3162
3163 // Compute the end value for the additional bypass (if applicable).
3164 if (AdditionalBypass.first) {
3165 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166 Value *Step =
3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168 EndValueFromAdditionalBypass = emitTransformedIndex(
3169 B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170 EndValueFromAdditionalBypass->setName("ind.end");
3171 }
3172 }
3173
3174 // Create phi nodes to merge from the backedge-taken check block.
3175 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176 LoopScalarPreHeader->getTerminator());
3177 // Copy original phi DL over to the new one.
3178 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179
3180 // The new PHI merges the original incoming value, in case of a bypass,
3181 // or the value at the end of the vectorized loop.
3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183
3184 // Fix the scalar body counter (PHI node).
3185 // The old induction's phi node in the scalar body needs the truncated
3186 // value.
3187 for (BasicBlock *BB : BypassBlocks)
3188 BCResumeVal->addIncoming(II.getStartValue(), BB);
3189
3190 if (AdditionalBypass.first)
3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192 EndValueFromAdditionalBypass);
3193 return BCResumeVal;
3194}
3195
3196void InnerLoopVectorizer::createInductionResumeValues(
3197 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3199         (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3200        "Inconsistent information about additional bypass.");
3201 // We are going to resume the execution of the scalar loop.
3202 // Go over all of the induction variables that we found and fix the
3203 // PHIs that are left in the scalar version of the loop.
3204 // The starting values of PHI nodes depend on the counter of the last
3205 // iteration in the vectorized loop.
3206 // If we come from a bypass edge then we need to start from the original
3207 // start value.
3208 for (const auto &InductionEntry : Legal->getInductionVars()) {
3209 PHINode *OrigPhi = InductionEntry.first;
3210 const InductionDescriptor &II = InductionEntry.second;
3211 PHINode *BCResumeVal = createInductionResumeValue(
3212 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214 }
3215}
3216
3217BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218 // The trip counts should be cached by now.
3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221
3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223
3224 // Add a check in the middle block to see if we have completed
3225 // all of the iterations in the first vector loop. Three cases:
3226 // 1) If we require a scalar epilogue, there is no conditional branch as
3227 // we unconditionally branch to the scalar preheader. Do nothing.
3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229 // Thus if tail is to be folded, we know we don't need to run the
3230 // remainder and we can use the previous value for the condition (true).
3231 // 3) Otherwise, construct a runtime check.
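// Worked example (illustrative numbers, assumed rather than taken from this
// file): with a trip count N = 10 and VF * UF = 4, the vector trip count is
// N - (N % 4) = 8, so "cmp.n" compares 10 with 8, the branch falls through
// to the scalar preheader, and the remainder loop runs the last 2
// iterations. Only when N is a multiple of VF * UF (e.g. N = 12) does the
// middle block branch directly to the exit.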
3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234 Count, VectorTripCount, "cmp.n",
3235 LoopMiddleBlock->getTerminator());
3236
3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238 // of the corresponding compare because they may have ended up with
3239 // different line numbers and we want to avoid awkward line stepping while
3240 // debugging. E.g., if the compare has a line number inside the loop.
3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243 }
3244
3245#ifdef EXPENSIVE_CHECKS
3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3247#endif
3248
3249 return LoopVectorPreHeader;
3250}
3251
3252std::pair<BasicBlock *, Value *>
3253InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254 /*
3255 In this function we generate a new loop. The new loop will contain
3256 the vectorized instructions while the old loop will continue to run the
3257 scalar remainder.
3258
3259 [ ] <-- loop iteration number check.
3260 / |
3261 / v
3262 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3263 | / |
3264 | / v
3265 || [ ] <-- vector pre header.
3266 |/ |
3267 | v
3268 | [ ] \
3269 | [ ]_| <-- vector loop (created during VPlan execution).
3270 | |
3271 | v
3272 \ -[ ] <--- middle-block.
3273 \/ |
3274 /\ v
3275 | ->[ ] <--- new preheader.
3276 | |
3277 (opt) v <-- edge from middle to exit iff epilogue is not required.
3278 | [ ] \
3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3280 \ |
3281 \ v
3282 >[ ] <-- exit block(s).
3283 ...
3284 */
3285
3286 // Create an empty vector loop, and prepare basic blocks for the runtime
3287 // checks.
3288 createVectorLoopSkeleton("");
3289
3290 // Now, compare the new count to zero. If it is zero skip the vector loop and
3291 // jump to the scalar loop. This check also covers the case where the
3292 // backedge-taken count is uint##_max: adding one to it will overflow leading
3293 // to an incorrect trip count of zero. In this (rare) case we will also jump
3294 // to the scalar loop.
3295 emitIterationCountCheck(LoopScalarPreHeader);
3296
3297 // Generate the code to check any assumptions that we've made for SCEV
3298 // expressions.
3299 emitSCEVChecks(LoopScalarPreHeader);
3300
3301 // Generate the code that checks at runtime whether arrays overlap. We put the
3302 // checks into a separate block to make the more common case of few elements
3303 // faster.
3304 emitMemRuntimeChecks(LoopScalarPreHeader);
3305
3306 // Emit phis for the new starting index of the scalar loop.
3307 createInductionResumeValues();
3308
3309 return {completeLoopSkeleton(), nullptr};
3310}
3311
3312// Fix up external users of the induction variable. At this point, we are
3313// in LCSSA form, with all external PHIs that use the IV having one input value,
3314// coming from the remainder loop. We need those PHIs to also have a correct
3315// value for the IV when arriving directly from the middle block.
3316void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317 const InductionDescriptor &II,
3318 Value *VectorTripCount, Value *EndValue,
3319 BasicBlock *MiddleBlock,
3320 BasicBlock *VectorHeader, VPlan &Plan) {
3321 // There are two kinds of external IV usages - those that use the value
3322 // computed in the last iteration (the PHI) and those that use the penultimate
3323 // value (the value that feeds into the phi from the loop latch).
3324 // We allow both, but they, obviously, have different values.
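// Illustrative sketch (assumed example, not from this file): after
//   for (i = 0; i < n; ++i) { ... }
// an outside use of the incremented value (the value fed back to the phi
// from the latch) is rewired below to EndValue, while an outside use of the
// phi itself is rewired to the "ind.escape" value, recomputed here as
// Start + Step * (VectorTripCount - 1).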
3325
3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327
3328 DenseMap<Value *, Value *> MissingVals;
3329
3330 // An external user of the last iteration's value should see the value that
3331 // the remainder loop uses to initialize its own IV.
3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333 for (User *U : PostInc->users()) {
3334 Instruction *UI = cast<Instruction>(U);
3335 if (!OrigLoop->contains(UI)) {
3336 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337 MissingVals[UI] = EndValue;
3338 }
3339 }
3340
3341 // An external user of the penultimate value needs to see EndValue - Step.
3342 // The simplest way to get this is to recompute it from the constituent SCEVs,
3343 // that is Start + (Step * (CRD - 1)).
3344 for (User *U : OrigPhi->users()) {
3345 auto *UI = cast<Instruction>(U);
3346 if (!OrigLoop->contains(UI)) {
3347 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348
3349 IRBuilder<> B(MiddleBlock->getTerminator());
3350
3351 // Fast-math-flags propagate from the original induction instruction.
3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354
3355 Value *CountMinusOne = B.CreateSub(
3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357 CountMinusOne->setName("cmo");
3358 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359 VectorHeader->getTerminator());
3360 Value *Escape =
3361 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362 Escape->setName("ind.escape");
3363 MissingVals[UI] = Escape;
3364 }
3365 }
3366
3367 for (auto &I : MissingVals) {
3368 PHINode *PHI = cast<PHINode>(I.first);
3369 // One corner case we have to handle is two IVs "chasing" each other,
3370 // that is %IV2 = phi [...], [ %IV1, %latch ]
3371 // In this case, if IV1 has an external use, we need to avoid adding both
3372 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373 // don't already have an incoming value for the middle block.
3374 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375 PHI->addIncoming(I.second, MiddleBlock);
3376 Plan.removeLiveOut(PHI);
3377 }
3378 }
3379}
3380
3381namespace {
3382
3383struct CSEDenseMapInfo {
3384 static bool canHandle(const Instruction *I) {
3385 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387 }
3388
3389 static inline Instruction *getEmptyKey() {
3390 return DenseMapInfo<Instruction *>::getEmptyKey();
3391 }
3392
3393 static inline Instruction *getTombstoneKey() {
3394 return DenseMapInfo<Instruction *>::getTombstoneKey();
3395 }
3396
3397 static unsigned getHashValue(const Instruction *I) {
3398 assert(canHandle(I) && "Unknown instruction!");
3399 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400 I->value_op_end()));
3401 }
3402
3403 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406 return LHS == RHS;
3407 return LHS->isIdenticalTo(RHS);
3408 }
3409};
3410
3411} // end anonymous namespace
3412
3413 /// Perform CSE of induction variable instructions.
3414static void cse(BasicBlock *BB) {
3415 // Perform simple cse.
3416 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418 if (!CSEDenseMapInfo::canHandle(&In))
3419 continue;
3420
3421 // Check if we can replace this instruction with any of the
3422 // visited instructions.
3423 if (Instruction *V = CSEMap.lookup(&In)) {
3424 In.replaceAllUsesWith(V);
3425 In.eraseFromParent();
3426 continue;
3427 }
3428
3429 CSEMap[&In] = &In;
3430 }
3431}
3432
3433InstructionCost
3434LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435 bool &NeedToScalarize) const {
3436 Function *F = CI->getCalledFunction();
3437 Type *ScalarRetTy = CI->getType();
3438 SmallVector<Type *, 4> Tys, ScalarTys;
3439 for (auto &ArgOp : CI->args())
3440 ScalarTys.push_back(ArgOp->getType());
3441
3442 // Estimate cost of scalarized vector call. The source operands are assumed
3443 // to be vectors, so we need to extract individual elements from there,
3444 // execute VF scalar calls, and then gather the result into the vector return
3445 // value.
3446 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447 InstructionCost ScalarCallCost =
3448 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449 if (VF.isScalar())
3450 return ScalarCallCost;
3451
3452 // Compute corresponding vector type for return value and arguments.
3453 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454 for (Type *ScalarTy : ScalarTys)
3455 Tys.push_back(ToVectorTy(ScalarTy, VF));
3456
3457 // Compute costs of unpacking argument values for the scalar calls and
3458 // packing the return values to a vector.
3459 InstructionCost ScalarizationCost =
3460 getScalarizationOverhead(CI, VF, CostKind);
3461
3462 InstructionCost Cost =
3463 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
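// Worked example (illustrative numbers only): with ScalarCallCost = 10,
// VF = 4 and ScalarizationCost = 6, the scalarized estimate is
// 10 * 4 + 6 = 46; a vector library variant costing, say, 20 would be
// preferred below and NeedToScalarize would be cleared.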
3464
3465 // If we can't emit a vector call for this function, then the currently found
3466 // cost is the cost we need to return.
3467 NeedToScalarize = true;
3468 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470
3471 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472 return Cost;
3473
3474 // If the corresponding vector cost is cheaper, return its cost.
3475 InstructionCost VectorCallCost =
3476 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477 if (VectorCallCost < Cost) {
3478 NeedToScalarize = false;
3479 Cost = VectorCallCost;
3480 }
3481 return Cost;
3482}
3483
3484static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486 return Elt;
3487 return VectorType::get(Elt, VF);
3488}
3489
3490InstructionCost
3491LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492 ElementCount VF) const {
3493 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494 assert(ID && "Expected intrinsic call!");
3495 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496 FastMathFlags FMF;
3497 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498 FMF = FPMO->getFastMathFlags();
3499
3500 SmallVector<const Value *> Arguments(CI->args());
3501 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502 SmallVector<Type *> ParamTys;
3503 std::transform(FTy->param_begin(), FTy->param_end(),
3504 std::back_inserter(ParamTys),
3505 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506
3507 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508 dyn_cast<IntrinsicInst>(CI));
3509 return TTI.getIntrinsicInstrCost(CostAttrs,
3510 TargetTransformInfo::TCK_RecipThroughput);
3511}
3512
3513static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517}
3518
3519static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523}
3524
3525void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3526 // For every instruction `I` in MinBWs, truncate the operands, create a
3527 // truncated version of `I` and reextend its result. InstCombine runs
3528 // later and will remove any ext/trunc pairs.
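// Minimal sketch (assumed example, VF = 4, minimal bitwidth 8): an add such
// as
//   %a = add <4 x i32> %x, %y
// is rewritten to
//   %xt = trunc <4 x i32> %x to <4 x i8>
//   %yt = trunc <4 x i32> %y to <4 x i8>
//   %at = add <4 x i8> %xt, %yt
//   %a.ext = zext <4 x i8> %at to <4 x i32>
// and InstCombine later folds away any redundant trunc/ext pairs.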
3529 SmallPtrSet<Value *, 4> Erased;
3530 for (const auto &KV : Cost->getMinimalBitwidths()) {
3531 // If the value wasn't vectorized, we must maintain the original scalar
3532 // type. The absence of the value from State indicates that it
3533 // wasn't vectorized.
3534 // FIXME: Should not rely on getVPValue at this point.
3535 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536 if (!State.hasAnyVectorValue(Def))
3537 continue;
3538 for (unsigned Part = 0; Part < UF; ++Part) {
3539 Value *I = State.get(Def, Part);
3540 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541 continue;
3542 Type *OriginalTy = I->getType();
3543 Type *ScalarTruncatedTy =
3544 IntegerType::get(OriginalTy->getContext(), KV.second);
3545 auto *TruncatedTy = VectorType::get(
3546 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547 if (TruncatedTy == OriginalTy)
3548 continue;
3549
3550 IRBuilder<> B(cast<Instruction>(I));
3551 auto ShrinkOperand = [&](Value *V) -> Value * {
3552 if (auto *ZI = dyn_cast<ZExtInst>(V))
3553 if (ZI->getSrcTy() == TruncatedTy)
3554 return ZI->getOperand(0);
3555 return B.CreateZExtOrTrunc(V, TruncatedTy);
3556 };
3557
3558 // The actual instruction modification depends on the instruction type,
3559 // unfortunately.
3560 Value *NewI = nullptr;
3561 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563 ShrinkOperand(BO->getOperand(1)));
3564
3565 // Any wrapping introduced by shrinking this operation shouldn't be
3566 // considered undefined behavior. So, we can't unconditionally copy
3567 // arithmetic wrapping flags to NewI.
3568 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570 NewI =
3571 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572 ShrinkOperand(CI->getOperand(1)));
3573 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574 NewI = B.CreateSelect(SI->getCondition(),
3575 ShrinkOperand(SI->getTrueValue()),
3576 ShrinkOperand(SI->getFalseValue()));
3577 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578 switch (CI->getOpcode()) {
3579 default:
3580 llvm_unreachable("Unhandled cast!");
3581 case Instruction::Trunc:
3582 NewI = ShrinkOperand(CI->getOperand(0));
3583 break;
3584 case Instruction::SExt:
3585 NewI = B.CreateSExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 case Instruction::ZExt:
3590 NewI = B.CreateZExtOrTrunc(
3591 CI->getOperand(0),
3592 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593 break;
3594 }
3595 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596 auto Elements0 =
3597 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598 auto *O0 = B.CreateZExtOrTrunc(
3599 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600 auto Elements1 =
3601 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602 auto *O1 = B.CreateZExtOrTrunc(
3603 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604
3605 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607 // Don't do anything with the operands, just extend the result.
3608 continue;
3609 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610 auto Elements =
3611 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612 auto *O0 = B.CreateZExtOrTrunc(
3613 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617 auto Elements =
3618 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619 auto *O0 = B.CreateZExtOrTrunc(
3620 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622 } else {
3623 // If we don't know what to do, be conservative and don't do anything.
3624 continue;
3625 }
3626
3627 // Lastly, extend the result.
3628 NewI->takeName(cast<Instruction>(I));
3629 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630 I->replaceAllUsesWith(Res);
3631 cast<Instruction>(I)->eraseFromParent();
3632 Erased.insert(I);
3633 State.reset(Def, Res, Part);
3634 }
3635 }
3636
3637 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3638 for (const auto &KV : Cost->getMinimalBitwidths()) {
3639 // If the value wasn't vectorized, we must maintain the original scalar
3640 // type. The absence of the value from State indicates that it
3641 // wasn't vectorized.
3642 // FIXME: Should not rely on getVPValue at this point.
3643 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644 if (!State.hasAnyVectorValue(Def))
3645 continue;
3646 for (unsigned Part = 0; Part < UF; ++Part) {
3647 Value *I = State.get(Def, Part);
3648 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649 if (Inst && Inst->use_empty()) {
3650 Value *NewI = Inst->getOperand(0);
3651 Inst->eraseFromParent();
3652 State.reset(Def, NewI, Part);
3653 }
3654 }
3655 }
3656}
3657
3658void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659 VPlan &Plan) {
3660 // Insert truncates and extends for any truncated instructions as hints to
3661 // InstCombine.
3662 if (VF.isVector())
3663 truncateToMinimalBitwidths(State);
3664
3665 // Fix widened non-induction PHIs by setting up the PHI operands.
3666 if (EnableVPlanNativePath)
3667 fixNonInductionPHIs(Plan, State);
3668
3669 // At this point every instruction in the original loop is widened to a
3670 // vector form. Now we need to fix the recurrences in the loop. These PHI
3671 // nodes are currently empty because we did not want to introduce cycles.
3672 // This is the second stage of vectorizing recurrences.
3673 fixCrossIterationPHIs(State);
3674
3675 // Forget the original basic block.
3676 PSE.getSE()->forgetLoop(OrigLoop);
3677
3678 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680 if (Cost->requiresScalarEpilogue(VF)) {
3681 // No edge from the middle block to the unique exit block has been inserted
3682 // and there is nothing to fix from vector loop; phis should have incoming
3683 // from scalar loop only.
3684 Plan.clearLiveOuts();
3685 } else {
3686 // If we inserted an edge from the middle block to the unique exit block,
3687 // update uses outside the loop (phis) to account for the newly inserted
3688 // edge.
3689
3690 // Fix-up external users of the induction variables.
3691 for (const auto &Entry : Legal->getInductionVars())
3692 fixupIVUsers(Entry.first, Entry.second,
3693 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694 IVEndValues[Entry.first], LoopMiddleBlock,
3695 VectorLoop->getHeader(), Plan);
3696 }
3697
3698 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699 // in the exit block, so update the builder.
3700 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701 for (const auto &KV : Plan.getLiveOuts())
3702 KV.second->fixPhi(Plan, State);
3703
3704 for (Instruction *PI : PredicatedInstructions)
3705 sinkScalarOperands(&*PI);
3706
3707 // Remove redundant induction instructions.
3708 cse(VectorLoop->getHeader());
3709
3710 // Set/update profile weights for the vector and remainder loops as original
3711 // loop iterations are now distributed among them. Note that original loop
3712 // represented by LoopScalarBody becomes remainder loop after vectorization.
3713 //
3714 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3715 // end up getting a slightly roughened result but that should be OK since
3716 // profile is not inherently precise anyway. Note also possible bypass of
3717 // vector code caused by legality checks is ignored, assigning all the weight
3718 // to the vector loop, optimistically.
3719 //
3720 // For scalable vectorization we can't know at compile time how many iterations
3721 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3722 // vscale of '1'.
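// Worked example (illustrative): if profile data says the original loop ran
// ~1000 iterations and VF * UF = 8, the vector loop is credited with roughly
// 1000 / 8 = 125 iterations and the scalar remainder with the handful left
// over; for scalable VFs the factor passed below is
// VF.getKnownMinValue() * UF, i.e. vscale is pessimistically taken as 1.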
3723 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724 LI->getLoopFor(LoopScalarBody),
3725 VF.getKnownMinValue() * UF);
3726}
3727
3728void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729 // In order to support recurrences we need to be able to vectorize Phi nodes.
3730 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731 // stage #2: We now need to fix the recurrences by adding incoming edges to
3732 // the currently empty PHI nodes. At this point every instruction in the
3733 // original loop is widened to a vector form so we can use them to construct
3734 // the incoming edges.
3735 VPBasicBlock *Header =
3736 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737 for (VPRecipeBase &R : Header->phis()) {
3738 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739 fixReduction(ReductionPhi, State);
3740 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741 fixFixedOrderRecurrence(FOR, State);
3742 }
3743}
3744
3745void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747 // This is the second phase of vectorizing first-order recurrences. An
3748 // overview of the transformation is described below. Suppose we have the
3749 // following loop.
3750 //
3751 // for (int i = 0; i < n; ++i)
3752 // b[i] = a[i] - a[i - 1];
3753 //
3754 // There is a first-order recurrence on "a". For this loop, the shorthand
3755 // scalar IR looks like:
3756 //
3757 // scalar.ph:
3758 // s_init = a[-1]
3759 // br scalar.body
3760 //
3761 // scalar.body:
3762 // i = phi [0, scalar.ph], [i+1, scalar.body]
3763 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764 // s2 = a[i]
3765 // b[i] = s2 - s1
3766 // br cond, scalar.body, ...
3767 //
3768 // In this example, s1 is a recurrence because its value depends on the
3769 // previous iteration. In the first phase of vectorization, we created a
3770 // vector phi v1 for s1. We now complete the vectorization and produce the
3771 // shorthand vector IR shown below (for VF = 4, UF = 1).
3772 //
3773 // vector.ph:
3774 // v_init = vector(..., ..., ..., a[-1])
3775 // br vector.body
3776 //
3777 // vector.body
3778 // i = phi [0, vector.ph], [i+4, vector.body]
3779 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3780 // v2 = a[i, i+1, i+2, i+3];
3781 // v3 = vector(v1(3), v2(0, 1, 2))
3782 // b[i, i+1, i+2, i+3] = v2 - v3
3783 // br cond, vector.body, middle.block
3784 //
3785 // middle.block:
3786 // x = v2(3)
3787 // br scalar.ph
3788 //
3789 // scalar.ph:
3790 // s_init = phi [x, middle.block], [a[-1], otherwise]
3791 // br scalar.body
3792 //
3793 // After execution completes the vector loop, we extract the next value of
3794 // the recurrence (x) to use as the initial value in the scalar loop.
3795
3796 // Extract the last vector element in the middle block. This will be the
3797 // initial value for the recurrence when jumping to the scalar loop.
3798 VPValue *PreviousDef = PhiR->getBackedgeValue();
3799 Value *Incoming = State.get(PreviousDef, UF - 1);
3800 auto *ExtractForScalar = Incoming;
3801 auto *IdxTy = Builder.getInt32Ty();
3802 if (VF.isVector()) {
3803 auto *One = ConstantInt::get(IdxTy, 1);
3804 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808 "vector.recur.extract");
3809 }
3810 // Extract the second last element in the middle block if the
3811 // Phi is used outside the loop. We need to extract the phi itself
3812 // and not the last element (the phi update in the current iteration). This
3813 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3814 // when the scalar loop is not run at all.
3815 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816 if (VF.isVector()) {
3817 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820 Incoming, Idx, "vector.recur.extract.for.phi");
3821 } else if (UF > 1)
3822 // When loop is unrolled without vectorizing, initialize
3823 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3824 // of `Incoming`. This is analogous to the vectorized case above: extracting
3825 // the second last element when VF > 1.
3826 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827
3828 // Fix the initial value of the original recurrence in the scalar loop.
3829 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835 Start->addIncoming(Incoming, BB);
3836 }
3837
3838 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839 Phi->setName("scalar.recur");
3840
3841 // Finally, fix users of the recurrence outside the loop. The users will need
3842 // either the last value of the scalar recurrence or the last value of the
3843 // vector recurrence we extracted in the middle block. Since the loop is in
3844 // LCSSA form, we just need to find all the phi nodes for the original scalar
3845 // recurrence in the exit block, and then add an edge for the middle block.
3846 // Note that LCSSA does not imply single entry when the original scalar loop
3847 // had multiple exiting edges (as we always run the last iteration in the
3848 // scalar epilogue); in that case, there is no edge from middle to exit
3849 // and thus no phis which need to be updated.
3850 if (!Cost->requiresScalarEpilogue(VF))
3851 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854 State.Plan->removeLiveOut(&LCSSAPhi);
3855 }
3856}
3857
3858void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859 VPTransformState &State) {
3860 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861 // Get its reduction variable descriptor.
3862 assert(Legal->isReductionVariable(OrigPhi) &&
3863        "Unable to find the reduction variable");
3864 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865
3866 RecurKind RK = RdxDesc.getRecurrenceKind();
3867 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869 State.setDebugLocFromInst(ReductionStartValue);
3870
3871 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872 // This is the vector-clone of the value that leaves the loop.
3873 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874
3875 // Wrap flags are in general invalid after vectorization, clear them.
3876 clearReductionWrapFlags(PhiR, State);
3877
3878 // Before each round, move the insertion point right between
3879 // the PHIs and the values we are going to write.
3880 // This allows us to write both PHINodes and the extractelement
3881 // instructions.
3882 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883
3884 State.setDebugLocFromInst(LoopExitInst);
3885
3886 Type *PhiTy = OrigPhi->getType();
3887
3888 VPBasicBlock *LatchVPBB =
3889 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891 // If tail is folded by masking, the vector value to leave the loop should be
3892 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893 // instead of the former. For an inloop reduction the reduction will already
3894 // be predicated, and does not need to be handled here.
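// Illustrative shape (assumed, VF = 4): the tail-folded loop body contains
//   %rdx.next = add <4 x i32> %rdx.phi, %val
//   %sel      = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// and it is %sel, not %rdx.next, that is treated below as the value leaving
// the loop when forming the final reduction in the middle block.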
3895 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896 for (unsigned Part = 0; Part < UF; ++Part) {
3897 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898 SelectInst *Sel = nullptr;
3899 for (User *U : VecLoopExitInst->users()) {
3900 if (isa<SelectInst>(U)) {
3901 assert(!Sel && "Reduction exit feeding two selects");
3902 Sel = cast<SelectInst>(U);
3903 } else
3904 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3905 }
3906 assert(Sel && "Reduction exit feeds no select");
3907 State.reset(LoopExitInstDef, Sel, Part);
3908
3909 if (isa<FPMathOperator>(Sel))
3910 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911
3912 // If the target can create a predicated operator for the reduction at no
3913 // extra cost in the loop (for example a predicated vadd), it can be
3914 // cheaper for the select to remain in the loop than be sunk out of it,
3915 // and so use the select value for the phi instead of the old
3916 // LoopExitValue.
3917 if (PreferPredicatedReductionSelect ||
3918 TTI->preferPredicatedReductionSelect(
3919 RdxDesc.getOpcode(), PhiTy,
3920 TargetTransformInfo::ReductionFlags())) {
3921 auto *VecRdxPhi =
3922 cast<PHINode>(State.get(PhiR, Part));
3923 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924 }
3925 }
3926 }
3927
3928 // If the vector reduction can be performed in a smaller type, we truncate
3929 // then extend the loop exit value to enable InstCombine to evaluate the
3930 // entire expression in the smaller type.
3931 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3933 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935 VectorParts RdxParts(UF);
3936 for (unsigned Part = 0; Part < UF; ++Part) {
3937 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940 : Builder.CreateZExt(Trunc, VecTy);
3941 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942 if (U != Trunc) {
3943 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944 RdxParts[Part] = Extnd;
3945 }
3946 }
3947 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951 }
3952 }
3953
3954 // Reduce all of the unrolled parts into a single vector.
3955 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957
3958 // The middle block terminator has already been assigned a DebugLoc here (the
3959 // OrigLoop's single latch terminator). We want the whole middle block to
3960 // appear to execute on this line because: (a) it is all compiler generated,
3961 // (b) these instructions are always executed after evaluating the latch
3962 // conditional branch, and (c) other passes may add new predecessors which
3963 // terminate on this line. This is the easiest way to ensure we don't
3964 // accidentally cause an extra step back into the loop while debugging.
3965 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966 if (PhiR->isOrdered())
3967 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968 else {
3969 // Floating-point operations should have some FMF to enable the reduction.
3970 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972 for (unsigned Part = 1; Part < UF; ++Part) {
3973 Value *RdxPart = State.get(LoopExitInstDef, Part);
3974 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975 ReducedPartRdx = Builder.CreateBinOp(
3976 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979 ReducedPartRdx, RdxPart);
3980 else
3981 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982 }
3983 }
3984
3985 // Create the reduction after the loop. Note that inloop reductions create the
3986 // target reduction in the loop using a Reduction recipe.
3987 if (VF.isVector() && !PhiR->isInLoop()) {
3988 ReducedPartRdx =
3989 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990 // If the reduction can be performed in a smaller type, we need to extend
3991 // the reduction to the wider type before we branch to the original loop.
3992 if (PhiTy != RdxDesc.getRecurrenceType())
3993 ReducedPartRdx = RdxDesc.isSigned()
3994 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996 }
3997
3998 PHINode *ResumePhi =
3999 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000
4001 // Create a phi node that merges control-flow from the backedge-taken check
4002 // block and the middle block.
4003 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004 LoopScalarPreHeader->getTerminator());
4005
4006 // If we are fixing reductions in the epilogue loop then we should already
4007 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008 // we carry over the incoming values correctly.
4009 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010 if (Incoming == LoopMiddleBlock)
4011 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014 Incoming);
4015 else
4016 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017 }
4018
4019 // Set the resume value for this reduction
4020 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021
4022 // If there were stores of the reduction value to a uniform memory address
4023 // inside the loop, create the final store here.
4024 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025 StoreInst *NewSI =
4026 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027 propagateMetadata(NewSI, SI);
4028
4029 // If the reduction value is used in other places,
4030 // then let the code below create PHI's for that.
4031 }
4032
4033 // Now, we need to fix the users of the reduction variable
4034 // inside and outside of the scalar remainder loop.
4035
4036 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037 // in the exit blocks. See comment on analogous loop in
4038 // fixFixedOrderRecurrence for a more complete explanation of the logic.
4039 if (!Cost->requiresScalarEpilogue(VF))
4040 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043 State.Plan->removeLiveOut(&LCSSAPhi);
4044 }
4045
4046 // Fix the scalar loop reduction variable with the incoming reduction sum
4047 // from the vector body and from the backedge value.
4048 int IncomingEdgeBlockIdx =
4049 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4051 // Pick the other block.
4052 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055}
4056
4057void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058 VPTransformState &State) {
4059 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060 RecurKind RK = RdxDesc.getRecurrenceKind();
4061 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062 return;
4063
4064 SmallVector<VPValue *, 8> Worklist;
4065 SmallPtrSet<VPValue *, 8> Visited;
4066 Worklist.push_back(PhiR);
4067 Visited.insert(PhiR);
4068
4069 while (!Worklist.empty()) {
4070 VPValue *Cur = Worklist.pop_back_val();
4071 for (unsigned Part = 0; Part < UF; ++Part) {
4072 Value *V = State.get(Cur, Part);
4073 if (!isa<OverflowingBinaryOperator>(V))
4074 break;
4075 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4076 }
4077
4078 for (VPUser *U : Cur->users()) {
4079 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4080 if (!UserRecipe)
4081 continue;
4082 for (VPValue *V : UserRecipe->definedValues())
4083 if (Visited.insert(V).second)
4084 Worklist.push_back(V);
4085 }
4086 }
4087}
4088
4089void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090 // The basic block and loop containing the predicated instruction.
4091 auto *PredBB = PredInst->getParent();
4092 auto *VectorLoop = LI->getLoopFor(PredBB);
4093
4094 // Initialize a worklist with the operands of the predicated instruction.
4095 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096
4097 // Holds instructions that we need to analyze again. An instruction may be
4098 // reanalyzed if we don't yet know if we can sink it or not.
4099 SmallVector<Instruction *, 8> InstsToReanalyze;
4100
4101 // Returns true if a given use occurs in the predicated block. Phi nodes use
4102 // their operands in their corresponding predecessor blocks.
4103 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104 auto *I = cast<Instruction>(U.getUser());
4105 BasicBlock *BB = I->getParent();
4106 if (auto *Phi = dyn_cast<PHINode>(I))
4107 BB = Phi->getIncomingBlock(
4108 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109 return BB == PredBB;
4110 };
4111
4112 // Iteratively sink the scalarized operands of the predicated instruction
4113 // into the block we created for it. When an instruction is sunk, its
4114 // operands are then added to the worklist. The algorithm ends after one pass
4115 // through the worklist doesn't sink a single instruction.
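// Small example (assumed, not from this file): if a predicated store's
// address computation
//   %gep = getelementptr inbounds i32, ptr %base, i64 %idx
// is used only inside the predicated block, it is moved there; its operands
// are then re-queued, so a whole scalarized address chain can migrate into
// the block over successive passes of the worklist.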
4116 bool Changed;
4117 do {
4118 // Add the instructions that need to be reanalyzed to the worklist, and
4119 // reset the changed indicator.
4120 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121 InstsToReanalyze.clear();
4122 Changed = false;
4123
4124 while (!Worklist.empty()) {
4125 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126
4127 // We can't sink an instruction if it is a phi node, is not in the loop,
4128 // or may have side effects.
4129 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130 I->mayHaveSideEffects())
4131 continue;
4132
4133 // If the instruction is already in PredBB, check if we can sink its
4134 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135 // sinking the scalar instruction I, hence it appears in PredBB; but it
4136 // may have failed to sink I's operands (recursively), which we try
4137 // (again) here.
4138 if (I->getParent() == PredBB) {
4139 Worklist.insert(I->op_begin(), I->op_end());
4140 continue;
4141 }
4142
4143 // It's legal to sink the instruction if all its uses occur in the
4144 // predicated block. Otherwise, there's nothing to do yet, and we may
4145 // need to reanalyze the instruction.
4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147 InstsToReanalyze.push_back(I);
4148 continue;
4149 }
4150
4151 // Move the instruction to the beginning of the predicated block, and add
4152 // its operands to the worklist.
4153 I->moveBefore(&*PredBB->getFirstInsertionPt());
4154 Worklist.insert(I->op_begin(), I->op_end());
4155
4156 // The sinking may have enabled other instructions to be sunk, so we will
4157 // need to iterate.
4158 Changed = true;
4159 }
4160 } while (Changed);
4161}
4162
4163void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164 VPTransformState &State) {
4165 auto Iter = vp_depth_first_deep(Plan.getEntry());
4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167 for (VPRecipeBase &P : VPBB->phis()) {
4168 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169 if (!VPPhi)
4170 continue;
4171 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172 // Make sure the builder has a valid insert point.
4173 Builder.SetInsertPoint(NewPhi);
4174 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175 VPValue *Inc = VPPhi->getIncomingValue(i);
4176 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178 }
4179 }
4180 }
4181}
4182
4183bool InnerLoopVectorizer::useOrderedReductions(
4184 const RecurrenceDescriptor &RdxDesc) {
4185 return Cost->useOrderedReductions(RdxDesc);
4186}
4187
4188void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189 // We should not collect Scalars more than once per VF. Right now, this
4190 // function is called from collectUniformsAndScalars(), which already does
4191 // this check. Collecting Scalars for VF=1 does not make any sense.
4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4193        "This function should not be visited twice for the same VF");
4194
4195 // This avoids any chances of creating a REPLICATE recipe during planning
4196 // since that would result in generation of scalarized code during execution,
4197 // which is not supported for scalable vectors.
4198 if (VF.isScalable()) {
4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200 return;
4201 }
4202
4203 SmallSetVector<Instruction *, 8> Worklist;
4204
4205 // These sets are used to seed the analysis with pointers used by memory
4206 // accesses that will remain scalar.
4207 SmallSetVector<Instruction *, 8> ScalarPtrs;
4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209 auto *Latch = TheLoop->getLoopLatch();
4210
4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212 // The pointer operands of loads and stores will be scalar as long as the
4213 // memory access is not a gather or scatter operation. The value operand of a
4214 // store will remain scalar if the store is scalarized.
4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217 assert(WideningDecision != CM_Unknown &&
4218        "Widening decision should be ready at this moment");
4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220 if (Ptr == Store->getValueOperand())
4221 return WideningDecision == CM_Scalarize;
4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4223 "Ptr is neither a value or pointer operand");
4224 return WideningDecision != CM_GatherScatter;
4225 };
4226
4227 // A helper that returns true if the given value is a bitcast or
4228 // getelementptr instruction contained in the loop.
4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231 isa<GetElementPtrInst>(V)) &&
4232 !TheLoop->isLoopInvariant(V);
4233 };
4234
4235 // A helper that evaluates a memory access's use of a pointer. If the use will
4236 // be a scalar use and the pointer is only used by memory accesses, we place
4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238 // PossibleNonScalarPtrs.
4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240 // We only care about bitcast and getelementptr instructions contained in
4241 // the loop.
4242 if (!isLoopVaryingBitCastOrGEP(Ptr))
4243 return;
4244
4245 // If the pointer has already been identified as scalar (e.g., if it was
4246 // also identified as uniform), there's nothing to do.
4247 auto *I = cast<Instruction>(Ptr);
4248 if (Worklist.count(I))
4249 return;
4250
4251 // If the use of the pointer will be a scalar use, and all users of the
4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253 // place the pointer in PossibleNonScalarPtrs.
4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255 return isa<LoadInst>(U) || isa<StoreInst>(U);
4256 }))
4257 ScalarPtrs.insert(I);
4258 else
4259 PossibleNonScalarPtrs.insert(I);
4260 };
4261
4262 // We seed the scalars analysis with two classes of instructions: (1)
4263 // instructions marked uniform-after-vectorization and (2) bitcast,
4264 // getelementptr and (pointer) phi instructions used by memory accesses
4265 // requiring a scalar use.
4266 //
4267 // (1) Add to the worklist all instructions that have been identified as
4268 // uniform-after-vectorization.
4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270
4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272 // memory accesses requiring a scalar use. The pointer operands of loads and
4273 // stores will be scalar as long as the memory access is not a gather or
4274 // scatter operation. The value operand of a store will remain scalar if the
4275 // store is scalarized.
4276 for (auto *BB : TheLoop->blocks())
4277 for (auto &I : *BB) {
4278 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279 evaluatePtrUse(Load, Load->getPointerOperand());
4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281 evaluatePtrUse(Store, Store->getPointerOperand());
4282 evaluatePtrUse(Store, Store->getValueOperand());
4283 }
4284 }
4285 for (auto *I : ScalarPtrs)
4286 if (!PossibleNonScalarPtrs.count(I)) {
4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4288 Worklist.insert(I);
4289 }
4290
4291 // Insert the forced scalars.
4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293 // induction variable when the PHI user is scalarized.
4294 auto ForcedScalar = ForcedScalars.find(VF);
4295 if (ForcedScalar != ForcedScalars.end())
4296 for (auto *I : ForcedScalar->second) {
4297 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4298 Worklist.insert(I);
4299 }
4300
4301 // Expand the worklist by looking through any bitcasts and getelementptr
4302 // instructions we've already identified as scalar. This is similar to the
4303 // expansion step in collectLoopUniforms(); however, here we're only
4304 // expanding to include additional bitcasts and getelementptr instructions.
4305 unsigned Idx = 0;
4306 while (Idx != Worklist.size()) {
4307 Instruction *Dst = Worklist[Idx++];
4308 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309 continue;
4310 auto *Src = cast<Instruction>(Dst->getOperand(0));
4311 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312 auto *J = cast<Instruction>(U);
4313 return !TheLoop->contains(J) || Worklist.count(J) ||
4314 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315 isScalarUse(J, Src));
4316 })) {
4317 Worklist.insert(Src);
4318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4319 }
4320 }
4321
4322 // An induction variable will remain scalar if all users of the induction
4323 // variable and induction variable update remain scalar.
4324 for (const auto &Induction : Legal->getInductionVars()) {
4325 auto *Ind = Induction.first;
4326 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327
4328 // If tail-folding is applied, the primary induction variable will be used
4329 // to feed a vector compare.
4330 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331 continue;
4332
4333 // Returns true if \p Indvar is a pointer induction that is used directly by
4334 // load/store instruction \p I.
4335 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336 Instruction *I) {
4337 return Induction.second.getKind() ==
4338 InductionDescriptor::IK_PtrInduction &&
4339 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341 };
4342
4343 // Determine if all users of the induction variable are scalar after
4344 // vectorization.
4345 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346 auto *I = cast<Instruction>(U);
4347 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349 });
4350 if (!ScalarInd)
4351 continue;
4352
4353 // Determine if all users of the induction variable update instruction are
4354 // scalar after vectorization.
4355 auto ScalarIndUpdate =
4356 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357 auto *I = cast<Instruction>(U);
4358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360 });
4361 if (!ScalarIndUpdate)
4362 continue;
4363
4364 // The induction variable and its update instruction will remain scalar.
4365 Worklist.insert(Ind);
4366 Worklist.insert(IndUpdate);
4367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369 << "\n");
4370 }
4371
4372 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373}
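The expansion loop above runs the worklist to a fixed point: an operand is pulled in only once every in-loop user of it is already known to be scalar (or is a load/store that uses it as a scalar pointer). A minimal standalone sketch of that idea, on a toy def-use graph, follows; the graph encoding and the name expandScalarSet are hypothetical, and this code is not part of LoopVectorize.cpp.

#include <map>
#include <set>
#include <vector>

// Toy model: OperandOf maps a value id to the single operand we would look
// through; UsersOf maps a value id to its in-loop users.
std::set<int> expandScalarSet(std::set<int> Scalars,
                              const std::map<int, int> &OperandOf,
                              const std::map<int, std::vector<int>> &UsersOf) {
  std::vector<int> Worklist(Scalars.begin(), Scalars.end());
  for (size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    auto OpIt = OperandOf.find(Worklist[Idx]);
    if (OpIt == OperandOf.end())
      continue;                         // nothing to look through
    int Src = OpIt->second;
    if (Scalars.count(Src))
      continue;                         // already known to be scalar
    bool AllUsersScalar = true;
    auto UIt = UsersOf.find(Src);
    if (UIt != UsersOf.end())
      for (int User : UIt->second)
        AllUsersScalar &= (Scalars.count(User) != 0);
    if (AllUsersScalar) {               // safe to keep Src scalar as well
      Scalars.insert(Src);
      Worklist.push_back(Src);
    }
  }
  return Scalars;
}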
4374
4375bool LoopVectorizationCostModel::isScalarWithPredication(
4376 Instruction *I, ElementCount VF) const {
4377 if (!isPredicatedInst(I))
4378 return false;
4379
4380 // Do we have a non-scalar lowering for this predicated
4381 // instruction? No - it is scalar with predication.
4382 switch(I->getOpcode()) {
4383 default:
4384 return true;
4385 case Instruction::Load:
4386 case Instruction::Store: {
4387 auto *Ptr = getLoadStorePointerOperand(I);
4388 auto *Ty = getLoadStoreType(I);
4389 Type *VTy = Ty;
4390 if (VF.isVector())
4391 VTy = VectorType::get(Ty, VF);
4392 const Align Alignment = getLoadStoreAlignment(I);
4393 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394 TTI.isLegalMaskedGather(VTy, Alignment))
4395 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396 TTI.isLegalMaskedScatter(VTy, Alignment));
4397 }
4398 case Instruction::UDiv:
4399 case Instruction::SDiv:
4400 case Instruction::SRem:
4401 case Instruction::URem: {
4402 // We have the option to use the safe-divisor idiom to avoid predication.
4403 // The cost based decision here will always select safe-divisor for
4404 // scalable vectors as scalarization isn't legal.
4405 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407 }
4408 }
4409}
4410
4411bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413 return false;
4414
4415 // Can we prove this instruction is safe to unconditionally execute?
4416 // If not, we must use some form of predication.
4417 switch(I->getOpcode()) {
4418 default:
4419 return false;
4420 case Instruction::Load:
4421 case Instruction::Store: {
4422 if (!Legal->isMaskRequired(I))
4423 return false;
4424 // When we know the load's address is loop invariant and the instruction
4425 // in the original scalar loop was unconditionally executed then we
4426 // don't need to mark it as a predicated instruction. Tail folding may
4427 // introduce additional predication, but we're guaranteed to always have
4428 // at least one active lane. We call Legal->blockNeedsPredication here
4429 // because it doesn't query tail-folding. For stores, we need to prove
4430 // both speculation safety (which follows from the same argument as loads)
4431 // and that the value being stored is correct. The easiest
4432 // form of the latter is to require that all values stored are the same.
4433 if (Legal->isUniformMemOp(*I) &&
4434 (isa<LoadInst>(I) ||
4435 (isa<StoreInst>(I) &&
4436 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4437 !Legal->blockNeedsPredication(I->getParent()))
4438 return false;
4439 return true;
4440 }
4441 case Instruction::UDiv:
4442 case Instruction::SDiv:
4443 case Instruction::SRem:
4444 case Instruction::URem:
4445 // TODO: We can use the loop-preheader as a context point here and get
4446 // context-sensitive reasoning.
4447 return !isSafeToSpeculativelyExecute(I);
4448 }
4449}
4450
4451std::pair<InstructionCost, InstructionCost>
4452LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453 ElementCount VF) const {
4454 assert(I->getOpcode() == Instruction::UDiv ||
4455 I->getOpcode() == Instruction::SDiv ||
4456 I->getOpcode() == Instruction::SRem ||
4457 I->getOpcode() == Instruction::URem);
4458 assert(!isSafeToSpeculativelyExecute(I));
4459
4460 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461
4462 // Scalarization isn't legal for scalable vector types
4463 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464 if (!VF.isScalable()) {
4465 // Get the scalarization cost and scale this amount by the probability of
4466 // executing the predicated block. If the instruction is not predicated,
4467 // we fall through to the next case.
4468 ScalarizationCost = 0;
4469
4470 // These instructions have a non-void type, so account for the phi nodes
4471 // that we will create. This cost is likely to be zero. The phi node
4472 // cost, if any, should be scaled by the block probability because it
4473 // models a copy at the end of each predicated block.
4474 ScalarizationCost += VF.getKnownMinValue() *
4475 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476
4477 // The cost of the non-predicated instruction.
4478 ScalarizationCost += VF.getKnownMinValue() *
4479 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480
4481 // The cost of insertelement and extractelement instructions needed for
4482 // scalarization.
4483 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484
4485 // Scale the cost by the probability of executing the predicated blocks.
4486 // This assumes the predicated block for each vector lane is equally
4487 // likely.
4488 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4489 }
4490 InstructionCost SafeDivisorCost = 0;
4491
4492 auto *VecTy = ToVectorTy(I->getType(), VF);
4493
4494 // The cost of the select guard to ensure all lanes are well defined
4495 // after we speculate above any internal control flow.
4496 SafeDivisorCost += TTI.getCmpSelInstrCost(
4497 Instruction::Select, VecTy,
4498 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500
4501 // Certain instructions can be cheaper to vectorize if they have a constant
4502 // second vector operand. One example of this is shifts on x86.
4503 Value *Op2 = I->getOperand(1);
4504 auto Op2Info = TTI.getOperandInfo(Op2);
4505 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507
4508 SmallVector<const Value *, 4> Operands(I->operand_values());
4509 SafeDivisorCost += TTI.getArithmeticInstrCost(
4510 I->getOpcode(), VecTy, CostKind,
4511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512 Op2Info, Operands, I);
4513 return {ScalarizationCost, SafeDivisorCost};
4514}
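For illustration, the shape of the scalarization cost computed above can be written as plain arithmetic: per-lane PHI and divide costs are multiplied by the known minimum VF, the insert/extract overhead is added, and the total is divided by the reciprocal of the predicated-block probability. The sketch below uses hypothetical unit costs and a hypothetical helper name; it is not part of LoopVectorize.cpp.

// Hypothetical helper mirroring the formula above; all costs are plain
// unsigned "units" rather than InstructionCost.
unsigned scalarDivRemCost(unsigned VF, unsigned PhiCost, unsigned DivCost,
                          unsigned InsertExtractCost,
                          unsigned ReciprocalPredBlockProb) {
  unsigned Cost = VF * PhiCost         // one PHI per scalarized lane
                  + VF * DivCost       // the non-predicated divide per lane
                  + InsertExtractCost; // packing/unpacking vector lanes
  // Each predicated block is assumed to execute with probability
  // 1 / ReciprocalPredBlockProb, so scale the per-lane work down.
  return Cost / ReciprocalPredBlockProb;
}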
4515
4516bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517 Instruction *I, ElementCount VF) {
4518 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4519 assert(getWideningDecision(I, VF) == CM_Unknown &&
4520 "Decision should not be set yet.");
4521 auto *Group = getInterleavedAccessGroup(I);
4522 assert(Group && "Must have a group.");
4523
4524 // If the instruction's allocated size doesn't equal its type size, it
4525 // requires padding and will be scalarized.
4526 auto &DL = I->getModule()->getDataLayout();
4527 auto *ScalarTy = getLoadStoreType(I);
4528 if (hasIrregularType(ScalarTy, DL))
4529 return false;
4530
4531 // If the group involves a non-integral pointer, we may not be able to
4532 // losslessly cast all values to a common type.
4533 unsigned InterleaveFactor = Group->getFactor();
4534 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535 for (unsigned i = 0; i < InterleaveFactor; i++) {
4536 Instruction *Member = Group->getMember(i);
4537 if (!Member)
4538 continue;
4539 auto *MemberTy = getLoadStoreType(Member);
4540 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541 // Don't coerce non-integral pointers to integers or vice versa.
4542 if (MemberNI != ScalarNI) {
4543 // TODO: Consider adding special nullptr value case here
4544 return false;
4545 } else if (MemberNI && ScalarNI &&
4546 ScalarTy->getPointerAddressSpace() !=
4547 MemberTy->getPointerAddressSpace()) {
4548 return false;
4549 }
4550 }
4551
4552 // Check if masking is required.
4553 // A Group may need masking for one of two reasons: it resides in a block that
4554 // needs predication, or it was decided to use masking to deal with gaps
4555 // (either a gap at the end of a load-access that may result in a speculative
4556 // load, or any gaps in a store-access).
4557 bool PredicatedAccessRequiresMasking =
4558 blockNeedsPredicationForAnyReason(I->getParent()) &&
4559 Legal->isMaskRequired(I);
4560 bool LoadAccessWithGapsRequiresEpilogMasking =
4561 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562 !isScalarEpilogueAllowed();
4563 bool StoreAccessWithGapsRequiresMasking =
4564 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565 if (!PredicatedAccessRequiresMasking &&
4566 !LoadAccessWithGapsRequiresEpilogMasking &&
4567 !StoreAccessWithGapsRequiresMasking)
4568 return true;
4569
4570 // If masked interleaving is required, we expect that the user/target had
4571 // enabled it, because otherwise it either wouldn't have been created or
4572 // it should have been invalidated by the CostModel.
4573 assert(useMaskedInterleavedAccesses(TTI) &&
4574 "Masked interleave-groups for predicated accesses are not enabled.");
4575
4576 if (Group->isReverse())
4577 return false;
4578
4579 auto *Ty = getLoadStoreType(I);
4580 const Align Alignment = getLoadStoreAlignment(I);
4581 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582 : TTI.isLegalMaskedStore(Ty, Alignment);
4583}
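The three masking conditions checked above can be summarised as a small predicate. The struct and function names below are hypothetical and the code is only an illustration, not part of LoopVectorize.cpp.

// Hypothetical restatement of the masking decision for an interleave group.
struct InterleaveGroupInfo {
  bool IsLoad;
  bool InPredicatedBlockNeedingMask; // block needs predication and a mask
  bool HasGapRequiringEpilogue;      // load group may read past the last member
  bool ScalarEpilogueAllowed;
  unsigned NumMembers, Factor;
};

bool groupNeedsMask(const InterleaveGroupInfo &G) {
  bool PredicatedAccess = G.InPredicatedBlockNeedingMask;
  bool LoadGapNeedsEpilogueMask =
      G.IsLoad && G.HasGapRequiringEpilogue && !G.ScalarEpilogueAllowed;
  bool StoreHasGaps = !G.IsLoad && G.NumMembers < G.Factor;
  return PredicatedAccess || LoadGapNeedsEpilogueMask || StoreHasGaps;
}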
4584
4585bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586 Instruction *I, ElementCount VF) {
4587 // Get and ensure we have a valid memory instruction.
4588 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4589
4590 auto *Ptr = getLoadStorePointerOperand(I);
4591 auto *ScalarTy = getLoadStoreType(I);
4592
4593 // First of all, in order to be widened, the pointer should be consecutive.
4594 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595 return false;
4596
4597 // If the instruction is a store located in a predicated block, it will be
4598 // scalarized.
4599 if (isScalarWithPredication(I, VF))
4600 return false;
4601
4602 // If the instruction's allocated size doesn't equal its type size, it
4603 // requires padding and will be scalarized.
4604 auto &DL = I->getModule()->getDataLayout();
4605 if (hasIrregularType(ScalarTy, DL))
4606 return false;
4607
4608 return true;
4609}
4610
4611void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612 // We should not collect Uniforms more than once per VF. Right now,
4613 // this function is called from collectUniformsAndScalars(), which
4614 // already does this check. Collecting Uniforms for VF=1 does not make any
4615 // sense.
4616
4617 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4618 "This function should not be visited twice for the same VF");
4619
4620 // Visit the list of Uniforms. If we don't find any uniform value, we won't
4621 // analyze again; Uniforms.count(VF) will return 1.
4622 Uniforms[VF].clear();
4623
4624 // We now know that the loop is vectorizable!
4625 // Collect instructions inside the loop that will remain uniform after
4626 // vectorization.
4627
4628 // Global values, params and instructions outside of current loop are out of
4629 // scope.
4630 auto isOutOfScope = [&](Value *V) -> bool {
4631 Instruction *I = dyn_cast<Instruction>(V);
4632 return (!I || !TheLoop->contains(I));
4633 };
4634
4635 // Worklist containing uniform instructions demanding lane 0.
4636 SetVector<Instruction *> Worklist;
4637 BasicBlock *Latch = TheLoop->getLoopLatch();
4638
4639 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640 // that are scalar with predication must not be considered uniform after
4641 // vectorization, because that would create an erroneous replicating region
4642 // where only a single instance out of VF should be formed.
4643 // TODO: optimize such rare cases if found important; see PR40816.
4644 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645 if (isOutOfScope(I)) {
4646 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4647 << *I << "\n");
4648 return;
4649 }
4650 if (isScalarWithPredication(I, VF)) {
4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4652 << *I << "\n");
4653 return;
4654 }
4655 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4656 Worklist.insert(I);
4657 };
4658
4659 // Start with the conditional branch. If the branch condition is an
4660 // instruction contained in the loop that is only used by the branch, it is
4661 // uniform.
4662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664 addToWorklistIfAllowed(Cmp);
4665
4666 // Return true if all lanes perform the same memory operation, and we can
4667 // thus choose to execute only one.
4668 auto isUniformMemOpUse = [&](Instruction *I) {
4669 if (!Legal->isUniformMemOp(*I))
4670 return false;
4671 if (isa<LoadInst>(I))
4672 // Loading the same address always produces the same result - at least
4673 // assuming aliasing and ordering which have already been checked.
4674 return true;
4675 // Storing the same value on every iteration.
4676 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677 };
4678
4679 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680 InstWidening WideningDecision = getWideningDecision(I, VF);
4681 assert(WideningDecision != CM_Unknown &&
4682 "Widening decision should be ready at this moment");
4683
4684 if (isUniformMemOpUse(I))
4685 return true;
4686
4687 return (WideningDecision == CM_Widen ||
4688 WideningDecision == CM_Widen_Reverse ||
4689 WideningDecision == CM_Interleave);
4690 };
4691
4692
4693 // Returns true if Ptr is the pointer operand of a memory access instruction
4694 // I, and I is known to not require scalarization.
4695 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4697 };
4698
4699 // Holds a list of values which are known to have at least one uniform use.
4700 // Note that there may be other uses which aren't uniform. A "uniform use"
4701 // here is something which only demands lane 0 of the unrolled iterations;
4702 // it does not imply that all lanes produce the same value (e.g. this is not
4703 // the usual meaning of uniform)
4704 SetVector<Value *> HasUniformUse;
4705
4706 // Scan the loop for instructions which are either a) known to have only
4707 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4708 for (auto *BB : TheLoop->blocks())
4709 for (auto &I : *BB) {
4710 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4711 switch (II->getIntrinsicID()) {
4712 case Intrinsic::sideeffect:
4713 case Intrinsic::experimental_noalias_scope_decl:
4714 case Intrinsic::assume:
4715 case Intrinsic::lifetime_start:
4716 case Intrinsic::lifetime_end:
4717 if (TheLoop->hasLoopInvariantOperands(&I))
4718 addToWorklistIfAllowed(&I);
4719 break;
4720 default:
4721 break;
4722 }
4723 }
4724
4725 // ExtractValue instructions must be uniform, because the operands are
4726 // known to be loop-invariant.
4727 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4728 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4729 "Expected aggregate value to be loop invariant");
4730 addToWorklistIfAllowed(EVI);
4731 continue;
4732 }
4733
4734 // If there's no pointer operand, there's nothing to do.
4735 auto *Ptr = getLoadStorePointerOperand(&I);
4736 if (!Ptr)
4737 continue;
4738
4739 if (isUniformMemOpUse(&I))
4740 addToWorklistIfAllowed(&I);
4741
4742 if (isUniformDecision(&I, VF)) {
4743 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4744 HasUniformUse.insert(Ptr);
4745 }
4746 }
4747
4748 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4749 // demanding) users. Since loops are assumed to be in LCSSA form, this
4750 // disallows uses outside the loop as well.
4751 for (auto *V : HasUniformUse) {
4752 if (isOutOfScope(V))
4753 continue;
4754 auto *I = cast<Instruction>(V);
4755 auto UsersAreMemAccesses =
4756 llvm::all_of(I->users(), [&](User *U) -> bool {
4757 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4758 });
4759 if (UsersAreMemAccesses)
4760 addToWorklistIfAllowed(I);
4761 }
4762
4763 // Expand Worklist in topological order: whenever a new instruction
4764 // is added, its users should already be inside Worklist. This ensures
4765 // a uniform instruction will only be used by uniform instructions.
4766 unsigned idx = 0;
4767 while (idx != Worklist.size()) {
4768 Instruction *I = Worklist[idx++];
4769
4770 for (auto *OV : I->operand_values()) {
4771 // isOutOfScope operands cannot be uniform instructions.
4772 if (isOutOfScope(OV))
4773 continue;
4774 // First-order recurrence PHIs should typically be considered
4775 // non-uniform.
4776 auto *OP = dyn_cast<PHINode>(OV);
4777 if (OP && Legal->isFixedOrderRecurrence(OP))
4778 continue;
4779 // If all the users of the operand are uniform, then add the
4780 // operand into the uniform worklist.
4781 auto *OI = cast<Instruction>(OV);
4782 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4783 auto *J = cast<Instruction>(U);
4784 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4785 }))
4786 addToWorklistIfAllowed(OI);
4787 }
4788 }
4789
4790 // For an instruction to be added into Worklist above, all its users inside
4791 // the loop should also be in Worklist. However, this condition cannot be
4792 // true for phi nodes that form a cyclic dependence. We must process phi
4793 // nodes separately. An induction variable will remain uniform if all users
4794 // of the induction variable and induction variable update remain uniform.
4795 // The code below handles both pointer and non-pointer induction variables.
4796 for (const auto &Induction : Legal->getInductionVars()) {
4797 auto *Ind = Induction.first;
4798 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4799
4800 // Determine if all users of the induction variable are uniform after
4801 // vectorization.
4802 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4803 auto *I = cast<Instruction>(U);
4804 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4805 isVectorizedMemAccessUse(I, Ind);
4806 });
4807 if (!UniformInd)
4808 continue;
4809
4810 // Determine if all users of the induction variable update instruction are
4811 // uniform after vectorization.
4812 auto UniformIndUpdate =
4813 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4814 auto *I = cast<Instruction>(U);
4815 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4816 isVectorizedMemAccessUse(I, IndUpdate);
4817 });
4818 if (!UniformIndUpdate)
4819 continue;
4820
4821 // The induction variable and its update instruction will remain uniform.
4822 addToWorklistIfAllowed(Ind);
4823 addToWorklistIfAllowed(IndUpdate);
4824 }
4825
4826 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4827}
4828
4829bool LoopVectorizationCostModel::runtimeChecksRequired() {
4830 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4831
4832 if (Legal->getRuntimePointerChecking()->Need) {
4833 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4834 "runtime pointer checks needed. Enable vectorization of this "
4835 "loop with '#pragma clang loop vectorize(enable)' when "
4836 "compiling with -Os/-Oz",
4837 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4838 return true;
4839 }
4840
4841 if (!PSE.getPredicate().isAlwaysTrue()) {
4842 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4843 "runtime SCEV checks needed. Enable vectorization of this "
4844 "loop with '#pragma clang loop vectorize(enable)' when "
4845 "compiling with -Os/-Oz",
4846 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4847 return true;
4848 }
4849
4850 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4851 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4852 reportVectorizationFailure("Runtime stride check for small trip count",
4853 "runtime stride == 1 checks needed. Enable vectorization of "
4854 "this loop without such check by compiling with -Os/-Oz",
4855 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4856 return true;
4857 }
4858
4859 return false;
4860}
4861
4862ElementCount
4863LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4864 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4865 return ElementCount::getScalable(0);
4866
4867 if (Hints->isScalableVectorizationDisabled()) {
4868 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4869 "ScalableVectorizationDisabled", ORE, TheLoop);
4870 return ElementCount::getScalable(0);
4871 }
4872
4873 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4874
4875 auto MaxScalableVF = ElementCount::getScalable(
4876 std::numeric_limits<ElementCount::ScalarTy>::max());
4877
4878 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4879 // FIXME: While for scalable vectors this is currently sufficient, this should
4880 // be replaced by a more detailed mechanism that filters out specific VFs,
4881 // instead of invalidating vectorization for a whole set of VFs based on the
4882 // MaxVF.
4883
4884 // Disable scalable vectorization if the loop contains unsupported reductions.
4885 if (!canVectorizeReductions(MaxScalableVF)) {
4886 reportVectorizationInfo(
4887 "Scalable vectorization not supported for the reduction "
4888 "operations found in this loop.",
4889 "ScalableVFUnfeasible", ORE, TheLoop);
4890 return ElementCount::getScalable(0);
4891 }
4892
4893 // Disable scalable vectorization if the loop contains any instructions
4894 // with element types not supported for scalable vectors.
4895 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4896 return !Ty->isVoidTy() &&
4897 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4898 })) {
4899 reportVectorizationInfo("Scalable vectorization is not supported "
4900 "for all element types found in this loop.",
4901 "ScalableVFUnfeasible", ORE, TheLoop);
4902 return ElementCount::getScalable(0);
4903 }
4904
4905 if (Legal->isSafeForAnyVectorWidth())
4906 return MaxScalableVF;
4907
4908 // Limit MaxScalableVF by the maximum safe dependence distance.
4909 std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4910 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4911 MaxVScale =
4912 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4913 MaxScalableVF =
4914 ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4915 if (!MaxScalableVF)
4916 reportVectorizationInfo(
4917 "Max legal vector width too small, scalable vectorization "
4918 "unfeasible.",
4919 "ScalableVFUnfeasible", ORE, TheLoop);
4920
4921 return MaxScalableVF;
4922}
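The final clamp above bounds the scalable VF by the dependence distance: a VF of 'vscale x N' may touch up to MaxVScale * N elements at run time, so N is limited to MaxSafeElements / MaxVScale. A minimal sketch of that arithmetic follows; the helper name is hypothetical and the code is not part of LoopVectorize.cpp.

// Hypothetical helper: returns the largest N such that 'vscale x N' stays
// within MaxSafeElements, given an upper bound on vscale. Returns 0 when no
// bound on vscale is known, matching the clamping above.
unsigned clampScalableVF(unsigned MaxSafeElements, unsigned MaxVScale) {
  return MaxVScale ? MaxSafeElements / MaxVScale : 0;
}
// Example: MaxSafeElements = 32 and MaxVScale = 16 give N = 2, i.e. the
// largest safe scalable VF is 'vscale x 2'.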
4923
4924FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4925 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4926 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4927 unsigned SmallestType, WidestType;
4928 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4929
4930 // Get the maximum safe dependence distance in bits computed by LAA.
4931 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4932 // the memory access that is most restrictive (involved in the smallest
4933 // dependence distance).
4934 unsigned MaxSafeElements =
4935 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4936
4937 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4938 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4939
4940 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4941 << ".\n");
4942 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4943 << ".\n");
4944
4945 // First analyze the UserVF; fall back if the UserVF should be ignored.
4946 if (UserVF) {
4947 auto MaxSafeUserVF =
4948 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4949
4950 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4951 // If `VF=vscale x N` is safe, then so is `VF=N`
4952 if (UserVF.isScalable())
4953 return FixedScalableVFPair(
4954 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4955 else
4956 return UserVF;
4957 }
4958
4959 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4960
4961 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4962 // is better to ignore the hint and let the compiler choose a suitable VF.
4963 if (!UserVF.isScalable()) {
4964 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4965 << " is unsafe, clamping to max safe VF="
4966 << MaxSafeFixedVF << ".\n");
4967 ORE->emit([&]() {
4968 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4969 TheLoop->getStartLoc(),
4970 TheLoop->getHeader())
4971 << "User-specified vectorization factor "
4972 << ore::NV("UserVectorizationFactor", UserVF)
4973 << " is unsafe, clamping to maximum safe vectorization factor "
4974 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4975 });
4976 return MaxSafeFixedVF;
4977 }
4978
4979 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4980 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4981 << " is ignored because scalable vectors are not "
4982 "available.\n");
4983 ORE->emit([&]() {
4984 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4985 TheLoop->getStartLoc(),
4986 TheLoop->getHeader())
4987 << "User-specified vectorization factor "
4988 << ore::NV("UserVectorizationFactor", UserVF)
4989 << " is ignored because the target does not support scalable "
4990 "vectors. The compiler will pick a more suitable value.";
4991 });
4992 } else {
4993 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4994 << " is unsafe. Ignoring scalable UserVF.\n");
4995 ORE->emit([&]() {
4996 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4997 TheLoop->getStartLoc(),
4998 TheLoop->getHeader())
4999 << "User-specified vectorization factor "
5000 << ore::NV("UserVectorizationFactor", UserVF)
5001 << " is unsafe. Ignoring the hint to let the compiler pick a "
5002 "more suitable value.";
5003 });
5004 }
5005 }
5006
5007 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5008 << " / " << WidestType << " bits.\n");
5009
5010 FixedScalableVFPair Result(ElementCount::getFixed(1),
5011 ElementCount::getScalable(0));
5012 if (auto MaxVF =
5013 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5014 MaxSafeFixedVF, FoldTailByMasking))
5015 Result.FixedVF = MaxVF;
5016
5017 if (auto MaxVF =
5018 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5019 MaxSafeScalableVF, FoldTailByMasking))
5020 if (MaxVF.isScalable()) {
5021 Result.ScalableVF = MaxVF;
5022 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5023 << "\n");
5024 }
5025
5026 return Result;
5027}
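The MaxSafeElements computation at the top of this function divides the maximum safe dependence distance in bits by the widest loop type and rounds down to a power of two. A small worked sketch, with hypothetical helper names and not part of LoopVectorize.cpp:

#include <cstdint>

// Hypothetical stand-in for llvm::PowerOf2Floor.
uint64_t powerOf2Floor(uint64_t X) {
  uint64_t P = 1;
  while (P * 2 <= X)
    P *= 2;
  return X ? P : 0;
}

// E.g. a 384-bit safe dependence distance with a widest type of 32 bits gives
// 384 / 32 = 12 elements, rounded down to the power of two 8.
uint64_t maxSafeElements(uint64_t MaxSafeVectorWidthInBits,
                         uint64_t WidestTypeBits) {
  return powerOf2Floor(MaxSafeVectorWidthInBits / WidestTypeBits);
}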
5028
5029FixedScalableVFPair
5030LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5031 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5032 // TODO: It may be useful to do this, since it's still likely to be
5033 // dynamically uniform if the target can skip.
5034 reportVectorizationFailure(
5035 "Not inserting runtime ptr check for divergent target",
5036 "runtime pointer checks needed. Not enabled for divergent target",
5037 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5038 return FixedScalableVFPair::getNone();
5039 }
5040
5041 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5042 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5043 if (TC == 1) {
5044 reportVectorizationFailure("Single iteration (non) loop",
5045 "loop trip count is one, irrelevant for vectorization",
5046 "SingleIterationLoop", ORE, TheLoop);
5047 return FixedScalableVFPair::getNone();
5048 }
5049
5050 switch (ScalarEpilogueStatus) {
5051 case CM_ScalarEpilogueAllowed:
5052 return computeFeasibleMaxVF(TC, UserVF, false);
5053 case CM_ScalarEpilogueNotAllowedUsePredicate:
5054 [[fallthrough]];
5055 case CM_ScalarEpilogueNotNeededUsePredicate:
5056 LLVM_DEBUG(
5057 dbgs() << "LV: vector predicate hint/switch found.\n"
5058 << "LV: Not allowing scalar epilogue, creating predicated "
5059 << "vector loop.\n");
5060 break;
5061 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5062 // fallthrough as a special case of OptForSize
5063 case CM_ScalarEpilogueNotAllowedOptSize:
5064 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5065 LLVM_DEBUG(
5066 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5067 else
5068 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5069 << "count.\n");
5070
5071 // Bail if runtime checks are required, which are not good when optimising
5072 // for size.
5073 if (runtimeChecksRequired())
5074 return FixedScalableVFPair::getNone();
5075
5076 break;
5077 }
5078
5079 // The only loops we can vectorize without a scalar epilogue are loops with
5080 // a bottom-test and a single exiting block. We'd have to handle the fact
5081 // that not every instruction executes on the last iteration. This will
5082 // require a lane mask which varies through the vector loop body. (TODO)
5083 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5084 // If there was a tail-folding hint/switch, but we can't fold the tail by
5085 // masking, fallback to a vectorization with a scalar epilogue.
5086 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5087 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5088 "scalar epilogue instead.\n");
5089 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5090 return computeFeasibleMaxVF(TC, UserVF, false);
5091 }
5092 return FixedScalableVFPair::getNone();
5093 }
5094
5095 // Now try the tail folding
5096
5097 // Invalidate interleave groups that require an epilogue if we can't mask
5098 // the interleave-group.
5099 if (!useMaskedInterleavedAccesses(TTI)) {
5100 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5101 "No decisions should have been taken at this point");
5102 // Note: There is no need to invalidate any cost modeling decisions here, as
5103 // none were taken so far.
5104 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5105 }
5106
5107 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5108 // Avoid tail folding if the trip count is known to be a multiple of any VF
5109 // we choose.
5110 // FIXME: The condition below pessimises the case for fixed-width vectors,
5111 // when scalable VFs are also candidates for vectorization.
5112 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5113 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5114 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5115 "MaxFixedVF must be a power of 2");
5116 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5117 : MaxFixedVF.getFixedValue();
5118 ScalarEvolution *SE = PSE.getSE();
5119 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5120 const SCEV *ExitCount = SE->getAddExpr(
5121 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5122 const SCEV *Rem = SE->getURemExpr(
5123 SE->applyLoopGuards(ExitCount, TheLoop),
5124 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5125 if (Rem->isZero()) {
5126 // Accept MaxFixedVF if we do not have a tail.
5127 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5128 return MaxFactors;
5129 }
5130 }
5131
5132 // If we don't know the precise trip count, or if the trip count that we
5133 // found modulo the vectorization factor is not zero, try to fold the tail
5134 // by masking.
5135 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5136 if (Legal->prepareToFoldTailByMasking()) {
5137 FoldTailByMasking = true;
5138 return MaxFactors;
5139 }
5140
5141 // If there was a tail-folding hint/switch, but we can't fold the tail by
5142 // masking, fallback to a vectorization with a scalar epilogue.
5143 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5144 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5145 "scalar epilogue instead.\n");
5146 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5147 return MaxFactors;
5148 }
5149
5150 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5151 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5152 return FixedScalableVFPair::getNone();
5153 }
5154
5155 if (TC == 0) {
5156 reportVectorizationFailure(
5157 "Unable to calculate the loop count due to complex control flow",
5158 "unable to calculate the loop count due to complex control flow",
5159 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5160 return FixedScalableVFPair::getNone();
5161 }
5162
5163 reportVectorizationFailure(
5164 "Cannot optimize for size and vectorize at the same time.",
5165 "cannot optimize for size and vectorize at the same time. "
5166 "Enable vectorization of this loop with '#pragma clang loop "
5167 "vectorize(enable)' when compiling with -Os/-Oz",
5168 "NoTailLoopWithOptForSize", ORE, TheLoop);
5169 return FixedScalableVFPair::getNone();
5170}
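
To make the remainder test above concrete, here is a standalone C++ sketch (not part of LoopVectorize.cpp) that applies the same urem check with plain integers; the trip count, VF, and interleave count are made-up values.

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t TripCount = 64;   // assumed compile-time trip count
  const uint64_t MaxFixedVF = 8;   // assumed maximum fixed vectorization factor
  const uint64_t UserIC = 2;       // assumed user-requested interleave count

  // Mirrors the urem check: if the trip count is a multiple of VF * IC,
  // no scalar tail iterations remain and the maximum factors can be accepted.
  const uint64_t MaxVFtimesIC = UserIC ? MaxFixedVF * UserIC : MaxFixedVF;
  const bool NoTail = (TripCount % MaxVFtimesIC) == 0;

  std::cout << "Trip count " << TripCount << ", VF*IC " << MaxVFtimesIC
            << (NoTail ? ": no tail remains\n" : ": a tail must be handled\n");
  return 0;
}
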
5171
5172ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5173 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5174 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5175 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5176 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5177 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5178 : TargetTransformInfo::RGK_FixedWidthVector);
5179
5180 // Convenience function to return the minimum of two ElementCounts.
5181 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5182 assert((LHS.isScalable() == RHS.isScalable()) &&
5183 "Scalable flags must match");
5184 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5185 };
5186
5187 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5188 // Note that both WidestRegister and WidestType may not be powers of 2.
5189 auto MaxVectorElementCount = ElementCount::get(
5190 PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5191 ComputeScalableMaxVF);
5192 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5193 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5194 << (MaxVectorElementCount * WidestType) << " bits.\n");
5195
5196 if (!MaxVectorElementCount) {
5197 LLVM_DEBUG(dbgs() << "LV: The target has no "
5198 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5199 << " vector registers.\n");
5200 return ElementCount::getFixed(1);
5201 }
5202
5203 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5204 if (MaxVectorElementCount.isScalable() &&
5205 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5206 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5207 auto Min = Attr.getVScaleRangeMin();
5208 WidestRegisterMinEC *= Min;
5209 }
5210 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5211 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5212 // If loop trip count (TC) is known at compile time there is no point in
5213 // choosing VF greater than TC (as done in the loop below). Select maximum
5214 // power of two which doesn't exceed TC.
5215 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5216 // when the TC is less than or equal to the known number of lanes.
5217 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5218 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5219 "exceeding the constant trip count: "
5220 << ClampedConstTripCount << "\n");
5221 return ElementCount::getFixed(ClampedConstTripCount);
5222 }
5223
5224 TargetTransformInfo::RegisterKind RegKind =
5225 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5226 : TargetTransformInfo::RGK_FixedWidthVector;
5227 ElementCount MaxVF = MaxVectorElementCount;
5228 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5229 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5230 auto MaxVectorElementCountMaxBW = ElementCount::get(
5231 PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5232 ComputeScalableMaxVF);
5233 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5234
5235 // Collect all viable vectorization factors larger than the default MaxVF
5236 // (i.e. MaxVectorElementCount).
5237 SmallVector<ElementCount, 8> VFs;
5238 for (ElementCount VS = MaxVectorElementCount * 2;
5239 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5240 VFs.push_back(VS);
5241
5242 // For each VF calculate its register usage.
5243 auto RUs = calculateRegisterUsage(VFs);
5244
5245 // Select the largest VF which doesn't require more registers than existing
5246 // ones.
5247 for (int i = RUs.size() - 1; i >= 0; --i) {
5248 bool Selected = true;
5249 for (auto &pair : RUs[i].MaxLocalUsers) {
5250 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5251 if (pair.second > TargetNumRegisters)
5252 Selected = false;
5253 }
5254 if (Selected) {
5255 MaxVF = VFs[i];
5256 break;
5257 }
5258 }
5259 if (ElementCount MinVF =
5260 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5261 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5262 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5263 << ") with target's minimum: " << MinVF << '\n');
5264 MaxVF = MinVF;
5265 }
5266 }
5267
5268 // Invalidate any widening decisions we might have made, in case the loop
5269 // requires predication (decided later), but we have already made some
5270 // load/store widening decisions.
5271 invalidateCostModelingDecisions();
5272 }
5273 return MaxVF;
5274}
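
As an illustration of the trip-count clamping step in getMaximizedVFForTarget, here is a standalone sketch (not part of LoopVectorize.cpp) using a plain power-of-two floor helper; the trip count and register width are made-up values.

#include <cstdint>
#include <iostream>

// Largest power of two less than or equal to X (X must be non-zero).
static uint64_t powerOf2Floor(uint64_t X) {
  uint64_t P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  const uint64_t ConstTripCount = 6;       // assumed known trip count
  const uint64_t WidestRegisterMinEC = 16; // assumed lanes in the widest register

  // When the whole trip count fits in one vector register, a VF larger than
  // the trip count buys nothing, so clamp to PowerOf2Floor(TC) = 4 here.
  if (ConstTripCount <= WidestRegisterMinEC)
    std::cout << "Clamped MaxVF: " << powerOf2Floor(ConstTripCount) << "\n";
  return 0;
}
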
5275
5276std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5277 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5278 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5279 auto Min = Attr.getVScaleRangeMin();
5280 auto Max = Attr.getVScaleRangeMax();
5281 if (Max && Min == Max)
5282 return Max;
5283 }
5284
5285 return TTI.getVScaleForTuning();
5286}
5287
5288bool LoopVectorizationCostModel::isMoreProfitable(
5289 const VectorizationFactor &A, const VectorizationFactor &B) const {
5290 InstructionCost CostA = A.Cost;
5291 InstructionCost CostB = B.Cost;
5292
5293 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5294
5295 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5296 MaxTripCount) {
5297 // If we are folding the tail and the trip count is a known (possibly small)
5298 // constant, the trip count will be rounded up to an integer number of
5299 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5300 // which we compare directly. When not folding the tail, the total cost will
5301 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5302 // approximated with the per-lane cost below instead of using the tripcount
5303 // as here.
5304 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5305 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5306 return RTCostA < RTCostB;
5307 }
5308
5309 // Improve estimate for the vector width if it is scalable.
5310 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5311 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5312 if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5313 if (A.Width.isScalable())
5314 EstimatedWidthA *= *VScale;
5315 if (B.Width.isScalable())
5316 EstimatedWidthB *= *VScale;
5317 }
5318
5319 // Assume vscale may be larger than 1 (or the value being tuned for),
5320 // so that scalable vectorization is slightly favorable over fixed-width
5321 // vectorization.
5322 if (A.Width.isScalable() && !B.Width.isScalable())
5323 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5324
5325 // To avoid the need for FP division:
5326 // (CostA / A.Width) < (CostB / B.Width)
5327 // <=> (CostA * B.Width) < (CostB * A.Width)
5328 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5329}
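
The cross-multiplication trick used by isMoreProfitable can be checked with a standalone sketch (not part of LoopVectorize.cpp); the costs and widths below are made-up values.

#include <cstdint>
#include <iostream>

int main() {
  // Candidate A: cost 20 per vector iteration at width 8 (2.5 per lane).
  // Candidate B: cost 12 per vector iteration at width 4 (3.0 per lane).
  const uint64_t CostA = 20, WidthA = 8;
  const uint64_t CostB = 12, WidthB = 4;

  // (CostA / WidthA) < (CostB / WidthB)  <=>  CostA * WidthB < CostB * WidthA,
  // avoiding floating-point division: 20 * 4 = 80 < 12 * 8 = 96, so A wins.
  const bool AIsMoreProfitable = CostA * WidthB < CostB * WidthA;
  std::cout << (AIsMoreProfitable ? "A" : "B") << " is more profitable\n";
  return 0;
}
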
5330
5331VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5332 const ElementCountSet &VFCandidates) {
5333 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5334 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5335 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5336 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5337 "Expected Scalar VF to be a candidate");
5338
5339 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5340 ExpectedCost);
5341 VectorizationFactor ChosenFactor = ScalarCost;
5342
5343 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5344 if (ForceVectorization && VFCandidates.size() > 1) {
5345 // Ignore scalar width, because the user explicitly wants vectorization.
5346 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5347 // evaluation.
5348 ChosenFactor.Cost = InstructionCost::getMax();
5349 }
5350
5351 SmallVector<InstructionVFPair> InvalidCosts;
5352 for (const auto &i : VFCandidates) {
5353 // The cost for scalar VF=1 is already calculated, so ignore it.
5354 if (i.isScalar())
5355 continue;
5356
5357 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5358 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5359
5360#ifndef NDEBUG
5361 unsigned AssumedMinimumVscale = 1;
5362 if (std::optional<unsigned> VScale = getVScaleForTuning())
5363 AssumedMinimumVscale = *VScale;
5364 unsigned Width =
5365 Candidate.Width.isScalable()
5366 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5367 : Candidate.Width.getFixedValue();
5368 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5369 << " costs: " << (Candidate.Cost / Width));
5370 if (i.isScalable())
5371 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5372 << AssumedMinimumVscale << ")");
5373 LLVM_DEBUG(dbgs() << ".\n");
5374#endif
5375
5376 if (!C.second && !ForceVectorization) {
5377 LLVM_DEBUG(
5378 dbgs() << "LV: Not considering vector loop of width " << i
5379 << " because it will not generate any vector instructions.\n");
5380 continue;
5381 }
5382
5383 // If profitable add it to ProfitableVF list.
5384 if (isMoreProfitable(Candidate, ScalarCost))
5385 ProfitableVFs.push_back(Candidate);
5386
5387 if (isMoreProfitable(Candidate, ChosenFactor))
5388 ChosenFactor = Candidate;
5389 }
5390
5391 // Emit a report of VFs with invalid costs in the loop.
5392 if (!InvalidCosts.empty()) {
5393 // Group the remarks per instruction, keeping the instruction order from
5394 // InvalidCosts.
5395 std::map<Instruction *, unsigned> Numbering;
5396 unsigned I = 0;
5397 for (auto &Pair : InvalidCosts)
5398 if (!Numbering.count(Pair.first))
5399 Numbering[Pair.first] = I++;
5400
5401 // Sort the list, first on instruction(number) then on VF.
5402 llvm::sort(InvalidCosts,
5403 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5404 if (Numbering[A.first] != Numbering[B.first])
5405 return Numbering[A.first] < Numbering[B.first];
5406 ElementCountComparator ECC;
5407 return ECC(A.second, B.second);
5408 });
5409
5410 // For a list of ordered instruction-vf pairs:
5411 // [(load, vf1), (load, vf2), (store, vf1)]
5412 // Group the instructions together to emit separate remarks for:
5413 // load (vf1, vf2)
5414 // store (vf1)
5415 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5416 auto Subset = ArrayRef<InstructionVFPair>();
5417 do {
5418 if (Subset.empty())
5419 Subset = Tail.take_front(1);
5420
5421 Instruction *I = Subset.front().first;
5422
5423 // If the next instruction is different, or if there are no other pairs,
5424 // emit a remark for the collated subset. e.g.
5425 // [(load, vf1), (load, vf2)]
5426 // to emit:
5427 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5428 if (Subset == Tail || Tail[Subset.size()].first != I) {
5429 std::string OutString;
5430 raw_string_ostream OS(OutString);
5431 assert(!Subset.empty() && "Unexpected empty range");
5432 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5433 for (const auto &Pair : Subset)
5434 OS << (Pair.second == Subset.front().second ? "" : ", ")
5435 << Pair.second;
5436 OS << "):";
5437 if (auto *CI = dyn_cast<CallInst>(I))
5438 OS << " call to " << CI->getCalledFunction()->getName();
5439 else
5440 OS << " " << I->getOpcodeName();
5441 OS.flush();
5442 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5443 Tail = Tail.drop_front(Subset.size());
5444 Subset = {};
5445 } else
5446 // Grow the subset by one element
5447 Subset = Tail.take_front(Subset.size() + 1);
5448 } while (!Tail.empty());
5449 }
5450
5451 if (!EnableCondStoresVectorization && NumPredStores) {
5452 reportVectorizationFailure("There are conditional stores.",
5453 "store that is conditionally executed prevents vectorization",
5454 "ConditionalStore", ORE, TheLoop);
5455 ChosenFactor = ScalarCost;
5456 }
5457
5458 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5459 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5460 << "LV: Vectorization seems to be not beneficial, "
5461 << "but was forced by a user.\n");
5462 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5463 return ChosenFactor;
5464}
5465
5466bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5467 const Loop &L, ElementCount VF) const {
5468 // Cross iteration phis such as reductions need special handling and are
5469 // currently unsupported.
5470 if (any_of(L.getHeader()->phis(),
5471 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5472 return false;
5473
5474 // Phis with uses outside of the loop require special handling and are
5475 // currently unsupported.
5476 for (const auto &Entry : Legal->getInductionVars()) {
5477 // Look for uses of the value of the induction at the last iteration.
5478 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5479 for (User *U : PostInc->users())
5480 if (!L.contains(cast<Instruction>(U)))
5481 return false;
5482 // Look for uses of penultimate value of the induction.
5483 for (User *U : Entry.first->users())
5484 if (!L.contains(cast<Instruction>(U)))
5485 return false;
5486 }
5487
5488 // Epilogue vectorization code has not been audited to ensure it handles
5489 // non-latch exits properly. It may be fine, but it needs to be audited and
5490 // tested.
5491 if (L.getExitingBlock() != L.getLoopLatch())
5492 return false;
5493
5494 return true;
5495}
5496
5497bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5498 const ElementCount VF) const {
5499 // FIXME: We need a much better cost-model to take different parameters such
5500 // as register pressure, code size increase and cost of extra branches into
5501 // account. For now we apply a very crude heuristic and only consider loops
5502 // with vectorization factors larger than a certain value.
5503
5504 // Allow the target to opt out entirely.
5505 if (!TTI.preferEpilogueVectorization())
5506 return false;
5507
5508 // We also consider epilogue vectorization unprofitable for targets that don't
5509 // consider interleaving beneficial (eg. MVE).
5510 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5511 return false;
5512 // FIXME: We should consider changing the threshold for scalable
5513 // vectors to take VScaleForTuning into account.
5514 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5515 return true;
5516 return false;
5517}
5518
5519VectorizationFactor
5520LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5521 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5522 VectorizationFactor Result = VectorizationFactor::Disabled();
5523 if (!EnableEpilogueVectorization) {
5524 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5525 return Result;
5526 }
5527
5528 if (!isScalarEpilogueAllowed()) {
5529 LLVM_DEBUG(
5530 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5531 "allowed.\n";);
5532 return Result;
5533 }
5534
5535 // Not really a cost consideration, but check for unsupported cases here to
5536 // simplify the logic.
5537 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5538 LLVM_DEBUG(
5539 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5540 "not a supported candidate.\n";);
5541 return Result;
5542 }
5543
5544 if (EpilogueVectorizationForceVF > 1) {
5545 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5546 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5547 if (LVP.hasPlanWithVF(ForcedEC))
5548 return {ForcedEC, 0, 0};
5549 else {
5550 LLVM_DEBUG(
5551 dbgs()
5552 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5553 return Result;
5554 }
5555 }
5556
5557 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5558 TheLoop->getHeader()->getParent()->hasMinSize()) {
5559 LLVM_DEBUG(
5560 dbgs()
5561 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5562 return Result;
5563 }
5564
5565 if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5566 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5567 "this loop\n");
5568 return Result;
5569 }
5570
5571 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5572 // the main loop handles 8 lanes per iteration. We could still benefit from
5573 // vectorizing the epilogue loop with VF=4.
5574 ElementCount EstimatedRuntimeVF = MainLoopVF;
5575 if (MainLoopVF.isScalable()) {
5576 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5577 if (std::optional<unsigned> VScale = getVScaleForTuning())
5578 EstimatedRuntimeVF *= *VScale;
5579 }
5580
5581 for (auto &NextVF : ProfitableVFs)
5582 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5583 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5584 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5585 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5586 LVP.hasPlanWithVF(NextVF.Width))
5587 Result = NextVF;
5588
5589 if (Result != VectorizationFactor::Disabled())
5590 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5591 << Result.Width << "\n";);
5592 return Result;
5593}
5594
5595std::pair<unsigned, unsigned>
5596LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5597 unsigned MinWidth = -1U;
5598 unsigned MaxWidth = 8;
5599 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5600 // For in-loop reductions, no element types are added to ElementTypesInLoop
5601 // if there are no loads/stores in the loop. In this case, check through the
5602 // reduction variables to determine the maximum width.
5603 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5604 // Reset MaxWidth so that we can find the smallest type used by recurrences
5605 // in the loop.
5606 MaxWidth = -1U;
5607 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5608 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5609 // When finding the min width used by the recurrence we need to account
5610 // for casts on the input operands of the recurrence.
5611 MaxWidth = std::min<unsigned>(
5612 MaxWidth, std::min<unsigned>(
5613 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5614 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5615 }
5616 } else {
5617 for (Type *T : ElementTypesInLoop) {
5618 MinWidth = std::min<unsigned>(
5619 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5620 MaxWidth = std::max<unsigned>(
5621 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5622 }
5623 }
5624 return {MinWidth, MaxWidth};
5625}
5626
5627void LoopVectorizationCostModel::collectElementTypesForWidening() {
5628 ElementTypesInLoop.clear();
5629 // For each block.
5630 for (BasicBlock *BB : TheLoop->blocks()) {
5631 // For each instruction in the loop.
5632 for (Instruction &I : BB->instructionsWithoutDebug()) {
5633 Type *T = I.getType();
5634
5635 // Skip ignored values.
5636 if (ValuesToIgnore.count(&I))
5637 continue;
5638
5639 // Only examine Loads, Stores and PHINodes.
5640 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5641 continue;
5642
5643 // Examine PHI nodes that are reduction variables. Update the type to
5644 // account for the recurrence type.
5645 if (auto *PN = dyn_cast<PHINode>(&I)) {
5646 if (!Legal->isReductionVariable(PN))
5647 continue;
5648 const RecurrenceDescriptor &RdxDesc =
5649 Legal->getReductionVars().find(PN)->second;
5650 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5651 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5652 RdxDesc.getRecurrenceType(),
5653 TargetTransformInfo::ReductionFlags()))
5654 continue;
5655 T = RdxDesc.getRecurrenceType();
5656 }
5657
5658 // Examine the stored values.
5659 if (auto *ST = dyn_cast<StoreInst>(&I))
5660 T = ST->getValueOperand()->getType();
5661
5662 assert(T->isSized() &&
5663 "Expected the load/store/recurrence type to be sized");
5664
5665 ElementTypesInLoop.insert(T);
5666 }
5667 }
5668}
5669
5670unsigned
5671LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5672 InstructionCost LoopCost) {
5673 // -- The interleave heuristics --
5674 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5675 // There are many micro-architectural considerations that we can't predict
5676 // at this level. For example, frontend pressure (on decode or fetch) due to
5677 // code size, or the number and capabilities of the execution ports.
5678 //
5679 // We use the following heuristics to select the interleave count:
5680 // 1. If the code has reductions, then we interleave to break the cross
5681 // iteration dependency.
5682 // 2. If the loop is really small, then we interleave to reduce the loop
5683 // overhead.
5684 // 3. We don't interleave if we think that we will spill registers to memory
5685 // due to the increased register pressure.
5686
5687 if (!isScalarEpilogueAllowed())
5688 return 1;
5689
5690 // We used the distance for the interleave count.
5691 if (Legal->getMaxSafeDepDistBytes() != -1U)
5692 return 1;
5693
5694 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5695 const bool HasReductions = !Legal->getReductionVars().empty();
5696 // Do not interleave loops with a relatively small known or estimated trip
5697 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5698 // enabled, and the code has scalar reductions (HasReductions && VF = 1),
5699 // because with the above conditions interleaving can expose ILP and break
5700 // cross iteration dependences for reductions.
5701 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5702 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5703 return 1;
5704
5705 // If we did not calculate the cost for VF (because the user selected the VF)
5706 // then we calculate the cost of VF here.
5707 if (LoopCost == 0) {
5708 LoopCost = expectedCost(VF).first;
5709 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5710
5711 // Loop body is free and there is no need for interleaving.
5712 if (LoopCost == 0)
5713 return 1;
5714 }
5715
5716 RegisterUsage R = calculateRegisterUsage({VF})[0];
5717 // We divide by these constants so assume that we have at least one
5718 // instruction that uses at least one register.
5719 for (auto& pair : R.MaxLocalUsers) {
5720 pair.second = std::max(pair.second, 1U);
5721 }
5722
5723 // We calculate the interleave count using the following formula.
5724 // Subtract the number of loop invariants from the number of available
5725 // registers. These registers are used by all of the interleaved instances.
5726 // Next, divide the remaining registers by the number of registers that is
5727 // required by the loop, in order to estimate how many parallel instances
5728 // fit without causing spills. All of this is rounded down if necessary to be
5729 // a power of two. We want power of two interleave count to simplify any
5730 // addressing operations or alignment considerations.
5731 // We also want power of two interleave counts to ensure that the induction
5732 // variable of the vector loop wraps to zero, when tail is folded by masking;
5733 // this currently happens when OptForSize, in which case IC is set to 1 above.
5734 unsigned IC = UINT_MAX;
5735
5736 for (auto& pair : R.MaxLocalUsers) {
5737 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5738 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5739 << " registers of "
5740 << TTI.getRegisterClassName(pair.first) << " register class\n");
5741 if (VF.isScalar()) {
5742 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5743 TargetNumRegisters = ForceTargetNumScalarRegs;
5744 } else {
5745 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5746 TargetNumRegisters = ForceTargetNumVectorRegs;
5747 }
5748 unsigned MaxLocalUsers = pair.second;
5749 unsigned LoopInvariantRegs = 0;
5750 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5751 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5752
5753 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5754 // Don't count the induction variable as interleaved.
5755 if (EnableIndVarRegisterHeur) {
5756 TmpIC =
5757 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5758 std::max(1U, (MaxLocalUsers - 1)));
5759 }
5760
5761 IC = std::min(IC, TmpIC);
5762 }
5763
5764 // Clamp the interleave ranges to reasonable counts.
5765 unsigned MaxInterleaveCount =
5766 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5767
5768 // Check if the user has overridden the max.
5769 if (VF.isScalar()) {
5770 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5771 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5772 } else {
5773 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5774 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5775 }
5776
5777 // If trip count is known or estimated compile time constant, limit the
5778 // interleave count to be less than the trip count divided by VF, provided it
5779 // is at least 1.
5780 //
5781 // For scalable vectors we can't know if interleaving is beneficial. It may
5782 // not be beneficial for small loops if none of the lanes in the second vector
5783 // iteration is enabled. However, for larger loops, there is likely to be a
5784 // similar benefit as for fixed-width vectors. For now, we choose to leave
5785 // the InterleaveCount as if vscale is '1', although if some information about
5786 // the vector is known (e.g. min vector size), we can make a better decision.
5787 if (BestKnownTC) {
5788 MaxInterleaveCount =
5789 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5790 // Make sure MaxInterleaveCount is greater than 0.
5791 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5792 }
5793
5794 assert(MaxInterleaveCount > 0 &&
5795 "Maximum interleave count must be greater than 0");
5796
5797 // Clamp the calculated IC to be between 1 and the max interleave count
5798 // that the target and trip count allow.
5799 if (IC > MaxInterleaveCount)
5800 IC = MaxInterleaveCount;
5801 else
5802 // Make sure IC is greater than 0.
5803 IC = std::max(1u, IC);
5804
5805 assert(IC > 0 && "Interleave count must be greater than 0.");
5806
5807 // Interleave if we vectorized this loop and there is a reduction that could
5808 // benefit from interleaving.
5809 if (VF.isVector() && HasReductions) {
5810 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5811 return IC;
5812 }
5813
5814 // For any scalar loop that either requires runtime checks or predication we
5815 // are better off leaving this to the unroller. Note that if we've already
5816 // vectorized the loop we will have done the runtime check and so interleaving
5817 // won't require further checks.
5818 bool ScalarInterleavingRequiresPredication =
5819 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5820 return Legal->blockNeedsPredication(BB);
5821 }));
5822 bool ScalarInterleavingRequiresRuntimePointerCheck =
5823 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5824
5825 // We want to interleave small loops in order to reduce the loop overhead and
5826 // potentially expose ILP opportunities.
5827 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5828 << "LV: IC is " << IC << '\n'
5829 << "LV: VF is " << VF << '\n');
5830 const bool AggressivelyInterleaveReductions =
5831 TTI.enableAggressiveInterleaving(HasReductions);
5832 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5833 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5834 // We assume that the cost overhead is 1 and we use the cost model
5835 // to estimate the cost of the loop and interleave until the cost of the
5836 // loop overhead is about 5% of the cost of the loop.
5837 unsigned SmallIC = std::min(
5838 IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5839
5840 // Interleave until store/load ports (estimated by max interleave count) are
5841 // saturated.
5842 unsigned NumStores = Legal->getNumStores();
5843 unsigned NumLoads = Legal->getNumLoads();
5844 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5845 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5846
5847 // There is little point in interleaving for reductions containing selects
5848 // and compares when VF=1 since it may just create more overhead than it's
5849 // worth for loops with small trip counts. This is because we still have to
5850 // do the final reduction after the loop.
5851 bool HasSelectCmpReductions =
5852 HasReductions &&
5853 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5854 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5855 return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5856 RdxDesc.getRecurrenceKind());
5857 });
5858 if (HasSelectCmpReductions) {
5859 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5860 return 1;
5861 }
5862
5863 // If we have a scalar reduction (vector reductions are already dealt with
5864 // by this point), we can increase the critical path length if the loop
5865 // we're interleaving is inside another loop. For tree-wise reductions
5866 // set the limit to 2, and for ordered reductions it's best to disable
5867 // interleaving entirely.
5868 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5869 bool HasOrderedReductions =
5870 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5871 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5872 return RdxDesc.isOrdered();
5873 });
5874 if (HasOrderedReductions) {
5875 LLVM_DEBUG(
5876 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5877 return 1;
5878 }
5879
5880 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5881 SmallIC = std::min(SmallIC, F);
5882 StoresIC = std::min(StoresIC, F);
5883 LoadsIC = std::min(LoadsIC, F);
5884 }
5885
5886 if (EnableLoadStoreRuntimeInterleave &&
5887 std::max(StoresIC, LoadsIC) > SmallIC) {
5888 LLVM_DEBUG(
5889 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5890 return std::max(StoresIC, LoadsIC);
5891 }
5892
5893 // If there are scalar reductions and TTI has enabled aggressive
5894 // interleaving for reductions, we will interleave to expose ILP.
5895 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5896 AggressivelyInterleaveReductions) {
5897 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5898 // Interleave no less than SmallIC but not as aggressive as the normal IC
5899 // to satisfy the rare situation when resources are too limited.
5900 return std::max(IC / 2, SmallIC);
5901 } else {
5902 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5903 return SmallIC;
5904 }
5905 }
5906
5907 // Interleave if this is a large loop (small loops are already dealt with by
5908 // this point) that could benefit from interleaving.
5909 if (AggressivelyInterleaveReductions) {
5910 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5911 return IC;
5912 }
5913
5914 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5915 return 1;
5916}
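
The register-pressure formula that selectInterleaveCount applies per register class can be illustrated with a standalone sketch (not part of LoopVectorize.cpp); the register counts below are made-up values.

#include <iostream>

// Largest power of two less than or equal to X (X must be non-zero).
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  const unsigned TargetNumRegisters = 32; // assumed registers in this class
  const unsigned LoopInvariantRegs = 2;   // assumed loop-invariant values kept live
  const unsigned MaxLocalUsers = 6;       // assumed peak in-loop register demand

  // Registers left over after invariants are shared by every interleaved copy;
  // dividing by the per-copy demand and rounding down to a power of two gives
  // (32 - 2) / 6 = 5 -> IC = 4 copies before spills become likely.
  const unsigned IC =
      powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
  std::cout << "Interleave count: " << IC << "\n";
  return 0;
}
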
5917
5918SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5919LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5920 // This function calculates the register usage by measuring the highest number
5921 // of values that are alive at a single location. Obviously, this is a very
5922 // rough estimation. We scan the loop in a topological order and
5923 // assign a number to each instruction. We use RPO to ensure that defs are
5924 // met before their users. We assume that each instruction that has in-loop
5925 // users starts an interval. We record every time that an in-loop value is
5926 // used, so we have a list of the first and last occurrences of each
5927 // instruction. Next, we transpose this data structure into a multi map that
5928 // holds the list of intervals that *end* at a specific location. This multi
5929 // map allows us to perform a linear search. We scan the instructions linearly
5930 // and record each time that a new interval starts, by placing it in a set.
5931 // If we find this value in the multi-map then we remove it from the set.
5932 // The max register usage is the maximum size of the set.
5933 // We also search for instructions that are defined outside the loop, but are
5934 // used inside the loop. We need this number separately from the max-interval
5935 // usage number because when we unroll, loop-invariant values do not take
5936 // more registers.
5937 LoopBlocksDFS DFS(TheLoop);
5938 DFS.perform(LI);
5939
5940 RegisterUsage RU;
5941
5942 // Each 'key' in the map opens a new interval. The values
5943 // of the map are the index of the 'last seen' usage of the
5944 // instruction that is the key.
5945 using IntervalMap = DenseMap<Instruction *, unsigned>;
5946
5947 // Maps instruction to its index.
5948 SmallVector<Instruction *, 64> IdxToInstr;
5949 // Marks the end of each interval.
5950 IntervalMap EndPoint;
5951 // Saves the list of instruction indices that are used in the loop.
5952 SmallPtrSet<Instruction *, 8> Ends;
5953 // Saves the list of values that are used in the loop but are defined outside
5954 // the loop (not including non-instruction values such as arguments and
5955 // constants).
5956 SmallPtrSet<Value *, 8> LoopInvariants;
5957
5958 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5959 for (Instruction &I : BB->instructionsWithoutDebug()) {
5960 IdxToInstr.push_back(&I);
5961
5962 // Save the end location of each USE.
5963 for (Value *U : I.operands()) {
5964 auto *Instr = dyn_cast<Instruction>(U);
5965
5966 // Ignore non-instruction values such as arguments, constants, etc.
5967 // FIXME: Might need some motivation why these values are ignored. If
5968 // for example an argument is used inside the loop it will increase the
5969 // register pressure (so shouldn't we add it to LoopInvariants).
5970 if (!Instr)
5971 continue;
5972
5973 // If this instruction is outside the loop then record it and continue.
5974 if (!TheLoop->contains(Instr)) {
5975 LoopInvariants.insert(Instr);
5976 continue;
5977 }
5978
5979 // Overwrite previous end points.
5980 EndPoint[Instr] = IdxToInstr.size();
5981 Ends.insert(Instr);
5982 }
5983 }
5984 }
5985
5986 // Saves the list of intervals that end with the index in 'key'.
5987 using InstrList = SmallVector<Instruction *, 2>;
5988 DenseMap<unsigned, InstrList> TransposeEnds;
5989
5990 // Transpose the EndPoints to a list of values that end at each index.
5991 for (auto &Interval : EndPoint)
5992 TransposeEnds[Interval.second].push_back(Interval.first);
5993
5994 SmallPtrSet<Instruction *, 8> OpenIntervals;
5995 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5996 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5997
5998 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5999
6000 const auto &TTICapture = TTI;
6001 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6002 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6003 return 0;
6004 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6005 };
6006
6007 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6008 Instruction *I = IdxToInstr[i];
6009
6010 // Remove all of the instructions that end at this location.
6011 InstrList &List = TransposeEnds[i];
6012 for (Instruction *ToRemove : List)
6013 OpenIntervals.erase(ToRemove);
6014
6015 // Ignore instructions that are never used within the loop.
6016 if (!Ends.count(I))
6017 continue;
6018
6019 // Skip ignored values.
6020 if (ValuesToIgnore.count(I))
6021 continue;
6022
6023 // For each VF find the maximum usage of registers.
6024 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6025 // Count the number of registers used, per register class, given all open
6026 // intervals.
6027 // Note that elements in this SmallMapVector will be default constructed
6028 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6029 // there is no previous entry for ClassID.
6030 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6031
6032 if (VFs[j].isScalar()) {
6033 for (auto *Inst : OpenIntervals) {
6034 unsigned ClassID =
6035 TTI.getRegisterClassForType(false, Inst->getType());
6036 // FIXME: The target might use more than one register for the type
6037 // even in the scalar case.
6038 RegUsage[ClassID] += 1;
6039 }
6040 } else {
6041 collectUniformsAndScalars(VFs[j]);
6042 for (auto *Inst : OpenIntervals) {
6043 // Skip ignored values for VF > 1.
6044 if (VecValuesToIgnore.count(Inst))
6045 continue;
6046 if (isScalarAfterVectorization(Inst, VFs[j])) {
6047 unsigned ClassID =
6048 TTI.getRegisterClassForType(false, Inst->getType());
6049 // FIXME: The target might use more than one register for the type
6050 // even in the scalar case.
6051 RegUsage[ClassID] += 1;
6052 } else {
6053 unsigned ClassID =
6054 TTI.getRegisterClassForType(true, Inst->getType());
6055 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6056 }
6057 }
6058 }
6059
6060 for (auto& pair : RegUsage) {
6061 auto &Entry = MaxUsages[j][pair.first];
6062 Entry = std::max(Entry, pair.second);
6063 }
6064 }
6065
6066 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6067 << OpenIntervals.size() << '\n');
6068
6069 // Add the current instruction to the list of open intervals.
6070 OpenIntervals.insert(I);
6071 }
6072
6073 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6074 // Note that elements in this SmallMapVector will be default constructed
6075 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6076 // there is no previous entry for ClassID.
6077 SmallMapVector<unsigned, unsigned, 4> Invariant;
6078
6079 for (auto *Inst : LoopInvariants) {
6080 // FIXME: The target might use more than one register for the type
6081 // even in the scalar case.
6082 unsigned Usage =
6083 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6084 unsigned ClassID =
6085 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6086 Invariant[ClassID] += Usage;
6087 }
6088
6089 LLVM_DEBUG({
6090 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6091 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6092 << " item\n";
6093 for (const auto &pair : MaxUsages[i]) {
6094 dbgs() << "LV(REG): RegisterClass: "
6095 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6096 << " registers\n";
6097 }
6098 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6099 << " item\n";
6100 for (const auto &pair : Invariant) {
6101 dbgs() << "LV(REG): RegisterClass: "
6102 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6103 << " registers\n";
6104 }
6105 });
6106
6107 RU.LoopInvariantRegs = Invariant;
6108 RU.MaxLocalUsers = MaxUsages[i];
6109 RUs[i] = RU;
6110 }
6111
6112 return RUs;
6113}
6114
6115bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6116 ElementCount VF) {
6117 // TODO: Cost model for emulated masked load/store is completely
6118 // broken. This hack guides the cost model to use an artificially
6119 // high enough value to practically disable vectorization with such
6120 // operations, except where previously deployed legality hack allowed
6121 // using very low cost values. This is to avoid regressions coming simply
6122 // from moving "masked load/store" check from legality to cost model.
6123 // Masked Load/Gather emulation was previously never allowed.
6124 // Limited number of Masked Store/Scatter emulation was allowed.
6125 assert((isPredicatedInst(I)) &&
6126 "Expecting a scalar emulated instruction");
6127 return isa<LoadInst>(I) ||
6128 (isa<StoreInst>(I) &&
6129 NumPredStores > NumberOfStoresToPredicate);
6130}
6131
6132void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6133 // If we aren't vectorizing the loop, or if we've already collected the
6134 // instructions to scalarize, there's nothing to do. Collection may already
6135 // have occurred if we have a user-selected VF and are now computing the
6136 // expected cost for interleaving.
6137 if (VF.isScalar() || VF.isZero() ||
6138 InstsToScalarize.find(VF) != InstsToScalarize.end())
6139 return;
6140
6141 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6142 // not profitable to scalarize any instructions, the presence of VF in the
6143 // map will indicate that we've analyzed it already.
6144 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6145
6146 PredicatedBBsAfterVectorization[VF].clear();
6147
6148 // Find all the instructions that are scalar with predication in the loop and
6149 // determine if it would be better to not if-convert the blocks they are in.
6150 // If so, we also record the instructions to scalarize.
6151 for (BasicBlock *BB : TheLoop->blocks()) {
6152 if (!blockNeedsPredicationForAnyReason(BB))
6153 continue;
6154 for (Instruction &I : *BB)
6155 if (isScalarWithPredication(&I, VF)) {
6156 ScalarCostsTy ScalarCosts;
6157 // Do not apply discount if scalable, because that would lead to
6158 // invalid scalarization costs.
6159 // Do not apply discount logic if hacked cost is needed
6160 // for emulated masked memrefs.
6161 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6162 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6163 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6164 // Remember that BB will remain after vectorization.
6165 PredicatedBBsAfterVectorization[VF].insert(BB);
6166 }
6167 }
6168}
6169
6170InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6171 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6172 assert(!isUniformAfterVectorization(PredInst, VF) &&
6173 "Instruction marked uniform-after-vectorization will be predicated");
6174
6175 // Initialize the discount to zero, meaning that the scalar version and the
6176 // vector version cost the same.
6177 InstructionCost Discount = 0;
6178
6179 // Holds instructions to analyze. The instructions we visit are mapped in
6180 // ScalarCosts. Those instructions are the ones that would be scalarized if
6181 // we find that the scalar version costs less.
6182 SmallVector<Instruction *, 8> Worklist;
6183
6184 // Returns true if the given instruction can be scalarized.
6185 auto canBeScalarized = [&](Instruction *I) -> bool {
6186 // We only attempt to scalarize instructions forming a single-use chain
6187 // from the original predicated block that would otherwise be vectorized.
6188 // Although not strictly necessary, we give up on instructions we know will
6189 // already be scalar to avoid traversing chains that are unlikely to be
6190 // beneficial.
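// For illustration, assuming a hypothetical predicated block containing
//   %mul = mul i32 %x, %y    ; only use is %div, defined in the same block
//   %div = sdiv i32 %mul, %z ; the predicated instruction
// the 'mul' forms a single-use chain feeding the predicated 'sdiv' and can be
// considered for scalarization together with it, whereas a value with more
// than one use, a definition in another block, or an instruction that is
// already scalar after vectorization stops the chain here.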
6191 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6192 isScalarAfterVectorization(I, VF))
6193 return false;
6194
6195 // If the instruction is scalar with predication, it will be analyzed
6196 // separately. We ignore it within the context of PredInst.
6197 if (isScalarWithPredication(I, VF))
6198 return false;
6199
6200 // If any of the instruction's operands are uniform after vectorization,
6201 // the instruction cannot be scalarized. This prevents, for example, a
6202 // masked load from being scalarized.
6203 //
6204 // We assume we will only emit a value for lane zero of an instruction
6205 // marked uniform after vectorization, rather than VF identical values.
6206 // Thus, if we scalarize an instruction that uses a uniform, we would
6207 // create uses of values corresponding to the lanes we aren't emitting code
6208 // for. This behavior can be changed by allowing getScalarValue to clone
6209 // the lane zero values for uniforms rather than asserting.
6210 for (Use &U : I->operands())
6211 if (auto *J = dyn_cast<Instruction>(U.get()))
6212 if (isUniformAfterVectorization(J, VF))
6213 return false;
6214
6215 // Otherwise, we can scalarize the instruction.
6216 return true;
6217 };
6218
6219 // Compute the expected cost discount from scalarizing the entire expression
6220 // feeding the predicated instruction. We currently only consider expressions
6221 // that are single-use instruction chains.
6222 Worklist.push_back(PredInst);
6223 while (!Worklist.empty()) {
6224 Instruction *I = Worklist.pop_back_val();
6225
6226 // If we've already analyzed the instruction, there's nothing to do.
6227 if (ScalarCosts.find(I) != ScalarCosts.end())
6228 continue;
6229
6230 // Compute the cost of the vector instruction. Note that this cost already
6231 // includes the scalarization overhead of the predicated instruction.
6232 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6233
6234 // Compute the cost of the scalarized instruction. This cost is the cost of
6235 // the instruction as if it wasn't if-converted and instead remained in the
6236 // predicated block. We will scale this cost by block probability after
6237 // computing the scalarization overhead.
6238 InstructionCost ScalarCost =
6239 VF.getFixedValue() *
6240 getInstructionCost(I, ElementCount::getFixed(1)).first;
6241
6242 // Compute the scalarization overhead of needed insertelement instructions
6243 // and phi nodes.
6244 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6245 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6246 ScalarCost += TTI.getScalarizationOverhead(
6247 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6248 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6249 /*Extract*/ false, CostKind);
6250 ScalarCost +=
6251 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6252 }
6253
6254 // Compute the scalarization overhead of needed extractelement
6255 // instructions. For each of the instruction's operands, if the operand can
6256 // be scalarized, add it to the worklist; otherwise, account for the
6257 // overhead.
6258 for (Use &U : I->operands())
6259 if (auto *J = dyn_cast<Instruction>(U.get())) {
6260 assert(VectorType::isValidElementType(J->getType()) &&
6261 "Instruction has non-scalar type");
6262 if (canBeScalarized(J))
6263 Worklist.push_back(J);
6264 else if (needsExtract(J, VF)) {
6265 ScalarCost += TTI.getScalarizationOverhead(
6266 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6267 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6268 /*Extract*/ true, CostKind);
6269 }
6270 }
6271
6272 // Scale the total scalar cost by block probability.
6273 ScalarCost /= getReciprocalPredBlockProb();
6274
6275 // Compute the discount. A non-negative discount means the vector version
6276 // of the instruction costs more, and scalarizing would be beneficial.
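// As a rough illustrative calculation (hypothetical costs): with VF = 4, a
// vector cost of 20 for the predicated instruction and a per-lane scalar cost
// of 4, the scalar estimate is (4 * 4) / 2 = 8 once scaled by the default
// reciprocal block probability of 2 (ignoring insert/extract and phi
// overhead), so this instruction would contribute a discount of 20 - 8 = 12.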
6277 Discount += VectorCost - ScalarCost;
6278 ScalarCosts[I] = ScalarCost;
6279 }
6280
6281 return Discount;
6282}
6283
6284LoopVectorizationCostModel::VectorizationCostTy
6285LoopVectorizationCostModel::expectedCost(
6286 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6287 VectorizationCostTy Cost;
6288
6289 // For each block.
6290 for (BasicBlock *BB : TheLoop->blocks()) {
6291 VectorizationCostTy BlockCost;
6292
6293 // For each instruction in the old loop.
6294 for (Instruction &I : BB->instructionsWithoutDebug()) {
6295 // Skip ignored values.
6296 if (ValuesToIgnore.count(&I) ||
6297 (VF.isVector() && VecValuesToIgnore.count(&I)))
6298 continue;
6299
6300 VectorizationCostTy C = getInstructionCost(&I, VF);
6301
6302 // Check if we should override the cost.
6303 if (C.first.isValid() &&
6304 ForceTargetInstructionCost.getNumOccurrences() > 0)
6305 C.first = InstructionCost(ForceTargetInstructionCost);
6306
6307 // Keep a list of instructions with invalid costs.
6308 if (Invalid && !C.first.isValid())
6309 Invalid->emplace_back(&I, VF);
6310
6311 BlockCost.first += C.first;
6312 BlockCost.second |= C.second;
6313 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6314 << " for VF " << VF << " For instruction: " << I
6315 << '\n');
6316 }
6317
6318 // If we are vectorizing a predicated block, it will have been
6319 // if-converted. This means that the block's instructions (aside from
6320 // stores and instructions that may divide by zero) will now be
6321 // unconditionally executed. For the scalar case, we may not always execute
6322 // the predicated block, if it is an if-else block. Thus, scale the block's
6323 // cost by the probability of executing it. blockNeedsPredication from
6324 // Legal is used so as to not include all blocks in tail folded loops.
6325 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6326 BlockCost.first /= getReciprocalPredBlockProb();
6327
6328 Cost.first += BlockCost.first;
6329 Cost.second |= BlockCost.second;
6330 }
6331
6332 return Cost;
6333}
6334
6335/// Gets Address Access SCEV after verifying that the access pattern
6336/// is loop invariant except the induction variable dependence.
6337///
6338/// This SCEV can be sent to the Target in order to estimate the address
6339/// calculation cost.
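///
/// For example, a hypothetical access such as
///   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
/// with loop-invariant %base and induction variable %iv passes the check, and
/// its SCEV (roughly {%base,+,4}) can be priced by the target, whereas a GEP
/// with a varying, non-induction index makes this helper return nullptr.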
6340static const SCEV *getAddressAccessSCEV(
6341 Value *Ptr,
6342 LoopVectorizationLegality *Legal,
6343 PredicatedScalarEvolution &PSE,
6344 const Loop *TheLoop) {
6345
6346 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6347 if (!Gep)
6348 return nullptr;
6349
6350 // We are looking for a gep with all loop invariant indices except for one
6351 // which should be an induction variable.
6352 auto SE = PSE.getSE();
6353 unsigned NumOperands = Gep->getNumOperands();
6354 for (unsigned i = 1; i < NumOperands; ++i) {
6355 Value *Opd = Gep->getOperand(i);
6356 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6357 !Legal->isInductionVariable(Opd))
6358 return nullptr;
6359 }
6360
6361 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6362 return PSE.getSCEV(Ptr);
6363}
6364
6365static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6366 return Legal->hasStride(I->getOperand(0)) ||
6367 Legal->hasStride(I->getOperand(1));
6368}
6369
6370InstructionCost
6371LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6372 ElementCount VF) {
6373 assert(VF.isVector() &&
6374 "Scalarization cost of instruction implies vectorization.");
6375 if (VF.isScalable())
6376 return InstructionCost::getInvalid();
6377
6378 Type *ValTy = getLoadStoreType(I);
6379 auto SE = PSE.getSE();
6380
6381 unsigned AS = getLoadStoreAddressSpace(I);
6382 Value *Ptr = getLoadStorePointerOperand(I);
6383 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6384 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6385 // that it is being called from this specific place.
6386
6387 // Figure out whether the access is strided and get the stride value
6388 // if it's known at compile time.
6389 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6390
6391 // Get the cost of the scalar memory instruction and address computation.
6392 InstructionCost Cost =
6393 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6394
6395 // Don't pass *I here, since it is scalar but will actually be part of a
6396 // vectorized loop where the user of it is a vectorized instruction.
6397 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6398 const Align Alignment = getLoadStoreAlignment(I);
6399 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6400 ValTy->getScalarType(),
6401 Alignment, AS, CostKind);
6402
6403 // Get the overhead of the extractelement and insertelement instructions
6404 // we might create due to scalarization.
6405 Cost += getScalarizationOverhead(I, VF, CostKind);
6406
6407 // If we have a predicated load/store, it will need extra i1 extracts and
6408 // conditional branches, but may not be executed for each vector lane. Scale
6409 // the cost by the probability of executing the predicated block.
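// As an illustrative sketch with hypothetical unit costs: for VF = 4, an
// address-computation cost of 1 and a scalar load cost of 1, the base cost
// above is 4 * 1 + 4 * 1 = 8 plus insert/extract overhead; a predicated
// access is then divided by the reciprocal block probability (2 by default)
// and charged for the i1 extracts and branches, unless the emulated-mask
// hack below pins the cost at 3000000.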
6410 if (isPredicatedInst(I)) {
6411 Cost /= getReciprocalPredBlockProb();
6412
6413 // Add the cost of an i1 extract and a branch
6414 auto *Vec_i1Ty =
6415 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6416 Cost += TTI.getScalarizationOverhead(
6417 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6418 /*Insert=*/false, /*Extract=*/true, CostKind);
6419 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6420
6421 if (useEmulatedMaskMemRefHack(I, VF))
6422 // Artificially setting to a high enough value to practically disable
6423 // vectorization with such operations.
6424 Cost = 3000000;
6425 }
6426
6427 return Cost;
6428}
6429
6430InstructionCost
6431LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6432 ElementCount VF) {
6433 Type *ValTy = getLoadStoreType(I);
6434 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6435 Value *Ptr = getLoadStorePointerOperand(I);
6436 unsigned AS = getLoadStoreAddressSpace(I);
6437 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6438 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6439
6440 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6441 "Stride should be 1 or -1 for consecutive memory access");
6442 const Align Alignment = getLoadStoreAlignment(I);
6443 InstructionCost Cost = 0;
6444 if (Legal->isMaskRequired(I)) {
6445 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6446 CostKind);
6447 } else {
6448 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6449 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6450 CostKind, OpInfo, I);
6451 }
6452
6453 bool Reverse = ConsecutiveStride < 0;
6454 if (Reverse)
6455 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6456 std::nullopt, CostKind, 0);
6457 return Cost;
6458}
6459
6460InstructionCost
6461LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6462 ElementCount VF) {
6463 assert(Legal->isUniformMemOp(*I));
6464
6465 Type *ValTy = getLoadStoreType(I);
6466 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6467 const Align Alignment = getLoadStoreAlignment(I);
6468 unsigned AS = getLoadStoreAddressSpace(I);
6469 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
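// For example (hypothetical IR): a uniform load through a loop-invariant
// pointer is priced below as one scalar load plus one SK_Broadcast shuffle of
// the loaded value; a uniform store of a loop-invariant value needs only the
// scalar store, while storing a varying value additionally pays for
// extracting the last lane (VF - 1).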
6470 if (isa<LoadInst>(I)) {
6471 return TTI.getAddressComputationCost(ValTy) +
6472 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6473 CostKind) +
6474 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6475 }
6476 StoreInst *SI = cast<StoreInst>(I);
6477
6478 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6479 return TTI.getAddressComputationCost(ValTy) +
6480 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6481 CostKind) +
6482 (isLoopInvariantStoreValue
6483 ? 0
6484 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6485 CostKind, VF.getKnownMinValue() - 1));
6486}
6487
6488InstructionCost
6489LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6490 ElementCount VF) {
6491 Type *ValTy = getLoadStoreType(I);
6492 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6493 const Align Alignment = getLoadStoreAlignment(I);
6494 const Value *Ptr = getLoadStorePointerOperand(I);
6495
6496 return TTI.getAddressComputationCost(VectorTy) +
6497 TTI.getGatherScatterOpCost(
6498 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6499 TargetTransformInfo::TCK_RecipThroughput, I);
6500}
6501
6502InstructionCost
6503LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6504 ElementCount VF) {
6505 // TODO: Once we have support for interleaving with scalable vectors
6506 // we can calculate the cost properly here.
6507 if (VF.isScalable())
6508 return InstructionCost::getInvalid();
6509
6510 Type *ValTy = getLoadStoreType(I);
6511 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6512 unsigned AS = getLoadStoreAddressSpace(I);
6513 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6514
6515 auto Group = getInterleavedAccessGroup(I);
6516 assert(Group && "Fail to get an interleaved access group.");
6517
6518 unsigned InterleaveFactor = Group->getFactor();
6519 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6520
6521 // Holds the indices of existing members in the interleaved group.
6522 SmallVector<unsigned, 4> Indices;
6523 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6524 if (Group->getMember(IF))
6525 Indices.push_back(IF);
6526
6527 // Calculate the cost of the whole interleaved group.
6528 bool UseMaskForGaps =
6529 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6530 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6531 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6532 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6533 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6534
6535 if (Group->isReverse()) {
6536 // TODO: Add support for reversed masked interleaved access.
6537 assert(!Legal->isMaskRequired(I) &&
6538 "Reverse masked interleaved access not supported.");
6539 Cost += Group->getNumMembers() *
6540 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6541 std::nullopt, CostKind, 0);
6542 }
6543 return Cost;
6544}
6545
6546std::optional<InstructionCost>
6547LoopVectorizationCostModel::getReductionPatternCost(
6548 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6549 using namespace llvm::PatternMatch;
6550 // Early exit for no inloop reductions
6551 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6552 return std::nullopt;
6553 auto *VectorTy = cast<VectorType>(Ty);
6554
6555 // We are looking for a pattern of, and finding the minimal acceptable cost:
6556 // reduce(mul(ext(A), ext(B))) or
6557 // reduce(mul(A, B)) or
6558 // reduce(ext(A)) or
6559 // reduce(A).
6560 // The basic idea is that we walk down the tree to do that, finding the root
6561 // reduction instruction in InLoopReductionImmediateChains. From there we find
6562 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6563 // of the components. If the reduction cost is lower then we return it for the
6564 // reduction instruction and 0 for the other instructions in the pattern. If
6565 // it is not, we return an invalid cost specifying that the original cost
6566 // method should be used.
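// For example, a hypothetical source loop such as
//   for (i = 0; i < n; ++i) sum += (int32_t)a[i] * (int32_t)b[i];  // a, b: int8_t
// lowers to one of the reduce.add(mul(ext(A), ext(B))) shapes above, and
// getMulAccReductionCost may price it as a single multiply-accumulate style
// reduction when the target supports it.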
6567 Instruction *RetI = I;
6568 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6569 if (!RetI->hasOneUser())
6570 return std::nullopt;
6571 RetI = RetI->user_back();
6572 }
6573
6574 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6575 RetI->user_back()->getOpcode() == Instruction::Add) {
6576 RetI = RetI->user_back();
6577 }
6578
6579 // Test if the found instruction is a reduction, and if not return an invalid
6580 // cost specifying the parent to use the original cost modelling.
6581 if (!InLoopReductionImmediateChains.count(RetI))
6582 return std::nullopt;
6583
6584 // Find the reduction this chain is a part of and calculate the basic cost of
6585 // the reduction on its own.
6586 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6587 Instruction *ReductionPhi = LastChain;
6588 while (!isa<PHINode>(ReductionPhi))
6589 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6590
6591 const RecurrenceDescriptor &RdxDesc =
6592 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6593
6594 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6595 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6596
6597 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6598 // normal fmul instruction to the cost of the fadd reduction.
6599 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6600 BaseCost +=
6601 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6602
6603 // If we're using ordered reductions then we can just return the base cost
6604 // here, since getArithmeticReductionCost calculates the full ordered
6605 // reduction cost when FP reassociation is not allowed.
6606 if (useOrderedReductions(RdxDesc))
6607 return BaseCost;
6608
6609 // Get the operand that was not the reduction chain and match it to one of the
6610 // patterns, returning the better cost if it is found.
6611 Instruction *RedOp = RetI->getOperand(1) == LastChain
6612 ? dyn_cast<Instruction>(RetI->getOperand(0))
6613 : dyn_cast<Instruction>(RetI->getOperand(1));
6614
6615 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6616
6617 Instruction *Op0, *Op1;
6618 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6619 match(RedOp,
6620 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6621 match(Op0, m_ZExtOrSExt(m_Value())) &&
6622 Op0->getOpcode() == Op1->getOpcode() &&
6623 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6624 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6625 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6626
6627 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6628 // Note that the extend opcodes need to all match, or if A==B they will have
6629 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6630 // which is equally fine.
6631 bool IsUnsigned = isa<ZExtInst>(Op0);
6632 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6633 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6634
6635 InstructionCost ExtCost =
6636 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6637 TTI::CastContextHint::None, CostKind, Op0);
6638 InstructionCost MulCost =
6639 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6640 InstructionCost Ext2Cost =
6641 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6642 TTI::CastContextHint::None, CostKind, RedOp);
6643
6644 InstructionCost RedCost = TTI.getMulAccReductionCost(
6645 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6646
6647 if (RedCost.isValid() &&
6648 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6649 return I == RetI ? RedCost : 0;
6650 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6651 !TheLoop->isLoopInvariant(RedOp)) {
6652 // Matched reduce(ext(A))
6653 bool IsUnsigned = isa<ZExtInst>(RedOp);
6654 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6655 InstructionCost RedCost = TTI.getExtendedReductionCost(
6656 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6657 RdxDesc.getFastMathFlags(), CostKind);
6658
6659 InstructionCost ExtCost =
6660 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6661 TTI::CastContextHint::None, CostKind, RedOp);
6662 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6663 return I == RetI ? RedCost : 0;
6664 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6665 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6666 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6667 Op0->getOpcode() == Op1->getOpcode() &&
6668 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6669 bool IsUnsigned = isa<ZExtInst>(Op0);
6670 Type *Op0Ty = Op0->getOperand(0)->getType();
6671 Type *Op1Ty = Op1->getOperand(0)->getType();
6672 Type *LargestOpTy =
6673 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6674 : Op0Ty;
6675 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6676
6677 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6678 // different sizes. We take the largest type as the ext to reduce, and add
6679 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6680 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6681 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6682 TTI::CastContextHint::None, CostKind, Op0);
6683 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6684 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6685 TTI::CastContextHint::None, CostKind, Op1);
6686 InstructionCost MulCost =
6687 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6688
6689 InstructionCost RedCost = TTI.getMulAccReductionCost(
6690 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6691 InstructionCost ExtraExtCost = 0;
6692 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6693 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6694 ExtraExtCost = TTI.getCastInstrCost(
6695 ExtraExtOp->getOpcode(), ExtType,
6696 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6697 TTI::CastContextHint::None, CostKind, ExtraExtOp);
6698 }
6699
6700 if (RedCost.isValid() &&
6701 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6702 return I == RetI ? RedCost : 0;
6703 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6704 // Matched reduce.add(mul())
6705 InstructionCost MulCost =
6706 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6707
6708 InstructionCost RedCost = TTI.getMulAccReductionCost(
6709 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6710
6711 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6712 return I == RetI ? RedCost : 0;
6713 }
6714 }
6715
6716 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6717}
6718
6719InstructionCost
6720LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6721 ElementCount VF) {
6722 // Calculate scalar cost only. Vectorization cost should be ready at this
6723 // moment.
6724 if (VF.isScalar()) {
6725 Type *ValTy = getLoadStoreType(I);
6726 const Align Alignment = getLoadStoreAlignment(I);
6727 unsigned AS = getLoadStoreAddressSpace(I);
6728
6729 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6730 return TTI.getAddressComputationCost(ValTy) +
6731 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6732 TTI::TCK_RecipThroughput, OpInfo, I);
6733 }
6734 return getWideningCost(I, VF);
6735}
6736
6737LoopVectorizationCostModel::VectorizationCostTy
6738LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6739 ElementCount VF) {
6740 // If we know that this instruction will remain uniform, check the cost of
6741 // the scalar version.
6742 if (isUniformAfterVectorization(I, VF))
6743 VF = ElementCount::getFixed(1);
6744
6745 if (VF.isVector() && isProfitableToScalarize(I, VF))
6746 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6747
6748 // Forced scalars do not have any scalarization overhead.
6749 auto ForcedScalar = ForcedScalars.find(VF);
6750 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6751 auto InstSet = ForcedScalar->second;
6752 if (InstSet.count(I))
6753 return VectorizationCostTy(
6754 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6755 VF.getKnownMinValue()),
6756 false);
6757 }
6758
6759 Type *VectorTy;
6760 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6761
6762 bool TypeNotScalarized = false;
6763 if (VF.isVector() && VectorTy->isVectorTy()) {
6764 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6765 if (VF.isScalable())
6766 // <vscale x 1 x iN> is assumed to be profitable over iN because
6767 // scalable registers are a distinct register class from scalar ones.
6768 // If we ever find a target which wants to lower scalable vectors
6769 // back to scalars, we'll need to update this code to explicitly
6770 // ask TTI about the register class uses for each part.
6771 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6772 else
6773 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6774 } else
6775 C = InstructionCost::getInvalid();
6776 }
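// Illustrative example (hypothetical target): with a fixed VF of 8 and an
// <8 x i64> result on a machine with 256-bit vector registers,
// getNumberOfParts returns 2, and 2 < 8 means the type is treated as not
// scalarized; a NumParts of 0 instead marks the cost invalid above.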
6777 return VectorizationCostTy(C, TypeNotScalarized);
6778}
6779
6780InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6781 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6782
6783 // There is no mechanism yet to create a scalable scalarization loop,
6784 // so this is currently Invalid.
6785 if (VF.isScalable())
6786 return InstructionCost::getInvalid();
6787
6788 if (VF.isScalar())
6789 return 0;
6790
6791 InstructionCost Cost = 0;
6792 Type *RetTy = ToVectorTy(I->getType(), VF);
6793 if (!RetTy->isVoidTy() &&
6794 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6795 Cost += TTI.getScalarizationOverhead(
6796 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6797 /*Insert*/ true,
6798 /*Extract*/ false, CostKind);
6799
6800 // Some targets keep addresses scalar.
6801 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6802 return Cost;
6803
6804 // Some targets support efficient element stores.
6805 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6806 return Cost;
6807
6808 // Collect operands to consider.
6809 CallInst *CI = dyn_cast<CallInst>(I);
6810 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6811
6812 // Skip operands that do not require extraction/scalarization and do not incur
6813 // any overhead.
6814 SmallVector<Type *> Tys;
6815 for (auto *V : filterExtractingOperands(Ops, VF))
6816 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6817 return Cost + TTI.getOperandsScalarizationOverhead(
6818 filterExtractingOperands(Ops, VF), Tys, CostKind);
6819}
6820
6821void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6822 if (VF.isScalar())
6823 return;
6824 NumPredStores = 0;
6825 for (BasicBlock *BB : TheLoop->blocks()) {
6826 // For each instruction in the old loop.
6827 for (Instruction &I : *BB) {
6828 Value *Ptr = getLoadStorePointerOperand(&I);
6829 if (!Ptr)
6830 continue;
6831
6832 // TODO: We should generate better code and update the cost model for
6833 // predicated uniform stores. Today they are treated as any other
6834 // predicated store (see added test cases in
6835 // invariant-store-vectorization.ll).
6836 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6837 NumPredStores++;
6838
6839 if (Legal->isUniformMemOp(I)) {
6840 auto isLegalToScalarize = [&]() {
6841 if (!VF.isScalable())
6842 // Scalarization of fixed length vectors "just works".
6843 return true;
6844
6845 // We have dedicated lowering for unpredicated uniform loads and
6846 // stores. Note that even with tail folding we know that at least
6847 // one lane is active (i.e. generalized predication is not possible
6848 // here), and the logic below depends on this fact.
6849 if (!foldTailByMasking())
6850 return true;
6851
6852 // For scalable vectors, a uniform memop load is always
6853 // uniform-by-parts and we know how to scalarize that.
6854 if (isa<LoadInst>(I))
6855 return true;
6856
6857 // A uniform store isn't necessarily uniform-by-parts
6858 // and we can't assume scalarization.
6859 auto &SI = cast<StoreInst>(I);
6860 return TheLoop->isLoopInvariant(SI.getValueOperand());
6861 };
6862
6863 const InstructionCost GatherScatterCost =
6864 isLegalGatherOrScatter(&I, VF) ?
6865 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6866
6867 // Load: Scalar load + broadcast
6868 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6869 // FIXME: This cost is a significant under-estimate for tail folded
6870 // memory ops.
6871 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6872 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6873
6874 // Choose the better solution for the current VF. Note that Invalid
6875 // costs compare as maximally large. If both are invalid, we get an
6876 // invalid cost, which signals a failure and a vectorization abort.
6877 if (GatherScatterCost < ScalarizationCost)
6878 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6879 else
6880 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6881 continue;
6882 }
6883
6884 // We assume that widening is the best solution when possible.
6885 if (memoryInstructionCanBeWidened(&I, VF)) {
6886 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6887 int ConsecutiveStride = Legal->isConsecutivePtr(
6888 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6889 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6890 "Expected consecutive stride.");
6891 InstWidening Decision =
6892 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6893 setWideningDecision(&I, VF, Decision, Cost);
6894 continue;
6895 }
6896
6897 // Choose between Interleaving, Gather/Scatter or Scalarization.
6898 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6899 unsigned NumAccesses = 1;
6900 if (isAccessInterleaved(&I)) {
6901 auto Group = getInterleavedAccessGroup(&I);
6902 assert(Group && "Fail to get an interleaved access group.");
6903
6904 // Make one decision for the whole group.
6905 if (getWideningDecision(&I, VF) != CM_Unknown)
6906 continue;
6907
6908 NumAccesses = Group->getNumMembers();
6909 if (interleavedAccessCanBeWidened(&I, VF))
6910 InterleaveCost = getInterleaveGroupCost(&I, VF);
6911 }
6912
6913 InstructionCost GatherScatterCost =
6914 isLegalGatherOrScatter(&I, VF)
6915 ? getGatherScatterCost(&I, VF) * NumAccesses
6916 : InstructionCost::getInvalid();
6917
6918 InstructionCost ScalarizationCost =
6919 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6920
6921 // Choose the better solution for the current VF,
6922 // write down this decision and use it during vectorization.
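// For example, with hypothetical costs InterleaveCost = 8,
// GatherScatterCost = 12 and ScalarizationCost = 20, interleaving is chosen;
// a tie between interleaving and gather/scatter also favors interleaving
// because the first comparison uses <=.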
6923 InstructionCost Cost;
6924 InstWidening Decision;
6925 if (InterleaveCost <= GatherScatterCost &&
6926 InterleaveCost < ScalarizationCost) {
6927 Decision = CM_Interleave;
6928 Cost = InterleaveCost;
6929 } else if (GatherScatterCost < ScalarizationCost) {
6930 Decision = CM_GatherScatter;
6931 Cost = GatherScatterCost;
6932 } else {
6933 Decision = CM_Scalarize;
6934 Cost = ScalarizationCost;
6935 }
6936 // If the instruction belongs to an interleave group, the whole group
6937 // receives the same decision. The whole group receives the cost, but
6938 // the cost will actually be assigned to one instruction.
6939 if (auto Group = getInterleavedAccessGroup(&I))
6940 setWideningDecision(Group, VF, Decision, Cost);
6941 else
6942 setWideningDecision(&I, VF, Decision, Cost);
6943 }
6944 }
6945
6946 // Make sure that any load of address and any other address computation
6947 // remains scalar unless there is gather/scatter support. This avoids
6948 // inevitable extracts into address registers, and also has the benefit of
6949 // activating LSR more, since that pass can't optimize vectorized
6950 // addresses.
6951 if (TTI.prefersVectorizedAddressing())
6952 return;
6953
6954 // Start with all scalar pointer uses.
6955 SmallPtrSet<Instruction *, 8> AddrDefs;
6956 for (BasicBlock *BB : TheLoop->blocks())
6957 for (Instruction &I : *BB) {
6958 Instruction *PtrDef =
6959 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6960 if (PtrDef && TheLoop->contains(PtrDef) &&
6961 getWideningDecision(&I, VF) != CM_GatherScatter)
6962 AddrDefs.insert(PtrDef);
6963 }
6964
6965 // Add all instructions used to generate the addresses.
6966 SmallVector<Instruction *, 4> Worklist;
6967 append_range(Worklist, AddrDefs);
6968 while (!Worklist.empty()) {
6969 Instruction *I = Worklist.pop_back_val();
6970 for (auto &Op : I->operands())
6971 if (auto *InstOp = dyn_cast<Instruction>(Op))
6972 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6973 AddrDefs.insert(InstOp).second)
6974 Worklist.push_back(InstOp);
6975 }
6976
6977 for (auto *I : AddrDefs) {
6978 if (isa<LoadInst>(I)) {
6979 // Setting the desired widening decision should ideally be handled
6980 // by cost functions, but since this involves the task of finding out
6981 // if the loaded register is involved in an address computation, it is
6982 // instead changed here when we know this is the case.
6983 InstWidening Decision = getWideningDecision(I, VF);
6984 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6985 // Scalarize a widened load of address.
6986 setWideningDecision(
6987 I, VF, CM_Scalarize,
6988 (VF.getKnownMinValue() *
6989 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6990 else if (auto Group = getInterleavedAccessGroup(I)) {
6991 // Scalarize an interleave group of address loads.
6992 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6993 if (Instruction *Member = Group->getMember(I))
6994 setWideningDecision(
6995 Member, VF, CM_Scalarize,
6996 (VF.getKnownMinValue() *
6997 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6998 }
6999 }
7000 } else
7001 // Make sure I gets scalarized and a cost estimate without
7002 // scalarization overhead.
7003 ForcedScalars[VF].insert(I);
7004 }
7005}
7006
7007InstructionCost
7008LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7009 Type *&VectorTy) {
7010 Type *RetTy = I->getType();
7011 if (canTruncateToMinimalBitwidth(I, VF))
7012 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7013 auto SE = PSE.getSE();
7014 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7015
7016 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7017 ElementCount VF) -> bool {
7018 if (VF.isScalar())
7019 return true;
7020
7021 auto Scalarized = InstsToScalarize.find(VF);
7022 assert(Scalarized != InstsToScalarize.end() &&
7023 "VF not yet analyzed for scalarization profitability");
7024 return !Scalarized->second.count(I) &&
7025 llvm::all_of(I->users(), [&](User *U) {
7026 auto *UI = cast<Instruction>(U);
7027 return !Scalarized->second.count(UI);
7028 });
7029 };
7030 (void) hasSingleCopyAfterVectorization;
7031
7032 if (isScalarAfterVectorization(I, VF)) {
7033 // With the exception of GEPs and PHIs, after scalarization there should
7034 // only be one copy of the instruction generated in the loop. This is
7035 // because the VF is either 1, or any instructions that need scalarizing
7036 // have already been dealt with by the time we get here. As a result,
7037 // it means we don't have to multiply the instruction cost by VF.
7038 assert(I->getOpcode() == Instruction::GetElementPtr ||
7039 I->getOpcode() == Instruction::PHI ||
7040 (I->getOpcode() == Instruction::BitCast &&
7041 I->getType()->isPointerTy()) ||
7042 hasSingleCopyAfterVectorization(I, VF));
7043 VectorTy = RetTy;
7044 } else
7045 VectorTy = ToVectorTy(RetTy, VF);
7046
7047 // TODO: We need to estimate the cost of intrinsic calls.
7048 switch (I->getOpcode()) {
7049 case Instruction::GetElementPtr:
7050 // We mark this instruction as zero-cost because the cost of GEPs in
7051 // vectorized code depends on whether the corresponding memory instruction
7052 // is scalarized or not. Therefore, we handle GEPs with the memory
7053 // instruction cost.
7054 return 0;
7055 case Instruction::Br: {
7056 // In cases of scalarized and predicated instructions, there will be VF
7057 // predicated blocks in the vectorized loop. Each branch around these
7058 // blocks also requires an extract of its vector compare i1 element.
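// For example, at VF = 4 a conditional branch guarding such a block needs
// four i1 extracts (one per lane) plus four scalar branches, which is the
// cost returned below for the fixed-VF case.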
7059 bool ScalarPredicatedBB = false;
7060 BranchInst *BI = cast<BranchInst>(I);
7061 if (VF.isVector() && BI->isConditional() &&
7062 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7063 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7064 ScalarPredicatedBB = true;
7065
7066 if (ScalarPredicatedBB) {
7067 // Not possible to scalarize a scalable vector with predicated instructions.
7068 if (VF.isScalable())
7069 return InstructionCost::getInvalid();
7070 // Return cost for branches around scalarized and predicated blocks.
7071 auto *Vec_i1Ty =
7072 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7073 return (
7074 TTI.getScalarizationOverhead(
7075 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7076 /*Insert*/ false, /*Extract*/ true, CostKind) +
7077 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7078 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7079 // The back-edge branch will remain, as will all scalar branches.
7080 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7081 else
7082 // This branch will be eliminated by if-conversion.
7083 return 0;
7084 // Note: We currently assume zero cost for an unconditional branch inside
7085 // a predicated block since it will become a fall-through, although we
7086 // may decide in the future to call TTI for all branches.
7087 }
7088 case Instruction::PHI: {
7089 auto *Phi = cast<PHINode>(I);
7090
7091 // First-order recurrences are replaced by vector shuffles inside the loop.
7092 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7093 SmallVector<int> Mask(VF.getKnownMinValue());
7094 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
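// For example, at VF = 4 this produces the splice mask <3, 4, 5, 6>: the last
// element of the previous iteration's vector followed by the first three
// elements of the current one.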
7095 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7096 cast<VectorType>(VectorTy), Mask, CostKind,
7097 VF.getKnownMinValue() - 1);
7098 }
7099
7100 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7101 // converted into select instructions. We require N - 1 selects per phi
7102 // node, where N is the number of incoming values.
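// For example, a phi merging three incoming values (N = 3) lowers to two
// vector selects, so its cost is twice the cost of one vector select.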
7103 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7104 return (Phi->getNumIncomingValues() - 1) *
7105 TTI.getCmpSelInstrCost(
7106 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7107 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7108 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7109
7110 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7111 }
7112 case Instruction::UDiv:
7113 case Instruction::SDiv:
7114 case Instruction::URem:
7115 case Instruction::SRem:
7116 if (VF.isVector() && isPredicatedInst(I)) {
7117 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7118 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7119 ScalarCost : SafeDivisorCost;
7120 }
7121 // We've proven all lanes safe to speculate, fall through.
7122 [[fallthrough]];
7123 case Instruction::Add:
7124 case Instruction::FAdd:
7125 case Instruction::Sub:
7126 case Instruction::FSub:
7127 case Instruction::Mul:
7128 case Instruction::FMul:
7129 case Instruction::FDiv:
7130 case Instruction::FRem:
7131 case Instruction::Shl:
7132 case Instruction::LShr:
7133 case Instruction::AShr:
7134 case Instruction::And:
7135 case Instruction::Or:
7136 case Instruction::Xor: {
7137 // Since we will replace the stride by 1 the multiplication should go away.
7138 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7139 return 0;
7140
7141 // Detect reduction patterns
7142 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7143 return *RedCost;
7144
7145 // Certain instructions can be cheaper to vectorize if they have a constant
7146 // second vector operand. One example of this are shifts on x86.
7147 Value *Op2 = I->getOperand(1);
7148 auto Op2Info = TTI.getOperandInfo(Op2);
7149 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7150 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7151
7152 SmallVector<const Value *, 4> Operands(I->operand_values());
7153 return TTI.getArithmeticInstrCost(
7154 I->getOpcode(), VectorTy, CostKind,
7155 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7156 Op2Info, Operands, I);
7157 }
7158 case Instruction::FNeg: {
7159 return TTI.getArithmeticInstrCost(
7160 I->getOpcode(), VectorTy, CostKind,
7161 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7162 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7163 I->getOperand(0), I);
7164 }
7165 case Instruction::Select: {
7166 SelectInst *SI = cast<SelectInst>(I);
7167 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7168 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7169
7170 const Value *Op0, *Op1;
7171 using namespace llvm::PatternMatch;
7172 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7173 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7174 // select x, y, false --> x & y
7175 // select x, true, y --> x | y
7176 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7177 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7178 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7179        Op1->getType()->getScalarSizeInBits() == 1);
7180
7181 SmallVector<const Value *, 2> Operands{Op0, Op1};
7182 return TTI.getArithmeticInstrCost(
7183 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7184 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7185 }
7186
7187 Type *CondTy = SI->getCondition()->getType();
7188 if (!ScalarCond)
7189 CondTy = VectorType::get(CondTy, VF);
7190
7191 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7192 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7193 Pred = Cmp->getPredicate();
7194 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7195 CostKind, I);
7196 }
7197 case Instruction::ICmp:
7198 case Instruction::FCmp: {
7199 Type *ValTy = I->getOperand(0)->getType();
7200 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7201 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7202 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7203 VectorTy = ToVectorTy(ValTy, VF);
7204 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7205 cast<CmpInst>(I)->getPredicate(), CostKind,
7206 I);
7207 }
7208 case Instruction::Store:
7209 case Instruction::Load: {
7210 ElementCount Width = VF;
7211 if (Width.isVector()) {
7212 InstWidening Decision = getWideningDecision(I, Width);
7213 assert(Decision != CM_Unknown &&
7214        "CM decision should be taken at this point");
7215 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7216 return InstructionCost::getInvalid();
7217 if (Decision == CM_Scalarize)
7218 Width = ElementCount::getFixed(1);
7219 }
7220 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7221 return getMemoryInstructionCost(I, VF);
7222 }
7223 case Instruction::BitCast:
7224 if (I->getType()->isPointerTy())
7225 return 0;
7226 [[fallthrough]];
7227 case Instruction::ZExt:
7228 case Instruction::SExt:
7229 case Instruction::FPToUI:
7230 case Instruction::FPToSI:
7231 case Instruction::FPExt:
7232 case Instruction::PtrToInt:
7233 case Instruction::IntToPtr:
7234 case Instruction::SIToFP:
7235 case Instruction::UIToFP:
7236 case Instruction::Trunc:
7237 case Instruction::FPTrunc: {
7238 // Computes the CastContextHint from a Load/Store instruction.
7239 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7240 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7241        "Expected a load or a store!");
7242
7243 if (VF.isScalar() || !TheLoop->contains(I))
7244 return TTI::CastContextHint::Normal;
7245
7246 switch (getWideningDecision(I, VF)) {
7247 case LoopVectorizationCostModel::CM_GatherScatter:
7248 return TTI::CastContextHint::GatherScatter;
7249 case LoopVectorizationCostModel::CM_Interleave:
7250 return TTI::CastContextHint::Interleave;
7251 case LoopVectorizationCostModel::CM_Scalarize:
7252 case LoopVectorizationCostModel::CM_Widen:
7253 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7254 : TTI::CastContextHint::Normal;
7255 case LoopVectorizationCostModel::CM_Widen_Reverse:
7256 return TTI::CastContextHint::Reversed;
7257 case LoopVectorizationCostModel::CM_Unknown:
7258 llvm_unreachable("Instr did not go through cost modelling?")::llvm::llvm_unreachable_internal("Instr did not go through cost modelling?"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 7258)
;
7259 }
7260
7261 llvm_unreachable("Unhandled case!")::llvm::llvm_unreachable_internal("Unhandled case!", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7261)
;
7262 };
7263
7264 unsigned Opcode = I->getOpcode();
7265 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7266 // For Trunc, the context is the only user, which must be a StoreInst.
7267 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7268 if (I->hasOneUse())
7269 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7270 CCH = ComputeCCH(Store);
7271 }
7272 // For Z/Sext, the context is the operand, which must be a LoadInst.
7273 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7274 Opcode == Instruction::FPExt) {
7275 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7276 CCH = ComputeCCH(Load);
7277 }
7278
7279 // We optimize the truncation of induction variables having constant
7280 // integer steps. The cost of these truncations is the same as the scalar
7281 // operation.
7282 if (isOptimizableIVTruncate(I, VF)) {
7283 auto *Trunc = cast<TruncInst>(I);
7284 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7285 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7286 }
7287
7288 // Detect reduction patterns
7289 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7290 return *RedCost;
7291
7292 Type *SrcScalarTy = I->getOperand(0)->getType();
7293 Type *SrcVecTy =
7294 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7295 if (canTruncateToMinimalBitwidth(I, VF)) {
7296 // This cast is going to be shrunk. This may remove the cast or it might
7297 // turn it into slightly different cast. For example, if MinBW == 16,
7298 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7299 //
7300 // Calculate the modified src and dest types.
7301 Type *MinVecTy = VectorTy;
7302 if (Opcode == Instruction::Trunc) {
7303 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7304 VectorTy =
7305 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7306 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7307 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7308 VectorTy =
7309 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7310 }
7311 }
7312
7313 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7314 }
7315 case Instruction::Call: {
7316 if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7317 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7318 return *RedCost;
7319 bool NeedToScalarize;
7320 CallInst *CI = cast<CallInst>(I);
7321 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7322 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7323 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7324 return std::min(CallCost, IntrinsicCost);
7325 }
7326 return CallCost;
7327 }
7328 case Instruction::ExtractValue:
7329 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7330 case Instruction::Alloca:
7331 // We cannot easily widen alloca to a scalable alloca, as
7332 // the result would need to be a vector of pointers.
7333 if (VF.isScalable())
7334 return InstructionCost::getInvalid();
7335 [[fallthrough]];
7336 default:
7337 // This opcode is unknown. Assume that it is the same as 'mul'.
7338 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7339 } // end of switch.
7340}
7341
7342char LoopVectorize::ID = 0;
7343
7344static const char lv_name[] = "Loop Vectorization";
7345
7346INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7347INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7348INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7349INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7350INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7351INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7352INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7353INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7354INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7355INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7356INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7357INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7358INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7359INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7360INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7361
7362namespace llvm {
7363
7364Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7365
7366Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7367 bool VectorizeOnlyWhenForced) {
7368 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7369}
7370
7371} // end namespace llvm
7372
7373void LoopVectorizationCostModel::collectValuesToIgnore() {
7374 // Ignore ephemeral values.
7375 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7376
7377 // Find all stores to invariant variables. Since they are going to sink
7378 // outside the loop, we do not need to calculate the cost for them.
7379 for (BasicBlock *BB : TheLoop->blocks())
7380 for (Instruction &I : *BB) {
7381 StoreInst *SI;
7382 if ((SI = dyn_cast<StoreInst>(&I)) &&
7383 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7384 ValuesToIgnore.insert(&I);
7385 }
7386
7387 // Ignore type-promoting instructions we identified during reduction
7388 // detection.
7389 for (const auto &Reduction : Legal->getReductionVars()) {
7390 const RecurrenceDescriptor &RedDes = Reduction.second;
7391 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7392 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7393 }
7394 // Ignore type-casting instructions we identified during induction
7395 // detection.
7396 for (const auto &Induction : Legal->getInductionVars()) {
7397 const InductionDescriptor &IndDes = Induction.second;
7398 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7399 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7400 }
7401}
7402
7403void LoopVectorizationCostModel::collectInLoopReductions() {
7404 for (const auto &Reduction : Legal->getReductionVars()) {
7405 PHINode *Phi = Reduction.first;
7406 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7407
7408 // We don't collect reductions that are type promoted (yet).
7409 if (RdxDesc.getRecurrenceType() != Phi->getType())
7410 continue;
7411
7412 // If the target would prefer this reduction to happen "in-loop", then we
7413 // want to record it as such.
7414 unsigned Opcode = RdxDesc.getOpcode();
7415 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7416 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7417 TargetTransformInfo::ReductionFlags()))
7418 continue;
7419
7420 // Check that we can correctly put the reductions into the loop, by
7421 // finding the chain of operations that leads from the phi to the loop
7422 // exit value.
7423 SmallVector<Instruction *, 4> ReductionOperations =
7424 RdxDesc.getReductionOpChain(Phi, TheLoop);
7425 bool InLoop = !ReductionOperations.empty();
7426 if (InLoop) {
7427 InLoopReductionChains[Phi] = ReductionOperations;
7428 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7429 Instruction *LastChain = Phi;
7430 for (auto *I : ReductionOperations) {
7431 InLoopReductionImmediateChains[I] = LastChain;
7432 LastChain = I;
7433 }
7434 }
7435 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7436                   << " reduction for phi: " << *Phi << "\n");
7437 }
7438}
7439
7440// TODO: we could return a pair of values that specify the max VF and
7441// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7442// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7443// doesn't have a cost model that can choose which plan to execute if
7444// more than one is generated.
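// For example, with 256-bit fixed-width vector registers and a widest
// in-loop type of i32, determineVPlanVF below returns 256 / 32 = 8.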
7445static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7446 LoopVectorizationCostModel &CM) {
7447 unsigned WidestType;
7448 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7449 return WidestVectorRegBits / WidestType;
7450}
7451
7452VectorizationFactor
7453LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7454 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7455 ElementCount VF = UserVF;
7456 // Outer loop handling: They may require CFG and instruction level
7457 // transformations before even evaluating whether vectorization is profitable.
7458 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7459 // the vectorization pipeline.
7460 if (!OrigLoop->isInnermost()) {
7461 // If the user doesn't provide a vectorization factor, determine a
7462 // reasonable one.
7463 if (UserVF.isZero()) {
7464 VF = ElementCount::getFixed(determineVPlanVF(
7465 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7466 .getFixedValue(),
7467 CM));
7468 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7469
7470 // Make sure we have a VF > 1 for stress testing.
7471 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7472 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7473                   << "overriding computed VF.\n");
7474 VF = ElementCount::getFixed(4);
7475 }
7476 }
7477 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7478 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7479        "VF needs to be a power of two");
7480 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7481                   << "VF " << VF << " to build VPlans.\n");
7482 buildVPlans(VF, VF);
7483
7484 // For VPlan build stress testing, we bail out after VPlan construction.
7485 if (VPlanBuildStressTest)
7486 return VectorizationFactor::Disabled();
7487
7488 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7489 }
7490
7491 LLVM_DEBUG(
7492     dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7493               "VPlan-native path.\n");
7494 return VectorizationFactor::Disabled();
7495}
7496
7497std::optional<VectorizationFactor>
7498LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7499 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7500 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7501 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7502 return std::nullopt;
7503
7504 // Invalidate interleave groups if all blocks of loop will be predicated.
7505 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7506 !useMaskedInterleavedAccesses(*TTI)) {
7507 LLVM_DEBUG(
7508     dbgs()
7509     << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7510        "which requires masked-interleaved support.\n");
7511 if (CM.InterleaveInfo.invalidateGroups())
7512 // Invalidating interleave groups also requires invalidating all decisions
7513 // based on them, which includes widening decisions and uniform and scalar
7514 // values.
7515 CM.invalidateCostModelingDecisions();
7516 }
7517
7518 ElementCount MaxUserVF =
7519 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7520 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7521 if (!UserVF.isZero() && UserVFIsLegal) {
7522 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7523        "VF needs to be a power of two");
7524 // Collect the instructions (and their associated costs) that will be more
7525 // profitable to scalarize.
7526 if (CM.selectUserVectorizationFactor(UserVF)) {
7527 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7528 CM.collectInLoopReductions();
7529 buildVPlansWithVPRecipes(UserVF, UserVF);
7530 LLVM_DEBUG(printPlans(dbgs()));
7531 return {{UserVF, 0, 0}};
7532 } else
7533 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7534 "InvalidCost", ORE, OrigLoop);
7535 }
7536
7537 // Populate the set of Vectorization Factor Candidates.
7538 ElementCountSet VFCandidates;
7539 for (auto VF = ElementCount::getFixed(1);
7540 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7541 VFCandidates.insert(VF);
7542 for (auto VF = ElementCount::getScalable(1);
7543 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7544 VFCandidates.insert(VF);
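// For example, with a fixed MaxVF of 16 and no scalable VFs available, this
// builds the candidate set {1, 2, 4, 8, 16}.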
7545
7546 for (const auto &VF : VFCandidates) {
7547 // Collect Uniform and Scalar instructions after vectorization with VF.
7548 CM.collectUniformsAndScalars(VF);
7549
7550 // Collect the instructions (and their associated costs) that will be more
7551 // profitable to scalarize.
7552 if (VF.isVector())
7553 CM.collectInstsToScalarize(VF);
7554 }
7555
7556 CM.collectInLoopReductions();
7557 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7558 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7559
7560 LLVM_DEBUG(printPlans(dbgs()));
7561 if (!MaxFactors.hasVector())
7562 return VectorizationFactor::Disabled();
7563
7564 // Select the optimal vectorization factor.
7565 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7566 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7567 return VF;
7568}
7569
7570VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7571 assert(count_if(VPlans,
7572                 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7573            1 &&
7574        "Best VF has not a single VPlan.");
7575
7576 for (const VPlanPtr &Plan : VPlans) {
7577 if (Plan->hasVF(VF))
7578 return *Plan.get();
7579 }
7580 llvm_unreachable("No plan found!")::llvm::llvm_unreachable_internal("No plan found!", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7580)
;
7581}
7582
7583static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7584 SmallVector<Metadata *, 4> MDs;
7585 // Reserve first location for self reference to the LoopID metadata node.
7586 MDs.push_back(nullptr);
7587 bool IsUnrollMetadata = false;
7588 MDNode *LoopID = L->getLoopID();
7589 if (LoopID) {
7590 // First find existing loop unrolling disable metadata.
7591 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7592 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7593 if (MD) {
7594 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7595 IsUnrollMetadata =
7596 S && S->getString().startswith("llvm.loop.unroll.disable");
7597 }
7598 MDs.push_back(LoopID->getOperand(i));
7599 }
7600 }
7601
7602 if (!IsUnrollMetadata) {
7603 // Add runtime unroll disable metadata.
7604 LLVMContext &Context = L->getHeader()->getContext();
7605 SmallVector<Metadata *, 1> DisableOperands;
7606 DisableOperands.push_back(
7607 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7608 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7609 MDs.push_back(DisableNode);
7610 MDNode *NewLoopID = MDNode::get(Context, MDs);
7611 // Set operand 0 to refer to the loop id itself.
7612 NewLoopID->replaceOperandWith(0, NewLoopID);
7613 L->setLoopID(NewLoopID);
7614 }
7615}
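// After this runs on a loop with no existing unroll metadata, the loop's
// !llvm.loop node ends up looking roughly like:
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// (illustrative shape only; the exact operands depend on the hints already
// attached to the loop).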
7616
7617void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7618 VPlan &BestVPlan,
7619 InnerLoopVectorizer &ILV,
7620 DominatorTree *DT,
7621 bool IsEpilogueVectorization) {
7622 assert(BestVPlan.hasVF(BestVF) &&
7623        "Trying to execute plan with unsupported VF");
7624 assert(BestVPlan.hasUF(BestUF) &&
7625        "Trying to execute plan with unsupported UF");
7626
7627 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7628                   << '\n');
7629
7630 // Workaround! Compute the trip count of the original loop and cache it
7631 // before we start modifying the CFG. This code has a systemic problem
7632 // wherein it tries to run analysis over partially constructed IR; this is
7633 // wrong, and not simply for SCEV. The trip count of the original loop
7634 // simply happens to be prone to hitting this in practice. In theory, we
7635 // can hit the same issue for any SCEV, or ValueTracking query done during
7636 // mutation. See PR49900.
7637 ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7638
7639 if (!IsEpilogueVectorization)
7640 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7641
7642 // Perform the actual loop transformation.
7643
7644 // 1. Set up the skeleton for vectorization, including vector pre-header and
7645 // middle block. The vector loop is created during VPlan execution.
7646 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7647 Value *CanonicalIVStartValue;
7648 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7649 ILV.createVectorizedLoopSkeleton();
7650
7651 // Only use noalias metadata when using memory checks guaranteeing no overlap
7652 // across all iterations.
7653 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7654 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7655 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7656
7657 // We currently don't use LoopVersioning for the actual loop cloning but we
7658 // still use it to add the noalias metadata.
7659 // TODO: Find a better way to re-use LoopVersioning functionality to add
7660 // metadata.
7661 State.LVer = std::make_unique<LoopVersioning>(
7662 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7663 PSE.getSE());
7664 State.LVer->prepareNoAliasMetadata();
7665 }
7666
7667 ILV.collectPoisonGeneratingRecipes(State);
7668
7669 ILV.printDebugTracesAtStart();
7670
7671 //===------------------------------------------------===//
7672 //
7673 // Notice: any optimization or new instruction that goes
7674 // into the code below should also be implemented in
7675 // the cost-model.
7676 //
7677 //===------------------------------------------------===//
7678
7679 // 2. Copy and widen instructions from the old loop into the new loop.
7680 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7681 ILV.getOrCreateVectorTripCount(nullptr),
7682 CanonicalIVStartValue, State,
7683 IsEpilogueVectorization);
7684
7685 BestVPlan.execute(&State);
7686
7687 // Keep all loop hints from the original loop on the vector loop (we'll
7688 // replace the vectorizer-specific hints below).
7689 MDNode *OrigLoopID = OrigLoop->getLoopID();
7690
7691 std::optional<MDNode *> VectorizedLoopID =
7692 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7693 LLVMLoopVectorizeFollowupVectorized});
7694
7695 VPBasicBlock *HeaderVPBB =
7696 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7697 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7698 if (VectorizedLoopID)
7699 L->setLoopID(*VectorizedLoopID);
7700 else {
7701 // Keep all loop hints from the original loop on the vector loop (we'll
7702 // replace the vectorizer-specific hints below).
7703 if (MDNode *LID = OrigLoop->getLoopID())
7704 L->setLoopID(LID);
7705
7706 LoopVectorizeHints Hints(L, true, *ORE);
7707 Hints.setAlreadyVectorized();
7708 }
7709 AddRuntimeUnrollDisableMetaData(L);
7710
7711 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7712 // predication, updating analyses.
7713 ILV.fixVectorizedLoop(State, BestVPlan);
7714
7715 ILV.printDebugTracesAtEnd();
7716}
7717
7718#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7719void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7720 for (const auto &Plan : VPlans)
7721 if (PrintVPlansInDotFormat)
7722 Plan->printDOT(O);
7723 else
7724 Plan->print(O);
7725}
7726#endif
7727
7728Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7729
7730//===--------------------------------------------------------------------===//
7731// EpilogueVectorizerMainLoop
7732//===--------------------------------------------------------------------===//
7733
7734/// This function is partially responsible for generating the control flow
7735/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7736std::pair<BasicBlock *, Value *>
7737EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7738 createVectorLoopSkeleton("");
7739
7740 // Generate the code to check the minimum iteration count of the vector
7741 // epilogue (see below).
7742 EPI.EpilogueIterationCountCheck =
7743 emitIterationCountCheck(LoopScalarPreHeader, true);
7744 EPI.EpilogueIterationCountCheck->setName("iter.check");
7745
7746 // Generate the code to check any assumptions that we've made for SCEV
7747 // expressions.
7748 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7749
7750 // Generate the code that checks at runtime if arrays overlap. We put the
7751 // checks into a separate block to make the more common case of few elements
7752 // faster.
7753 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7754
7755 // Generate the iteration count check for the main loop, *after* the check
7756 // for the epilogue loop, so that the path-length is shorter for the case
7757 // that goes directly through the vector epilogue. The longer-path length for
7758 // the main loop is compensated for by the gain from vectorizing the larger
7759 // trip count. Note: the branch will get updated later on when we vectorize
7760 // the epilogue.
7761 EPI.MainLoopIterationCountCheck =
7762 emitIterationCountCheck(LoopScalarPreHeader, false);
7763
7764 // Generate the induction variable.
7765 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7766
7767 // Skip induction resume value creation here because they will be created in
7768 // the second pass for the scalar loop. The induction resume values for the
7769 // inductions in the epilogue loop are created before executing the plan for
7770 // the epilogue loop.
7771
7772 return {completeLoopSkeleton(), nullptr};
7773}
7774
7775void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7776 LLVM_DEBUG({
7777   dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7778          << "Main Loop VF:" << EPI.MainLoopVF
7779          << ", Main Loop UF:" << EPI.MainLoopUF
7780          << ", Epilogue Loop VF:" << EPI.EpilogueVF
7781          << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7782 });
7783}
7784
7785void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7786 DEBUG_WITH_TYPE(VerboseDebug, {
7787   dbgs() << "intermediate fn:\n"
7788          << *OrigLoop->getHeader()->getParent() << "\n";
7789 });
7790}
7791
7792BasicBlock *
7793EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7794 bool ForEpilogue) {
7795 assert(Bypass && "Expected valid bypass basic block.");
7796 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7797 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7798 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7799 // Reuse existing vector loop preheader for TC checks.
7800 // Note that new preheader block is generated for vector loop.
7801 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7802 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7803
7804 // Generate code to check if the loop's trip count is less than VF * UF of the
7805 // main vector loop.
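// For example, with a main loop VF of 8 and UF of 2, the bypass to the scalar
// loop is taken whenever the trip count is below 16 (or 16 or fewer when a
// scalar epilogue is required).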
7806 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7807 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7808
7809 Value *CheckMinIters = Builder.CreateICmp(
7810 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7811 "min.iters.check");
7812
7813 if (!ForEpilogue)
7814 TCCheckBlock->setName("vector.main.loop.iter.check");
7815
7816 // Create new preheader for vector loop.
7817 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7818 DT, LI, nullptr, "vector.ph");
7819
7820 if (ForEpilogue) {
7821 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7822                              DT->getNode(Bypass)->getIDom()) &&
7823        "TC check is expected to dominate Bypass");
7824
7825 // Update dominator for Bypass & LoopExit.
7826 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7827 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7828 // For loops with multiple exits, there's no edge from the middle block
7829 // to exit blocks (as the epilogue must run) and thus no need to update
7830 // the immediate dominator of the exit blocks.
7831 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7832
7833 LoopBypassBlocks.push_back(TCCheckBlock);
7834
7835 // Save the trip count so we don't have to regenerate it in the
7836 // vec.epilog.iter.check. This is safe to do because the trip count
7837 // generated here dominates the vector epilog iter check.
7838 EPI.TripCount = Count;
7839 }
7840
7841 ReplaceInstWithInst(
7842 TCCheckBlock->getTerminator(),
7843 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7844
7845 return TCCheckBlock;
7846}
7847
7848//===--------------------------------------------------------------------===//
7849// EpilogueVectorizerEpilogueLoop
7850//===--------------------------------------------------------------------===//
7851
7852/// This function is partially responsible for generating the control flow
7853/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7854std::pair<BasicBlock *, Value *>
7855EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7856 createVectorLoopSkeleton("vec.epilog.");
7857
7858 // Now, compare the remaining count and if there aren't enough iterations to
7859 // execute the vectorized epilogue, skip to the scalar part.
7860 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7861 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7862 LoopVectorPreHeader =
7863 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7864 LI, nullptr, "vec.epilog.ph");
7865 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7866 VecEpilogueIterationCountCheck);
7867
7868 // Adjust the control flow taking the state info from the main loop
7869 // vectorization into account.
7870 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7871        "expected this to be saved from the previous pass.");
7872 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7873 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7874
7875 DT->changeImmediateDominator(LoopVectorPreHeader,
7876 EPI.MainLoopIterationCountCheck);
7877
7878 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7879 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7880
7881 if (EPI.SCEVSafetyCheck)
7882 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7883 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7884 if (EPI.MemSafetyCheck)
7885 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7886 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7887
7888 DT->changeImmediateDominator(
7889 VecEpilogueIterationCountCheck,
7890 VecEpilogueIterationCountCheck->getSinglePredecessor());
7891
7892 DT->changeImmediateDominator(LoopScalarPreHeader,
7893 EPI.EpilogueIterationCountCheck);
7894 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7895 // If there is an epilogue which must run, there's no edge from the
7896 // middle block to exit blocks and thus no need to update the immediate
7897 // dominator of the exit blocks.
7898 DT->changeImmediateDominator(LoopExitBlock,
7899 EPI.EpilogueIterationCountCheck);
7900
7901 // Keep track of bypass blocks, as they feed start values to the induction and
7902 // reduction phis in the scalar loop preheader.
7903 if (EPI.SCEVSafetyCheck)
7904 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7905 if (EPI.MemSafetyCheck)
7906 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7907 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7908
7909 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7910 // reductions which merge control-flow from the latch block and the middle
7911 // block. Update the incoming values here and move the Phi into the preheader.
7912 SmallVector<PHINode *, 4> PhisInBlock;
7913 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7914 PhisInBlock.push_back(&Phi);
7915
7916 for (PHINode *Phi : PhisInBlock) {
7917 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7918 Phi->replaceIncomingBlockWith(
7919 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7920 VecEpilogueIterationCountCheck);
7921
7922 // If the phi doesn't have an incoming value from the
7923 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7924 // value and also those from other check blocks. This is needed for
7925 // reduction phis only.
7926 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7927 return EPI.EpilogueIterationCountCheck == IncB;
7928 }))
7929 continue;
7930 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7931 if (EPI.SCEVSafetyCheck)
7932 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7933 if (EPI.MemSafetyCheck)
7934 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7935 }
7936
7937 // Generate a resume induction for the vector epilogue and put it in the
7938 // vector epilogue preheader
7939 Type *IdxTy = Legal->getWidestInductionType();
7940 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7941 LoopVectorPreHeader->getFirstNonPHI());
7942 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7943 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7944 EPI.MainLoopIterationCountCheck);
7945
7946 // Generate induction resume values. These variables save the new starting
7947 // indexes for the scalar loop. They are used to test if there are any tail
7948 // iterations left once the vector loop has completed.
7949 // Note that when the vectorized epilogue is skipped due to iteration count
7950 // check, then the resume value for the induction variable comes from
7951 // the trip count of the main vector loop, hence passing the AdditionalBypass
7952 // argument.
7953 createInductionResumeValues({VecEpilogueIterationCountCheck,
7954 EPI.VectorTripCount} /* AdditionalBypass */);
7955
7956 return {completeLoopSkeleton(), EPResumeVal};
7957}
7958
7959BasicBlock *
7960EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7961 BasicBlock *Bypass, BasicBlock *Insert) {
7962
7963 assert(EPI.TripCount &&
7964        "Expected trip count to have been saved in the first pass.");
7965 assert(
7966     (!isa<Instruction>(EPI.TripCount) ||
7967      DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7968     "saved trip count does not dominate insertion point.");
7969 Value *TC = EPI.TripCount;
7970 IRBuilder<> Builder(Insert->getTerminator());
7971 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7972
7973 // Generate code to check if the loop's trip count is less than VF * UF of the
7974 // vector epilogue loop.
7975 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7976 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7977
7978 Value *CheckMinIters =
7979 Builder.CreateICmp(P, Count,
7980 createStepForVF(Builder, Count->getType(),
7981 EPI.EpilogueVF, EPI.EpilogueUF),
7982 "min.epilog.iters.check");
7983
7984 ReplaceInstWithInst(
7985 Insert->getTerminator(),
7986 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7987
7988 LoopBypassBlocks.push_back(Insert);
7989 return Insert;
7990}
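
The check built above computes n.vec.remaining = TripCount - VectorTripCount and branches to the scalar loop (Bypass) when fewer than EpilogueVF * EpilogueUF iterations are left after the main vector loop (or no more than that many, when a scalar epilogue is required). A standalone sketch of the arithmetic, using made-up trip counts rather than the LLVM API:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical values, for illustration only.
  uint64_t TripCount = 1003;       // EPI.TripCount
  uint64_t VectorTripCount = 992;  // iterations consumed by the main vector loop
  uint64_t EpilogueVF = 4, EpilogueUF = 2;
  bool RequiresScalarEpilogue = false;

  uint64_t Remaining = TripCount - VectorTripCount; // "n.vec.remaining"
  uint64_t Step = EpilogueVF * EpilogueUF;          // createStepForVF equivalent
  // ICMP_ULE when a scalar epilogue must still run afterwards, ICMP_ULT otherwise.
  bool SkipEpilogueLoop =
      RequiresScalarEpilogue ? (Remaining <= Step) : (Remaining < Step);

  std::cout << "remaining=" << Remaining
            << " skip epilogue=" << SkipEpilogueLoop << "\n"; // remaining=11 skip epilogue=0
}
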
7991
7992void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7993  LLVM_DEBUG({
7994    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7995           << "Epilogue Loop VF:" << EPI.EpilogueVF
7996           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7997  });
7998}
7999
8000void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8001  DEBUG_WITH_TYPE(VerboseDebug, {
8002    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8003  });
8004}
8005
8006bool LoopVectorizationPlanner::getDecisionAndClampRange(
8007 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8008  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8009 bool PredicateAtRangeStart = Predicate(Range.Start);
8010
8011 for (ElementCount TmpVF = Range.Start * 2;
8012 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8013 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8014 Range.End = TmpVF;
8015 break;
8016 }
8017
8018 return PredicateAtRangeStart;
8019}
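
getDecisionAndClampRange evaluates the predicate at Range.Start and shrinks Range.End to the first power-of-two VF at which the decision flips, so one VPlan can serve every VF that decides the same way. A small standalone model of that behaviour (fixed-width VFs only, assumed to be powers of two; not the LLVM API):

#include <functional>
#include <iostream>

// Standalone model of getDecisionAndClampRange. Start/End delimit a half-open
// range of power-of-two VFs; End is clamped at the first differing decision.
static bool decideAndClamp(const std::function<bool(unsigned)> &Pred,
                           unsigned Start, unsigned &End) {
  bool AtStart = Pred(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    if (Pred(VF) != AtStart) {
      End = VF; // clamp the range at the first VF that decides differently
      break;
    }
  return AtStart;
}

int main() {
  unsigned End = 16; // exclusive
  // Hypothetical predicate: "widening is profitable" only for VF < 8.
  bool Decision = decideAndClamp([](unsigned VF) { return VF < 8; }, 2, End);
  std::cout << Decision << " " << End << "\n"; // prints "1 8"
}
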
8020
8021/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8022/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8023/// of VF's starting at a given VF and extending it as much as possible. Each
8024/// vectorization decision can potentially shorten this sub-range during
8025/// buildVPlan().
8026void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8027 ElementCount MaxVF) {
8028 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8029 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8030 VFRange SubRange = {VF, MaxVFPlusOne};
8031 VPlans.push_back(buildVPlan(SubRange));
8032 VF = SubRange.End;
8033 }
8034}
8035
8036VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8037 VPlanPtr &Plan) {
8038  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8039
8040 // Look for cached value.
8041 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8042 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8043 if (ECEntryIt != EdgeMaskCache.end())
8044 return ECEntryIt->second;
8045
8046 VPValue *SrcMask = createBlockInMask(Src, Plan);
8047
8048 // The terminator has to be a branch inst!
8049 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8050  assert(BI && "Unexpected terminator found");
8051
8052 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8053 return EdgeMaskCache[Edge] = SrcMask;
8054
8055 // If source is an exiting block, we know the exit edge is dynamically dead
8056 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8057 // adding uses of an otherwise potentially dead instruction.
8058 if (OrigLoop->isLoopExiting(Src))
8059 return EdgeMaskCache[Edge] = SrcMask;
8060
8061 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8062  assert(EdgeMask && "No Edge Mask found for condition");
8063
8064 if (BI->getSuccessor(0) != Dst)
8065 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8066
8067 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8068 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8069 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8070 // The select version does not introduce new UB if SrcMask is false and
8071 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8072 VPValue *False = Plan->getOrAddVPValue(
8073 ConstantInt::getFalse(BI->getCondition()->getType()));
8074 EdgeMask =
8075 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8076 }
8077
8078 return EdgeMaskCache[Edge] = EdgeMask;
8079}
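
Ignoring the poison/UB subtleties the comment above describes, the mask algebra of createEdgeMask can be modelled with plain booleans, where "no mask" (nullptr) stands for the all-one mask of an unconditionally executed block. A standalone sketch under those assumptions, not the VPlan code:

#include <optional>
#include <iostream>

// std::nullopt plays the role of the "all-one" (no-op) mask. The edge mask is
// the branch condition (negated for the false successor), restricted by the
// source block's mask, mirroring 'select i1 SrcMask, i1 EdgeMask, i1 false'.
static std::optional<bool> edgeMask(std::optional<bool> SrcMask, bool Cond,
                                    bool EdgeIsTrueSuccessor) {
  bool Edge = EdgeIsTrueSuccessor ? Cond : !Cond;
  if (!SrcMask) // source block executes unconditionally: edge mask is just the condition
    return Edge;
  return *SrcMask && Edge; // select(SrcMask, Edge, false)
}

int main() {
  std::cout << *edgeMask(std::nullopt, true, false) << "\n"; // 0: negated condition
  std::cout << *edgeMask(false, true, true) << "\n";         // 0: source block masked off
}
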
8080
8081VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8082  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
  2. Assuming the condition is true
  3. '?' condition is true
8083
8084 // Look for cached value.
8085 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8086 if (BCEntryIt != BlockMaskCache.end())
  4. Taking false branch
8087    return BCEntryIt->second;
8088
8089 // All-one mask is modelled as no-mask following the convention for masked
8090 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091 VPValue *BlockMask = nullptr;
8092
8093 if (OrigLoop->getHeader() == BB) {
  5. Assuming the condition is false
  6. Taking false branch
8094 if (!CM.blockNeedsPredicationForAnyReason(BB))
8095 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8096
8097    assert(CM.foldTailByMasking() && "must fold the tail");
8098
8099 // If we're using the active lane mask for control flow, then we get the
8100 // mask from the active lane mask PHI that is cached in the VPlan.
8101 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8102 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8103 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8104
8105 // Introduce the early-exit compare IV <= BTC to form header block mask.
8106 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8107 // constructing the desired canonical IV in the header block as its first
8108 // non-phi instructions.
8109
8110 VPBasicBlock *HeaderVPBB =
8111 Plan->getVectorLoopRegion()->getEntryBasicBlock();
8112 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8113 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8114 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8115
8116 VPBuilder::InsertPointGuard Guard(Builder);
8117 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8118 if (EmitGetActiveLaneMask != PredicationStyle::None) {
8119 VPValue *TC = Plan->getOrCreateTripCount();
8120 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8121 nullptr, "active.lane.mask");
8122 } else {
8123 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8124 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8125 }
8126 return BlockMaskCache[BB] = BlockMask;
8127 }
8128
8129 // This is the block mask. We OR all incoming edges.
8130 for (auto *Predecessor : predecessors(BB)) {
8131 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8132 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
  7. Assuming 'EdgeMask' is non-null
  8. Taking false branch
 11. Assuming 'EdgeMask' is non-null
 12. Taking false branch
 23. Assuming 'EdgeMask' is null
 24. Taking true branch
8133      return BlockMaskCache[BB] = EdgeMask;
 25. Potential leak of memory pointed to by 'BlockMask'
8134
8135    if (!BlockMask) { // BlockMask has its initialized nullptr value.
  8.1. 'BlockMask' is null
 12.1. 'BlockMask' is non-null
  9. Taking true branch
 13. Taking false branch
8136      BlockMask = EdgeMask;
8137      continue;
 10. Execution continues on line 8130
8138    }
8139
8140    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
 14. Calling 'VPBuilder::createOr'
 22. Returned allocated memory
8141  }
8142
8143 return BlockMaskCache[BB] = BlockMask;
8144}
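
The analyzer's path through this function is: for a block with several predecessors, earlier loop iterations combine edge masks via Builder.createOr (step 22, "Returned allocated memory"); when a later predecessor yields a null (all-one) edge mask, the early return at line 8133 caches that null value, and the previously built BlockMask is no longer reachable from this function, which is what step 25 reports. Whether anything actually leaks depends on whether the VPBuilder has already inserted the created instruction into a VPBasicBlock, which the checker cannot see. A minimal standalone sketch of the ownership pattern being flagged (illustrative types, not the VPlan classes):

#include <vector>

struct Node { int id; };

// An object allocated in earlier loop iterations ('Combined') is dropped when
// a later iteration takes the early-return path, just as in createBlockInMask.
static Node *combineMasks(const std::vector<Node *> &EdgeMasks) {
  Node *Combined = nullptr;
  for (Node *Edge : EdgeMasks) {
    if (!Edge)
      return nullptr; // early return: 'Combined' from prior iterations is abandoned
    if (!Combined) {
      Combined = Edge;
      continue;
    }
    Combined = new Node{Combined->id | Edge->id}; // freshly allocated, like createOr
  }
  return Combined;
}

int main() {
  Node A{1}, B{2};
  std::vector<Node *> Masks{&A, &B, nullptr};
  combineMasks(Masks);
}

Running this never frees the Node allocated in the second iteration, mirroring the situation reported at line 8133.
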
8145
8146VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8147 ArrayRef<VPValue *> Operands,
8148 VFRange &Range,
8149 VPlanPtr &Plan) {
8150  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8151         "Must be called with either a load or store");
8152
8153 auto willWiden = [&](ElementCount VF) -> bool {
8154 LoopVectorizationCostModel::InstWidening Decision =
8155 CM.getWideningDecision(I, VF);
8156    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8157           "CM decision should be taken at this point.");
8158 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8159 return true;
8160 if (CM.isScalarAfterVectorization(I, VF) ||
8161 CM.isProfitableToScalarize(I, VF))
8162 return false;
8163 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8164 };
8165
8166 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8167 return nullptr;
8168
8169 VPValue *Mask = nullptr;
8170 if (Legal->isMaskRequired(I))
8171 Mask = createBlockInMask(I->getParent(), Plan);
8172
8173 // Determine if the pointer operand of the access is either consecutive or
8174 // reverse consecutive.
8175 LoopVectorizationCostModel::InstWidening Decision =
8176 CM.getWideningDecision(I, Range.Start);
8177 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8178 bool Consecutive =
8179 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8180
8181 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8182 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8183 Consecutive, Reverse);
8184
8185 StoreInst *Store = cast<StoreInst>(I);
8186 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8187 Mask, Consecutive, Reverse);
8188}
8189
8190/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8191/// insert a recipe to expand the step for the induction recipe.
8192static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8193 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8194 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8195 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8196 // Returns true if an instruction \p I should be scalarized instead of
8197 // vectorized for the chosen vectorization factor.
8198 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8199 return CM.isScalarAfterVectorization(I, VF) ||
8200 CM.isProfitableToScalarize(I, VF);
8201 };
8202
8203 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8204 [&](ElementCount VF) {
8205 return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8206 },
8207 Range);
8208  assert(IndDesc.getStartValue() ==
8209         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8210  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8211         "step must be loop invariant");
8212
8213 VPValue *Step =
8214 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8215 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8216 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8217 !NeedsScalarIVOnly);
8218 }
8219  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8220 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8221 !NeedsScalarIVOnly);
8222}
8223
8224VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8225 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8226
8227 // Check if this is an integer or fp induction. If so, build the recipe that
8228 // produces its scalar and vector values.
8229 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8230 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8231 *PSE.getSE(), *OrigLoop, Range);
8232
8233 // Check if this is pointer induction. If so, build the recipe for it.
8234 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8235 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8236 *PSE.getSE());
8237    assert(isa<SCEVConstant>(II->getStep()));
8238 return new VPWidenPointerInductionRecipe(
8239 Phi, Operands[0], Step, *II,
8240 LoopVectorizationPlanner::getDecisionAndClampRange(
8241 [&](ElementCount VF) {
8242 return CM.isScalarAfterVectorization(Phi, VF);
8243 },
8244 Range));
8245 }
8246 return nullptr;
8247}
8248
8249VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8250 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8251 // Optimize the special case where the source is a constant integer
8252 // induction variable. Notice that we can only optimize the 'trunc' case
8253 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8254 // (c) other casts depend on pointer size.
8255
8256 // Determine whether \p K is a truncation based on an induction variable that
8257 // can be optimized.
8258 auto isOptimizableIVTruncate =
8259 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8260 return [=](ElementCount VF) -> bool {
8261 return CM.isOptimizableIVTruncate(K, VF);
8262 };
8263 };
8264
8265 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8266 isOptimizableIVTruncate(I), Range)) {
8267
8268 auto *Phi = cast<PHINode>(I->getOperand(0));
8269 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8270 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8271 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8272 *PSE.getSE(), *OrigLoop, Range);
8273 }
8274 return nullptr;
8275}
8276
8277VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8278 ArrayRef<VPValue *> Operands,
8279 VPlanPtr &Plan) {
8280 // If all incoming values are equal, the incoming VPValue can be used directly
8281 // instead of creating a new VPBlendRecipe.
8282 if (llvm::all_equal(Operands))
8283 return Operands[0];
8284
8285 unsigned NumIncoming = Phi->getNumIncomingValues();
8286 // For in-loop reductions, we do not need to create an additional select.
8287 VPValue *InLoopVal = nullptr;
8288 for (unsigned In = 0; In < NumIncoming; In++) {
8289 PHINode *PhiOp =
8290 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8291 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8292      assert(!InLoopVal && "Found more than one in-loop reduction!");
8293 InLoopVal = Operands[In];
8294 }
8295 }
8296
8297  assert((!InLoopVal || NumIncoming == 2) &&
8298         "Found an in-loop reduction for PHI with unexpected number of "
8299         "incoming values");
8300 if (InLoopVal)
8301 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8302
8303 // We know that all PHIs in non-header blocks are converted into selects, so
8304 // we don't have to worry about the insertion order and we can just use the
8305 // builder. At this point we generate the predication tree. There may be
8306 // duplications since this is a simple recursive scan, but future
8307 // optimizations will clean it up.
8308 SmallVector<VPValue *, 2> OperandsWithMask;
8309
8310 for (unsigned In = 0; In < NumIncoming; In++) {
8311 VPValue *EdgeMask =
8312 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8313    assert((EdgeMask || NumIncoming == 1) &&
8314           "Multiple predecessors with one having a full mask");
8315 OperandsWithMask.push_back(Operands[In]);
8316 if (EdgeMask)
8317 OperandsWithMask.push_back(EdgeMask);
8318 }
8319 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8320}
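
The OperandsWithMask list built above interleaves each incoming value with the mask of its edge; the resulting VPBlendRecipe is later materialized as a chain of selects. A rough scalar model of that lowering, with made-up values and masks (not the VPlan execution code):

#include <iostream>
#include <utility>
#include <vector>

// Each element pairs an incoming value with the mask of its incoming edge.
// The blend starts from the first value and folds in the rest with selects.
static int blend(const std::vector<std::pair<int, bool>> &OperandsWithMask) {
  int Result = OperandsWithMask.front().first;
  for (size_t In = 1; In < OperandsWithMask.size(); ++In)
    Result = OperandsWithMask[In].second ? OperandsWithMask[In].first : Result;
  return Result;
}

int main() {
  // Incoming value 10 with mask=false, 20 with mask=true -> blend picks 20.
  std::cout << blend({{10, false}, {20, true}}) << "\n";
}
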
8321
8322VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8323 ArrayRef<VPValue *> Operands,
8324 VFRange &Range) const {
8325
8326 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8327 [this, CI](ElementCount VF) {
8328 return CM.isScalarWithPredication(CI, VF);
8329 },
8330 Range);
8331
8332 if (IsPredicated)
8333 return nullptr;
8334
8335 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8336 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8337 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8338 ID == Intrinsic::pseudoprobe ||
8339 ID == Intrinsic::experimental_noalias_scope_decl))
8340 return nullptr;
8341
8342 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8343
8344 // Is it beneficial to perform intrinsic call compared to lib call?
8345 bool ShouldUseVectorIntrinsic =
8346 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8347 [&](ElementCount VF) -> bool {
8348 bool NeedToScalarize = false;
8349 // Is it beneficial to perform intrinsic call compared to lib
8350 // call?
8351 InstructionCost CallCost =
8352 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8353 InstructionCost IntrinsicCost =
8354 CM.getVectorIntrinsicCost(CI, VF);
8355 return IntrinsicCost <= CallCost;
8356 },
8357 Range);
8358 if (ShouldUseVectorIntrinsic)
8359 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8360
8361  // Is it better to call a vectorized version of the function than to
8362  // scalarize the call?
8363 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8364 [&](ElementCount VF) -> bool {
8365 // The following case may be scalarized depending on the VF.
8366 // The flag shows whether we can use a usual Call for vectorized
8367 // version of the instruction.
8368 bool NeedToScalarize = false;
8369 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8370 return !NeedToScalarize;
8371 },
8372 Range);
8373 if (ShouldUseVectorCall)
8374 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8375 Intrinsic::not_intrinsic);
8376
8377 return nullptr;
8378}
8379
8380bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8381  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8382         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8383 // Instruction should be widened, unless it is scalar after vectorization,
8384 // scalarization is profitable or it is predicated.
8385 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8386 return CM.isScalarAfterVectorization(I, VF) ||
8387 CM.isProfitableToScalarize(I, VF) ||
8388 CM.isScalarWithPredication(I, VF);
8389 };
8390 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8391 Range);
8392}
8393
8394VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8395 ArrayRef<VPValue *> Operands,
8396 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8397 switch (I->getOpcode()) {
8398 default:
8399 return nullptr;
8400 case Instruction::SDiv:
8401 case Instruction::UDiv:
8402 case Instruction::SRem:
8403 case Instruction::URem: {
8404 // If not provably safe, use a select to form a safe divisor before widening the
8405 // div/rem operation itself. Otherwise fall through to general handling below.
8406 if (CM.isPredicatedInst(I)) {
8407 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8408 VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8409 VPValue *One =
8410 Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8411 auto *SafeRHS =
8412 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8413 I->getDebugLoc());
8414 VPBB->appendRecipe(SafeRHS);
8415 Ops[1] = SafeRHS;
8416 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8417 }
8418 LLVM_FALLTHROUGH[[fallthrough]];
8419 }
8420 case Instruction::Add:
8421 case Instruction::And:
8422 case Instruction::AShr:
8423 case Instruction::BitCast:
8424 case Instruction::FAdd:
8425 case Instruction::FCmp:
8426 case Instruction::FDiv:
8427 case Instruction::FMul:
8428 case Instruction::FNeg:
8429 case Instruction::FPExt:
8430 case Instruction::FPToSI:
8431 case Instruction::FPToUI:
8432 case Instruction::FPTrunc:
8433 case Instruction::FRem:
8434 case Instruction::FSub:
8435 case Instruction::ICmp:
8436 case Instruction::IntToPtr:
8437 case Instruction::LShr:
8438 case Instruction::Mul:
8439 case Instruction::Or:
8440 case Instruction::PtrToInt:
8441 case Instruction::Select:
8442 case Instruction::SExt:
8443 case Instruction::Shl:
8444 case Instruction::SIToFP:
8445 case Instruction::Sub:
8446 case Instruction::Trunc:
8447 case Instruction::UIToFP:
8448 case Instruction::Xor:
8449 case Instruction::ZExt:
8450 case Instruction::Freeze:
8451 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8452 };
8453}
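
For predicated sdiv/udiv/srem/urem, the code above selects a divisor of 1 on masked-off lanes so the widened division cannot trap; the results of those lanes do not matter because they are masked out downstream. A scalar sketch of the transform, with illustrative values:

#include <cstdint>
#include <iostream>

// Scalar model of the safe-divisor pattern: Select(Mask, Ops[1], One) feeds the
// division, so an inactive lane never divides by a possibly-zero divisor.
static int64_t safeDiv(int64_t Dividend, int64_t Divisor, bool LaneActive) {
  int64_t SafeDivisor = LaneActive ? Divisor : 1; // replaces the divisor on inactive lanes
  int64_t Quotient = Dividend / SafeDivisor;      // the widened div can no longer trap
  return LaneActive ? Quotient : Dividend;        // inactive-lane results are discarded anyway
}

int main() {
  std::cout << safeDiv(42, 7, true) << " " << safeDiv(42, 0, false) << "\n"; // 6 42
}
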
8454
8455void VPRecipeBuilder::fixHeaderPhis() {
8456 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8457 for (VPHeaderPHIRecipe *R : PhisToFix) {
8458 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8459 VPRecipeBase *IncR =
8460 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8461 R->addOperand(IncR->getVPSingleValue());
8462 }
8463}
8464
8465VPBasicBlock *VPRecipeBuilder::handleReplication(
8466 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8467 VPlanPtr &Plan) {
8468 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8469 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8470 Range);
8471
8472 bool IsPredicated = CM.isPredicatedInst(I);
8473
8474 // Even if the instruction is not marked as uniform, there are certain
8475 // intrinsic calls that can be effectively treated as such, so we check for
8476 // them here. Conservatively, we only do this for scalable vectors, since
8477 // for fixed-width VFs we can always fall back on full scalarization.
8478 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8479 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8480 case Intrinsic::assume:
8481 case Intrinsic::lifetime_start:
8482 case Intrinsic::lifetime_end:
8483 // For scalable vectors if one of the operands is variant then we still
8484 // want to mark as uniform, which will generate one instruction for just
8485 // the first lane of the vector. We can't scalarize the call in the same
8486 // way as for fixed-width vectors because we don't know how many lanes
8487 // there are.
8488 //
8489 // The reasons for doing it this way for scalable vectors are:
8490 // 1. For the assume intrinsic generating the instruction for the first
8491      //    lane is still better than not generating any at all. For
8492 // example, the input may be a splat across all lanes.
8493 // 2. For the lifetime start/end intrinsics the pointer operand only
8494 // does anything useful when the input comes from a stack object,
8495 // which suggests it should always be uniform. For non-stack objects
8496 // the effect is to poison the object, which still allows us to
8497 // remove the call.
8498 IsUniform = true;
8499 break;
8500 default:
8501 break;
8502 }
8503 }
8504
8505 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8506 IsUniform, IsPredicated);
8507
8508 // Find if I uses a predicated instruction. If so, it will use its scalar
8509 // value. Avoid hoisting the insert-element which packs the scalar value into
8510 // a vector value, as that happens iff all users use the vector value.
8511 for (VPValue *Op : Recipe->operands()) {
8512 auto *PredR =
8513 dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8514 if (!PredR)
8515 continue;
8516 auto *RepR = cast<VPReplicateRecipe>(
8517 PredR->getOperand(0)->getDefiningRecipe());
8518    assert(RepR->isPredicated() &&
8519           "expected Replicate recipe to be predicated");
8520 RepR->setAlsoPack(false);
8521 }
8522
8523 // Finalize the recipe for Instr, first if it is not predicated.
8524 if (!IsPredicated) {
8525    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8526 setRecipe(I, Recipe);
8527 Plan->addVPValue(I, Recipe);
8528 VPBB->appendRecipe(Recipe);
8529 return VPBB;
8530 }
8531  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8532
8533 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8534  assert(SingleSucc && "VPBB must have a single successor when handling "
8535                       "predicated replication.");
8536 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8537 // Record predicated instructions for above packing optimizations.
8538 VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8539 VPBlockUtils::insertBlockAfter(Region, VPBB);
8540 auto *RegSucc = new VPBasicBlock();
8541 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8542 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8543 return RegSucc;
8544}
8545
8546VPRegionBlock *
8547VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8548 VPlanPtr &Plan) {
8549 Instruction *Instr = PredRecipe->getUnderlyingInstr();
8550 // Instructions marked for predication are replicated and placed under an
8551 // if-then construct to prevent side-effects.
8552 // Generate recipes to compute the block mask for this region.
8553 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
  1. Calling 'VPRecipeBuilder::createBlockInMask'
8554
8555 // Build the triangular if-then region.
8556 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8557  assert(Instr->getParent() && "Predicated instruction not in any basic block");
8558 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8559 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8560 auto *PHIRecipe = Instr->getType()->isVoidTy()
8561 ? nullptr
8562 : new VPPredInstPHIRecipe(PredRecipe);
8563 if (PHIRecipe) {
8564 setRecipe(Instr, PHIRecipe);
8565 Plan->addVPValue(Instr, PHIRecipe);
8566 } else {
8567 setRecipe(Instr, PredRecipe);
8568 Plan->addVPValue(Instr, PredRecipe);
8569 }
8570
8571 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8572 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8573 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8574
8575 // Note: first set Entry as region entry and then connect successors starting
8576 // from it in order, to propagate the "parent" of each VPBasicBlock.
8577 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8578 VPBlockUtils::connectBlocks(Pred, Exiting);
8579
8580 return Region;
8581}
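
The region constructed above is the triangular shape pred.X.entry -> pred.X.if -> pred.X.continue, with the entry block also branching directly to the continue block when the mask is false. A standalone sketch of that block graph (the "udiv" opcode name is just an example):

#include <iostream>
#include <string>
#include <vector>

struct Block { std::string Name; std::vector<Block *> Succs; };

int main() {
  Block Entry{"pred.udiv.entry"}, If{"pred.udiv.if"}, Continue{"pred.udiv.continue"};
  Entry.Succs = {&If, &Continue}; // BranchOnMask: taken -> if, not taken -> continue
  If.Succs = {&Continue};         // the replicated instruction then rejoins
  Block *Blocks[] = {&Entry, &If, &Continue};
  for (Block *B : Blocks) {
    std::cout << B->Name << " ->";
    for (Block *S : B->Succs)
      std::cout << " " << S->Name;
    std::cout << "\n";
  }
}
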
8582
8583VPRecipeOrVPValueTy
8584VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8585 ArrayRef<VPValue *> Operands,
8586 VFRange &Range, VPBasicBlock *VPBB,
8587 VPlanPtr &Plan) {
8588 // First, check for specific widening recipes that deal with inductions, Phi
8589 // nodes, calls and memory operations.
8590 VPRecipeBase *Recipe;
8591 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8592 if (Phi->getParent() != OrigLoop->getHeader())
8593 return tryToBlend(Phi, Operands, Plan);
8594
8595 // Always record recipes for header phis. Later first-order recurrence phis
8596 // can have earlier phis as incoming values.
8597 recordRecipeOf(Phi);
8598
8599 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8600 return toVPRecipeResult(Recipe);
8601
8602 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8603    assert((Legal->isReductionVariable(Phi) ||
8604            Legal->isFixedOrderRecurrence(Phi)) &&
8605           "can only widen reductions and fixed-order recurrences here");
8606 VPValue *StartV = Operands[0];
8607 if (Legal->isReductionVariable(Phi)) {
8608 const RecurrenceDescriptor &RdxDesc =
8609 Legal->getReductionVars().find(Phi)->second;
8610      assert(RdxDesc.getRecurrenceStartValue() ==
8611             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8612 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8613 CM.isInLoopReduction(Phi),
8614 CM.useOrderedReductions(RdxDesc));
8615 } else {
8616 // TODO: Currently fixed-order recurrences are modeled as chains of
8617 // first-order recurrences. If there are no users of the intermediate
8618 // recurrences in the chain, the fixed order recurrence should be modeled
8619 // directly, enabling more efficient codegen.
8620 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8621 }
8622
8623 // Record the incoming value from the backedge, so we can add the incoming
8624 // value from the backedge after all recipes have been created.
8625 auto *Inc = cast<Instruction>(
8626 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8627 auto RecipeIter = Ingredient2Recipe.find(Inc);
8628 if (RecipeIter == Ingredient2Recipe.end())
8629 recordRecipeOf(Inc);
8630
8631 PhisToFix.push_back(PhiRecipe);
8632 return toVPRecipeResult(PhiRecipe);
8633 }
8634
8635 if (isa<TruncInst>(Instr) &&
8636 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8637 Range, *Plan)))
8638 return toVPRecipeResult(Recipe);
8639
8640 // All widen recipes below deal only with VF > 1.
8641 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8642 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8643 return nullptr;
8644
8645 if (auto *CI = dyn_cast<CallInst>(Instr))
8646 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8647
8648 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8649 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8650
8651 if (!shouldWiden(Instr, Range))
8652 return nullptr;
8653
8654 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8655 return toVPRecipeResult(new VPWidenGEPRecipe(
8656 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8657
8658 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8659 bool InvariantCond =
8660 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8661 return toVPRecipeResult(new VPWidenSelectRecipe(
8662 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8663 }
8664
8665 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8666}
8667
8668void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8669 ElementCount MaxVF) {
8670  assert(OrigLoop->isInnermost() && "Inner loop expected.");
8671
8672 // Add assume instructions we need to drop to DeadInstructions, to prevent
8673 // them from being added to the VPlan.
8674  // TODO: We only need to drop assumes in blocks that get flattened. If the
8675 // control flow is preserved, we should keep them.
8676 SmallPtrSet<Instruction *, 4> DeadInstructions;
8677 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8678 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8679
8680 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8681 // Dead instructions do not need sinking. Remove them from SinkAfter.
8682 for (Instruction *I : DeadInstructions)
8683 SinkAfter.erase(I);
8684
8685 // Cannot sink instructions after dead instructions (there won't be any
8686 // recipes for them). Instead, find the first non-dead previous instruction.
8687 for (auto &P : Legal->getSinkAfter()) {
8688 Instruction *SinkTarget = P.second;
8689 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8690 (void)FirstInst;
8691 while (DeadInstructions.contains(SinkTarget)) {
8692      assert(
8693          SinkTarget != FirstInst &&
8694          "Must find a live instruction (at least the one feeding the "
8695          "fixed-order recurrence PHI) before reaching beginning of the block");
8696 SinkTarget = SinkTarget->getPrevNode();
8697      assert(SinkTarget != P.first &&
8698             "sink source equals target, no sinking required");
8699 }
8700 P.second = SinkTarget;
8701 }
8702
8703 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8704 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8705 VFRange SubRange = {VF, MaxVFPlusOne};
8706 VPlans.push_back(
8707 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8708 VF = SubRange.End;
8709 }
8710}
8711
8712// Add the necessary canonical IV and branch recipes required to control the
8713// loop.
8714static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8715 bool HasNUW,
8716 bool UseLaneMaskForLoopControlFlow) {
8717 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8718 auto *StartV = Plan.getOrAddVPValue(StartIdx);
8719
8720 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8721 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8722 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8723 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8724 Header->insert(CanonicalIVPHI, Header->begin());
8725
8726 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8727 // IV by VF * UF.
8728 auto *CanonicalIVIncrement =
8729 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8730 : VPInstruction::CanonicalIVIncrement,
8731 {CanonicalIVPHI}, DL, "index.next");
8732 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8733
8734 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8735 EB->appendRecipe(CanonicalIVIncrement);
8736
8737 if (UseLaneMaskForLoopControlFlow) {
8738 // Create the active lane mask instruction in the vplan preheader.
8739 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8740
8741 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8742 // we have to take unrolling into account. Each part needs to start at
8743 // Part * VF
8744 auto *CanonicalIVIncrementParts =
8745 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8746 : VPInstruction::CanonicalIVIncrementForPart,
8747 {StartV}, DL, "index.part.next");
8748 Preheader->appendRecipe(CanonicalIVIncrementParts);
8749
8750 // Create the ActiveLaneMask instruction using the correct start values.
8751 VPValue *TC = Plan.getOrCreateTripCount();
8752 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8753 {CanonicalIVIncrementParts, TC}, DL,
8754 "active.lane.mask.entry");
8755 Preheader->appendRecipe(EntryALM);
8756
8757 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8758 // preheader ActiveLaneMask instruction.
8759 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8760 Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8761
8762 // Create the active lane mask for the next iteration of the loop.
8763 CanonicalIVIncrementParts =
8764 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8765 : VPInstruction::CanonicalIVIncrementForPart,
8766 {CanonicalIVIncrement}, DL);
8767 EB->appendRecipe(CanonicalIVIncrementParts);
8768
8769 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8770 {CanonicalIVIncrementParts, TC}, DL,
8771 "active.lane.mask.next");
8772 EB->appendRecipe(ALM);
8773 LaneMaskPhi->addOperand(ALM);
8774
8775 // We have to invert the mask here because a true condition means jumping
8776 // to the exit block.
8777 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8778 EB->appendRecipe(NotMask);
8779
8780 VPInstruction *BranchBack =
8781 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8782 EB->appendRecipe(BranchBack);
8783 } else {
8784 // Add the BranchOnCount VPInstruction to the latch.
8785 VPInstruction *BranchBack = new VPInstruction(
8786 VPInstruction::BranchOnCount,
8787 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8788 EB->appendRecipe(BranchBack);
8789 }
8790}
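
In the non-lane-mask case, loop control reduces to a canonical induction variable starting at 0, incremented by VF * UF each iteration, and a BranchOnCount that exits once the increment reaches the vector trip count. A scalar model with made-up values:

#include <cstdint>
#include <iostream>

int main() {
  // Illustrative values only: VF * UF = 8, vector trip count = 32.
  uint64_t VFxUF = 8, VectorTripCount = 32;
  uint64_t Index = 0; // VPCanonicalIVPHIRecipe starting at 0
  unsigned Iterations = 0;
  do {
    // ... the vector body would process lanes [Index, Index + VFxUF) here ...
    Index += VFxUF; // CanonicalIVIncrement ("index.next")
    ++Iterations;
  } while (Index != VectorTripCount); // BranchOnCount: loop back while not equal
  std::cout << Iterations << "\n"; // 4
}
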
8791
8792// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8793// original exit block.
8794static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8795 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8796 VPlan &Plan) {
8797 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8798 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8799 // Only handle single-exit loops with unique exit blocks for now.
8800 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8801 return;
8802
8803 // Introduce VPUsers modeling the exit values.
8804 for (PHINode &ExitPhi : ExitBB->phis()) {
8805 Value *IncomingValue =
8806 ExitPhi.getIncomingValueForBlock(ExitingBB);
8807 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8808 Plan.addLiveOut(&ExitPhi, V);
8809 }
8810}
8811
8812VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8813 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8814 const MapVector<Instruction *, Instruction *> &SinkAfter) {
8815
8816 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8817
8818 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8819
8820 // ---------------------------------------------------------------------------
8821 // Pre-construction: record ingredients whose recipes we'll need to further
8822 // process after constructing the initial VPlan.
8823 // ---------------------------------------------------------------------------
8824
8825 // Mark instructions we'll need to sink later and their targets as
8826 // ingredients whose recipe we'll need to record.
8827 for (const auto &Entry : SinkAfter) {
8828 RecipeBuilder.recordRecipeOf(Entry.first);
8829 RecipeBuilder.recordRecipeOf(Entry.second);
8830 }
8831 for (const auto &Reduction : CM.getInLoopReductionChains()) {
8832 PHINode *Phi = Reduction.first;
8833 RecurKind Kind =
8834 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8835 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8836
8837 RecipeBuilder.recordRecipeOf(Phi);
8838 for (const auto &R : ReductionOperations) {
8839 RecipeBuilder.recordRecipeOf(R);
8840 // For min/max reductions, where we have a pair of icmp/select, we also
8841 // need to record the ICmp recipe, so it can be removed later.
8842      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8843             "Only min/max recurrences allowed for inloop reductions");
8844 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8845 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8846 }
8847 }
8848
8849 // For each interleave group which is relevant for this (possibly trimmed)
8850 // Range, add it to the set of groups to be later applied to the VPlan and add
8851 // placeholders for its members' Recipes which we'll be replacing with a
8852 // single VPInterleaveRecipe.
8853 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8854 auto applyIG = [IG, this](ElementCount VF) -> bool {
8855 return (VF.isVector() && // Query is illegal for VF == 1
8856 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8857 LoopVectorizationCostModel::CM_Interleave);
8858 };
8859 if (!getDecisionAndClampRange(applyIG, Range))
8860 continue;
8861 InterleaveGroups.insert(IG);
8862 for (unsigned i = 0; i < IG->getFactor(); i++)
8863 if (Instruction *Member = IG->getMember(i))
8864 RecipeBuilder.recordRecipeOf(Member);
8865 };
8866
8867 // ---------------------------------------------------------------------------
8868 // Build initial VPlan: Scan the body of the loop in a topological order to
8869 // visit each basic block after having visited its predecessor basic blocks.
8870 // ---------------------------------------------------------------------------
8871
8872 // Create initial VPlan skeleton, starting with a block for the pre-header,
8873 // followed by a region for the vector loop, followed by the middle block. The
8874 // skeleton vector loop region contains a header and latch block.
8875 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8876 auto Plan = std::make_unique<VPlan>(Preheader);
8877
8878 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8879 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8880 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8881 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8882 VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8883 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8884 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8885
8886 Instruction *DLInst =
8887 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8888 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8889 DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8890 !CM.foldTailByMasking(),
8891 CM.useActiveLaneMaskForControlFlow());
8892
8893 // Scan the body of the loop in a topological order to visit each basic block
8894 // after having visited its predecessor basic blocks.
8895 LoopBlocksDFS DFS(OrigLoop);
8896 DFS.perform(LI);
8897
8898 VPBasicBlock *VPBB = HeaderVPBB;
8899 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8900 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8901 // Relevant instructions from basic block BB will be grouped into VPRecipe
8902 // ingredients and fill a new VPBasicBlock.
8903 unsigned VPBBsForBB = 0;
8904 if (VPBB != HeaderVPBB)
8905 VPBB->setName(BB->getName());
8906 Builder.setInsertPoint(VPBB);
8907
8908 // Introduce each ingredient into VPlan.
8909 // TODO: Model and preserve debug intrinsics in VPlan.
8910 for (Instruction &I : BB->instructionsWithoutDebug()) {
8911 Instruction *Instr = &I;
8912
8913 // First filter out irrelevant instructions, to ensure no recipes are
8914 // built for them.
8915 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8916 continue;
8917
8918 SmallVector<VPValue *, 4> Operands;
8919 auto *Phi = dyn_cast<PHINode>(Instr);
8920 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8921 Operands.push_back(Plan->getOrAddVPValue(
8922 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8923 } else {
8924 auto OpRange = Plan->mapToVPValues(Instr->operands());
8925 Operands = {OpRange.begin(), OpRange.end()};
8926 }
8927
8928 // Invariant stores inside the loop will be deleted, and a single store
8929 // with the final reduction value will be added to the exit block.
8930 StoreInst *SI;
8931 if ((SI = dyn_cast<StoreInst>(&I)) &&
8932 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8933 continue;
8934
8935 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8936 Instr, Operands, Range, VPBB, Plan)) {
8937 // If Instr can be simplified to an existing VPValue, use it.
8938 if (RecipeOrValue.is<VPValue *>()) {
8939 auto *VPV = RecipeOrValue.get<VPValue *>();
8940 Plan->addVPValue(Instr, VPV);
8941 // If the re-used value is a recipe, register the recipe for the
8942 // instruction, in case the recipe for Instr needs to be recorded.
8943 if (VPRecipeBase *R = VPV->getDefiningRecipe())
8944 RecipeBuilder.setRecipe(Instr, R);
8945 continue;
8946 }
8947 // Otherwise, add the new recipe.
8948 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8949 for (auto *Def : Recipe->definedValues()) {
8950 auto *UV = Def->getUnderlyingValue();
8951 Plan->addVPValue(UV, Def);
8952 }
8953
8954 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8955 HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8956 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8957 // of the header block. That can happen for truncates of induction
8958 // variables. Those recipes are moved to the phi section of the header
8959 // block after applying SinkAfter, which relies on the original
8960 // position of the trunc.
8961 assert(isa<TruncInst>(Instr));
8962 InductionsToMove.push_back(
8963 cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8964 }
8965 RecipeBuilder.setRecipe(Instr, Recipe);
8966 VPBB->appendRecipe(Recipe);
8967 continue;
8968 }
8969
8970 // Otherwise, if all widening options failed, the instruction is to be
8971 // replicated. This may create a successor for VPBB.
8972 VPBasicBlock *NextVPBB =
8973 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8974 if (NextVPBB != VPBB) {
8975 VPBB = NextVPBB;
8976 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8977 : "");
8978 }
8979 }
8980
8981 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8982 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8983 }
8984
8985 // After here, VPBB should not be used.
8986 VPBB = nullptr;
8987
8988 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8989
8990 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8991 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8992 "entry block must be set to a VPRegionBlock having a non-empty entry "
8993 "VPBasicBlock");
8994 RecipeBuilder.fixHeaderPhis();
8995
8996 // ---------------------------------------------------------------------------
8997 // Transform initial VPlan: Apply previously taken decisions, in order, to
8998 // bring the VPlan to its final state.
8999 // ---------------------------------------------------------------------------
9000
9001 // Apply Sink-After legal constraints.
9002 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9003 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9004 if (Region && Region->isReplicator()) {
9005 assert(Region->getNumSuccessors() == 1 &&
9006 Region->getNumPredecessors() == 1 && "Expected SESE region!");
9007 assert(R->getParent()->size() == 1 &&
9008 "A recipe in an original replicator region must be the only "
9009 "recipe in its block");
9010 return Region;
9011 }
9012 return nullptr;
9013 };
9014 for (const auto &Entry : SinkAfter) {
9015 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9016 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9017
9018 auto *TargetRegion = GetReplicateRegion(Target);
9019 auto *SinkRegion = GetReplicateRegion(Sink);
9020 if (!SinkRegion) {
9021 // If the sink source is not a replicate region, sink the recipe directly.
9022 if (TargetRegion) {
9023 // The target is in a replication region, make sure to move Sink to
9024 // the block after it, not into the replication region itself.
9025 VPBasicBlock *NextBlock =
9026 cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9027 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9028 } else
9029 Sink->moveAfter(Target);
9030 continue;
9031 }
9032
9033 // The sink source is in a replicate region. Unhook the region from the CFG.
9034 auto *SinkPred = SinkRegion->getSinglePredecessor();
9035 auto *SinkSucc = SinkRegion->getSingleSuccessor();
9036 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9037 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9038 VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9039
9040 if (TargetRegion) {
9041 // The target recipe is also in a replicate region, move the sink region
9042 // after the target region.
9043 auto *TargetSucc = TargetRegion->getSingleSuccessor();
9044 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9045 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9046 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9047 } else {
9048 // The sink source is in a replicate region, we need to move the whole
9049 // replicate region, which should only contain a single recipe in the
9050 // main block.
9051 auto *SplitBlock =
9052 Target->getParent()->splitAt(std::next(Target->getIterator()));
9053
9054 auto *SplitPred = SplitBlock->getSinglePredecessor();
9055
9056 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9057 VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9058 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9059 }
9060 }
9061
9062 VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9063 VPlanTransforms::removeRedundantInductionCasts(*Plan);
9064
9065 // Now that sink-after is done, move induction recipes for optimized truncates
9066 // to the phi section of the header block.
9067 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9068 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9069
9070 // Adjust the recipes for any inloop reductions.
9071 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9072 RecipeBuilder, Range.Start);
9073
9074 // Introduce a recipe to combine the incoming and previous values of a
9075 // fixed-order recurrence.
9076 for (VPRecipeBase &R :
9077 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9078 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9079 if (!RecurPhi)
9080 continue;
9081
9082 VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9083 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9084 // to terminate.
9085 while (auto *PrevPhi =
9086 dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9087 PrevRecipe = &PrevPhi->getBackedgeRecipe();
9088 VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9089 auto *Region = GetReplicateRegion(PrevRecipe);
9090 if (Region)
9091 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9092 if (!InsertBlock) {
9093 InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9094 VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9095 }
9096 if (Region || PrevRecipe->isPhi())
9097 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9098 else
9099 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9100
9101 auto *RecurSplice = cast<VPInstruction>(
9102 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9103 {RecurPhi, RecurPhi->getBackedgeValue()}));
9104
9105 RecurPhi->replaceAllUsesWith(RecurSplice);
9106 // Set the first operand of RecurSplice to RecurPhi again, after replacing
9107 // all users.
9108 RecurSplice->setOperand(0, RecurPhi);
9109 }
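// NOTE (illustrative sketch, assuming VF = 4; not part of the original source):
// conceptually, the FirstOrderRecurrenceSplice created above concatenates the
// last lane of the recurrence's previous value with the first VF-1 lanes of the
// current value. With a previous vector <x0,x1,x2,x3> and a current vector
// <x4,x5,x6,x7>, the splice yields <x3,x4,x5,x6>, i.e. each lane observes the
// "value from one iteration ago" that the scalar recurrence would have seen.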
9110
9111 // Interleave memory: for each Interleave Group we marked earlier as relevant
9112 // for this VPlan, replace the Recipes widening its memory instructions with a
9113 // single VPInterleaveRecipe at its insertion point.
9114 for (const auto *IG : InterleaveGroups) {
9115 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9116 RecipeBuilder.getRecipe(IG->getInsertPos()));
9117 SmallVector<VPValue *, 4> StoredValues;
9118 for (unsigned i = 0; i < IG->getFactor(); ++i)
9119 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9120 auto *StoreR =
9121 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9122 StoredValues.push_back(StoreR->getStoredValue());
9123 }
9124
9125 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9126 Recipe->getMask());
9127 VPIG->insertBefore(Recipe);
9128 unsigned J = 0;
9129 for (unsigned i = 0; i < IG->getFactor(); ++i)
9130 if (Instruction *Member = IG->getMember(i)) {
9131 if (!Member->getType()->isVoidTy()) {
9132 VPValue *OriginalV = Plan->getVPValue(Member);
9133 Plan->removeVPValueFor(Member);
9134 Plan->addVPValue(Member, VPIG->getVPValue(J));
9135 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9136 J++;
9137 }
9138 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9139 }
9140 }
9141
9142 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9143 VF *= 2)
9144 Plan->addVF(VF);
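// NOTE (illustrative example, assumed values; not part of the original source):
// the loop above doubles VF from Range.Start while it stays below Range.End.
// Assuming Range.Start = 4 and Range.End = 32, the plan is registered for
// VFs 4, 8 and 16; Range.End itself is excluded by isKnownLT.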
9145 Plan->setName("Initial VPlan");
9146
9147 // From this point onwards, VPlan-to-VPlan transformations may change the plan
9148 // in ways that make looking up values via their original IR values incorrect.
9149 Plan->disableValue2VPValue();
9150
9151 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9152 VPlanTransforms::removeDeadRecipes(*Plan);
9153
9154 bool ShouldSimplify = true;
9155 while (ShouldSimplify) {
9156 ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9157 ShouldSimplify |=
9158 VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9159 ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9160 }
9161
9162 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9163 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9164
9165 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9166 return Plan;
9167}
9168
9169VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9170 // Outer loop handling: They may require CFG and instruction level
9171 // transformations before even evaluating whether vectorization is profitable.
9172 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9173 // the vectorization pipeline.
9174 assert(!OrigLoop->isInnermost());
9175 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9176
9177 // Create new empty VPlan
9178 auto Plan = std::make_unique<VPlan>();
9179
9180 // Build hierarchical CFG
9181 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9182 HCFGBuilder.buildHierarchicalCFG();
9183
9184 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9185 VF *= 2)
9186 Plan->addVF(VF);
9187
9188 SmallPtrSet<Instruction *, 1> DeadInstructions;
9189 VPlanTransforms::VPInstructionsToVPRecipes(
9190 OrigLoop, Plan,
9191 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9192 DeadInstructions, *PSE.getSE(), *TLI);
9193
9194 // Remove the existing terminator of the exiting block of the top-most region.
9195 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9196 auto *Term =
9197 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9198 Term->eraseFromParent();
9199
9200 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9201 true, CM.useActiveLaneMaskForControlFlow());
9202 return Plan;
9203}
9204
9205// Adjust the recipes for reductions. For in-loop reductions the chain of
9206// instructions leading from the loop exit instr to the phi needs to be converted
9207// to reductions, with one operand being vector and the other being the scalar
9208// reduction chain. For other reductions, a select is introduced between the phi
9209// and live-out recipes when folding the tail.
9210void LoopVectorizationPlanner::adjustRecipesForReductions(
9211 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9212 ElementCount MinVF) {
9213 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9214 PHINode *Phi = Reduction.first;
9215 const RecurrenceDescriptor &RdxDesc =
9216 Legal->getReductionVars().find(Phi)->second;
9217 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9218
9219 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9220 continue;
9221
9222 // ReductionOperations are ordered top-down from the phi's use to the
9223 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9224 // which of the two operands will remain scalar and which will be reduced.
9225 // For minmax the chain will be the select instructions.
9226 Instruction *Chain = Phi;
9227 for (Instruction *R : ReductionOperations) {
9228 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9229 RecurKind Kind = RdxDesc.getRecurrenceKind();
9230
9231 VPValue *ChainOp = Plan->getVPValue(Chain);
9232 unsigned FirstOpId;
9233 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9234 "Only min/max recurrences allowed for inloop reductions");
9235 // Recognize a call to the llvm.fmuladd intrinsic.
9236 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9237 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9238 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9239 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9240 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9241 "Expected to replace a VPWidenSelectSC");
9242 FirstOpId = 1;
9243 } else {
9244 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9245 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9246 "Expected to replace a VPWidenSC");
9247 FirstOpId = 0;
9248 }
9249 unsigned VecOpId =
9250 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9251 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9252
9253 VPValue *CondOp = nullptr;
9254 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9255 VPBuilder::InsertPointGuard Guard(Builder);
9256 Builder.setInsertPoint(WidenRecipe->getParent(),
9257 WidenRecipe->getIterator());
9258 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9259 }
9260
9261 if (IsFMulAdd) {
9262 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9263 // need to create an fmul recipe to use as the vector operand for the
9264 // fadd reduction.
9265 VPInstruction *FMulRecipe = new VPInstruction(
9266 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9267 FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9268 WidenRecipe->getParent()->insert(FMulRecipe,
9269 WidenRecipe->getIterator());
9270 VecOp = FMulRecipe;
9271 }
9272 VPReductionRecipe *RedRecipe =
9273 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9274 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9275 Plan->removeVPValueFor(R);
9276 Plan->addVPValue(R, RedRecipe);
9277 // Append the recipe to the end of the VPBasicBlock because we need to
9278 // ensure that it comes after all of its inputs, including CondOp.
9279 WidenRecipe->getParent()->appendRecipe(RedRecipe);
9280 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9281 WidenRecipe->eraseFromParent();
9282
9283 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9284 VPRecipeBase *CompareRecipe =
9285 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9286 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9287 "Expected to replace a VPWidenSC");
9288 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9289 "Expected no remaining users");
9290 CompareRecipe->eraseFromParent();
9291 }
9292 Chain = R;
9293 }
9294 }
9295
9296 // If tail is folded by masking, introduce selects between the phi
9297 // and the live-out instruction of each reduction, at the beginning of the
9298 // dedicated latch block.
9299 if (CM.foldTailByMasking()) {
9300 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9301 for (VPRecipeBase &R :
9302 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9303 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9304 if (!PhiR || PhiR->isInLoop())
9305 continue;
9306 VPValue *Cond =
9307 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9308 VPValue *Red = PhiR->getBackedgeValue();
9309 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9310 "reduction recipe must be defined before latch");
9311 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9312 }
9313 }
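// NOTE (illustrative sketch, not part of the original source): with tail
// folding, the select created above computes select(header-mask, backedge-value,
// phi), so lanes belonging to iterations beyond the trip count keep the phi's
// previous value and do not perturb the final reduction result.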
9314}
9315
9316#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9317void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9318 VPSlotTracker &SlotTracker) const {
9319 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9320 IG->getInsertPos()->printAsOperand(O, false);
9321 O << ", ";
9322 getAddr()->printAsOperand(O, SlotTracker);
9323 VPValue *Mask = getMask();
9324 if (Mask) {
9325 O << ", ";
9326 Mask->printAsOperand(O, SlotTracker);
9327 }
9328
9329 unsigned OpIdx = 0;
9330 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9331 if (!IG->getMember(i))
9332 continue;
9333 if (getNumStoreOperands() > 0) {
9334 O << "\n" << Indent << " store ";
9335 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9336 O << " to index " << i;
9337 } else {
9338 O << "\n" << Indent << " ";
9339 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9340 O << " = load from index " << i;
9341 }
9342 ++OpIdx;
9343 }
9344}
9345#endif
9346
9347void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9348 assert(!State.Instance && "Int or FP induction being replicated.");
9349
9350 Value *Start = getStartValue()->getLiveInIRValue();
9351 const InductionDescriptor &ID = getInductionDescriptor();
9352 TruncInst *Trunc = getTruncInst();
9353 IRBuilderBase &Builder = State.Builder;
9354 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9355 assert(State.VF.isVector() && "must have vector VF");
9356
9357 // The value from the original loop to which we are mapping the new induction
9358 // variable.
9359 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9360
9361 // Fast-math-flags propagate from the original induction instruction.
9362 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9363 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9364 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9365
9366 // Now do the actual transformations, and start with fetching the step value.
9367 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9368
9369 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9370 "Expected either an induction phi-node or a truncate of it!");
9371
9372 // Construct the initial value of the vector IV in the vector loop preheader
9373 auto CurrIP = Builder.saveIP();
9374 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9375 Builder.SetInsertPoint(VectorPH->getTerminator());
9376 if (isa<TruncInst>(EntryVal)) {
9377 assert(Start->getType()->isIntegerTy() &&
9378 "Truncation requires an integer type");
9379 auto *TruncType = cast<IntegerType>(EntryVal->getType());
9380 Step = Builder.CreateTrunc(Step, TruncType);
9381 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9382 }
9383
9384 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9385 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9386 Value *SteppedStart = getStepVector(
9387 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9388
9389 // We create vector phi nodes for both integer and floating-point induction
9390 // variables. Here, we determine the kind of arithmetic we will perform.
9391 Instruction::BinaryOps AddOp;
9392 Instruction::BinaryOps MulOp;
9393 if (Step->getType()->isIntegerTy()) {
9394 AddOp = Instruction::Add;
9395 MulOp = Instruction::Mul;
9396 } else {
9397 AddOp = ID.getInductionOpcode();
9398 MulOp = Instruction::FMul;
9399 }
9400
9401 // Multiply the vectorization factor by the step using integer or
9402 // floating-point arithmetic as appropriate.
9403 Type *StepType = Step->getType();
9404 Value *RuntimeVF;
9405 if (Step->getType()->isFloatingPointTy())
9406 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9407 else
9408 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9409 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9410
9411 // Create a vector splat to use in the induction update.
9412 //
9413 // FIXME: If the step is non-constant, we create the vector splat with
9414 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9415 // handle a constant vector splat.
9416 Value *SplatVF = isa<Constant>(Mul)
9417 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9418 : Builder.CreateVectorSplat(State.VF, Mul);
9419 Builder.restoreIP(CurrIP);
9420
9421 // We may need to add the step a number of times, depending on the unroll
9422 // factor. The last of those goes into the PHI.
9423 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9424 &*State.CFG.PrevBB->getFirstInsertionPt());
9425 VecInd->setDebugLoc(EntryVal->getDebugLoc());
9426 Instruction *LastInduction = VecInd;
9427 for (unsigned Part = 0; Part < State.UF; ++Part) {
9428 State.set(this, LastInduction, Part);
9429
9430 if (isa<TruncInst>(EntryVal))
9431 State.addMetadata(LastInduction, EntryVal);
9432
9433 LastInduction = cast<Instruction>(
9434 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9435 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9436 }
9437
9438 LastInduction->setName("vec.ind.next");
9439 VecInd->addIncoming(SteppedStart, VectorPH);
9440 // Add induction update using an incorrect block temporarily. The phi node
9441 // will be fixed after VPlan execution. Note that at this point the latch
9442 // block cannot be used, as it does not exist yet.
9443 // TODO: Model increment value in VPlan, by turning the recipe into a
9444 // multi-def and a subclass of VPHeaderPHIRecipe.
9445 VecInd->addIncoming(LastInduction, VectorPH);
9446}
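// NOTE (illustrative example, assuming Start = 0, Step = 1, VF = 4, UF = 2;
// not part of the original source): SteppedStart is <0,1,2,3> and SplatVF is a
// splat of VF*Step = 4, so the recipe yields <0,1,2,3> for part 0, <4,5,6,7>
// for part 1, and feeds vec.ind.next = <8,9,10,11> back into the vec.ind phi
// for the next vector iteration.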
9447
9448void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9449 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9450 "Not a pointer induction according to InductionDescriptor!");
9451 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9452 "Unexpected type.");
9453
9454 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9455 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9456
9457 if (onlyScalarsGenerated(State.VF)) {
9458 // This is the normalized GEP that starts counting at zero.
9459 Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9460 CanonicalIV, IndDesc.getStep()->getType());
9461 // Determine the number of scalars we need to generate for each unroll
9462 // iteration. If the instruction is uniform, we only need to generate the
9463 // first lane. Otherwise, we generate all VF values.
9464 bool IsUniform = vputils::onlyFirstLaneUsed(this);
9465 assert((IsUniform || !State.VF.isScalable()) &&
9466 "Cannot scalarize a scalable VF");
9467 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9468
9469 for (unsigned Part = 0; Part < State.UF; ++Part) {
9470 Value *PartStart =
9471 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9472
9473 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9474 Value *Idx = State.Builder.CreateAdd(
9475 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9476 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9477
9478 Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9479 Value *SclrGep = emitTransformedIndex(
9480 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9481 SclrGep->setName("next.gep");
9482 State.set(this, SclrGep, VPIteration(Part, Lane));
9483 }
9484 }
9485 return;
9486 }
9487
9488 assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9489 "Induction step not a SCEV constant!");
9490 Type *PhiType = IndDesc.getStep()->getType();
9491
9492 // Build a pointer phi
9493 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9494 Type *ScStValueType = ScalarStartValue->getType();
9495 PHINode *NewPointerPhi =
9496 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9497
9498 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9499 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9500
9501 // A pointer induction, performed by using a gep
9502 Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9503
9504 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9505 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9506 Value *NumUnrolledElems =
9507 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9508 Value *InductionGEP = GetElementPtrInst::Create(
9509 IndDesc.getElementType(), NewPointerPhi,
9510 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9511 InductionLoc);
9512 // Add induction update using an incorrect block temporarily. The phi node
9513 // will be fixed after VPlan execution. Note that at this point the latch
9514 // block cannot be used, as it does not exist yet.
9515 // TODO: Model increment value in VPlan, by turning the recipe into a
9516 // multi-def and a subclass of VPHeaderPHIRecipe.
9517 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9518
9519 // Create UF many actual address geps that use the pointer
9520 // phi as base and a vectorized version of the step value
9521 // (<step*0, ..., step*N>) as offset.
9522 for (unsigned Part = 0; Part < State.UF; ++Part) {
9523 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9524 Value *StartOffsetScalar =
9525 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9526 Value *StartOffset =
9527 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9528 // Create a vector of consecutive numbers from zero to VF.
9529 StartOffset = State.Builder.CreateAdd(
9530 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9531
9532 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9533 "scalar step must be the same across all parts");
9534 Value *GEP = State.Builder.CreateGEP(
9535 IndDesc.getElementType(), NewPointerPhi,
9536 State.Builder.CreateMul(
9537 StartOffset,
9538 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9539 "vector.gep"));
9540 State.set(this, GEP, Part);
9541 }
9542}
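// NOTE (illustrative example, assuming a non-scalarized pointer IV with VF = 4,
// UF = 2 and a scalar step of 1 element; not part of the original source):
// pointer.phi starts at the scalar start value; part 0 uses element offsets
// <0,1,2,3> and part 1 uses <4,5,6,7> via the vector GEPs, while ptr.ind
// advances the phi by VF*UF = 8 elements per vector iteration.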
9543
9544void VPDerivedIVRecipe::execute(VPTransformState &State) {
9545 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9546
9547 // Fast-math-flags propagate from the original induction instruction.
9548 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9549 if (IndDesc.getInductionBinOp() &&
9550 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9551 State.Builder.setFastMathFlags(
9552 IndDesc.getInductionBinOp()->getFastMathFlags());
9553
9554 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9555 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9556 Value *DerivedIV =
9557 emitTransformedIndex(State.Builder, CanonicalIV,
9558 getStartValue()->getLiveInIRValue(), Step, IndDesc);
9559 DerivedIV->setName("offset.idx");
9560 if (ResultTy != DerivedIV->getType()) {
9561 assert(Step->getType()->isIntegerTy() &&
9562 "Truncation requires an integer step");
9563 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9564 }
9565 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9566
9567 State.set(this, DerivedIV, VPIteration(0, 0));
9568}
9569
9570void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9571 // Fast-math-flags propagate from the original induction instruction.
9572 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9573 if (IndDesc.getInductionBinOp() &&
9574 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9575 State.Builder.setFastMathFlags(
9576 IndDesc.getInductionBinOp()->getFastMathFlags());
9577
9578 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9579 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9580
9581 buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9582}
9583
9584void VPInterleaveRecipe::execute(VPTransformState &State) {
9585 assert(!State.Instance && "Interleave group being replicated.");
9586 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9587 getStoredValues(), getMask());
9588}
9589
9590void VPReductionRecipe::execute(VPTransformState &State) {
9591 assert(!State.Instance && "Reduction being replicated.");
9592 Value *PrevInChain = State.get(getChainOp(), 0);
9593 RecurKind Kind = RdxDesc->getRecurrenceKind();
9594 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9595 // Propagate the fast-math flags carried by the underlying instruction.
9596 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9597 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9598 for (unsigned Part = 0; Part < State.UF; ++Part) {
9599 Value *NewVecOp = State.get(getVecOp(), Part);
9600 if (VPValue *Cond = getCondOp()) {
9601 Value *NewCond = State.get(Cond, Part);
9602 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9603 Value *Iden = RdxDesc->getRecurrenceIdentity(
9604 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9605 Value *IdenVec =
9606 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9607 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9608 NewVecOp = Select;
9609 }
9610 Value *NewRed;
9611 Value *NextInChain;
9612 if (IsOrdered) {
9613 if (State.VF.isVector())
9614 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9615 PrevInChain);
9616 else
9617 NewRed = State.Builder.CreateBinOp(
9618 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9619 NewVecOp);
9620 PrevInChain = NewRed;
9621 } else {
9622 PrevInChain = State.get(getChainOp(), Part);
9623 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9624 }
9625 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9626 NextInChain =
9627 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9628 NewRed, PrevInChain);
9629 } else if (IsOrdered)
9630 NextInChain = NewRed;
9631 else
9632 NextInChain = State.Builder.CreateBinOp(
9633 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9634 PrevInChain);
9635 State.set(this, NextInChain, Part);
9636 }
9637}
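// NOTE (illustrative example, not part of the original source): for a
// predicated in-loop integer add reduction, the select above replaces
// masked-off lanes with the recurrence identity (0 for add). Assuming VF = 4,
// mask <1,0,1,1> and operand <5,7,2,9>, the reduced vector becomes <5,0,2,9>,
// so inactive lanes do not contribute to the running scalar chain.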
9638
9639void VPReplicateRecipe::execute(VPTransformState &State) {
9640 Instruction *UI = getUnderlyingInstr();
9641 if (State.Instance) { // Generate a single instance.
9642 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9643 State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9644 IsPredicated, State);
9645 // Insert scalar instance packing it into a vector.
9646 if (AlsoPack && State.VF.isVector()) {
9647 // If we're constructing lane 0, initialize to start from poison.
9648 if (State.Instance->Lane.isFirstLane()) {
9649 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9650 Value *Poison = PoisonValue::get(
9651 VectorType::get(UI->getType(), State.VF));
9652 State.set(this, Poison, State.Instance->Part);
9653 }
9654 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9655 }
9656 return;
9657 }
9658
9659 if (IsUniform) {
9660 // If the recipe is uniform across all parts (instead of just per VF), only
9661 // generate a single instance.
9662 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9663 all_of(operands(), [](VPValue *Op) {
9664 return Op->isDefinedOutsideVectorRegions();
9665 })) {
9666 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9667 State);
9668 if (user_begin() != user_end()) {
9669 for (unsigned Part = 1; Part < State.UF; ++Part)
9670 State.set(this, State.get(this, VPIteration(0, 0)),
9671 VPIteration(Part, 0));
9672 }
9673 return;
9674 }
9675
9676 // Uniform within VL means we need to generate lane 0 only for each
9677 // unrolled copy.
9678 for (unsigned Part = 0; Part < State.UF; ++Part)
9679 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9680 IsPredicated, State);
9681 return;
9682 }
9683
9684 // A store of a loop-varying value to a loop-invariant address only
9685 // needs the last copy of the store.
9686 if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9687 auto Lane = VPLane::getLastLaneForVF(State.VF);
9688 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9689 IsPredicated, State);
9690 return;
9691 }
9692
9693 // Generate scalar instances for all VF lanes of all UF parts.
9694 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9695 const unsigned EndLane = State.VF.getKnownMinValue();
9696 for (unsigned Part = 0; Part < State.UF; ++Part)
9697 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9698 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9699 IsPredicated, State);
9700}
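// NOTE (illustrative example, assuming a fixed VF = 4 and UF = 2; not part of
// the original source): the fall-through case above emits VF * UF = 8 scalar
// clones of the instruction, one per (part, lane) pair, whereas the uniform
// cases emit only UF copies (or a single copy when all operands are defined
// outside the vector regions).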
9701
9702void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9703 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9704
9705 // Attempt to issue a wide load.
9706 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9707 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9708
9709 assert((LI || SI) && "Invalid Load/Store instruction");
9710 assert((!SI || StoredValue) && "No stored value provided for widened store");
9711 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9712
9713 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9714
9715 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9716 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9717 bool CreateGatherScatter = !Consecutive;
9718
9719 auto &Builder = State.Builder;
9720 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9721 bool isMaskRequired = getMask();
9722 if (isMaskRequired)
9723 for (unsigned Part = 0; Part < State.UF; ++Part)
9724 BlockInMaskParts[Part] = State.get(getMask(), Part);
9725
9726 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9727 // Calculate the pointer for the specific unroll-part.
9728 GetElementPtrInst *PartPtr = nullptr;
9729
9730 bool InBounds = false;
9731 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9732 InBounds = gep->isInBounds();
9733 if (Reverse) {
9734 // If the address is consecutive but reversed, then the
9735 // wide store needs to start at the last vector element.
9736 // RunTimeVF = VScale * VF.getKnownMinValue()
9737 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9738 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9739 // NumElt = -Part * RunTimeVF
9740 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9741 // LastLane = 1 - RunTimeVF
9742 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9743 PartPtr =
9744 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9745 PartPtr->setIsInBounds(InBounds);
9746 PartPtr = cast<GetElementPtrInst>(
9747 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9748 PartPtr->setIsInBounds(InBounds);
9749 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9750 BlockInMaskParts[Part] =
9751 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9752 } else {
9753 Value *Increment =
9754 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9755 PartPtr = cast<GetElementPtrInst>(
9756 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9757 PartPtr->setIsInBounds(InBounds);
9758 }
9759
9760 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9761 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9762 };
9763
9764 // Handle Stores:
9765 if (SI) {
9766 State.setDebugLocFromInst(SI);
9767
9768 for (unsigned Part = 0; Part < State.UF; ++Part) {
9769 Instruction *NewSI = nullptr;
9770 Value *StoredVal = State.get(StoredValue, Part);
9771 if (CreateGatherScatter) {
9772 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9773 Value *VectorGep = State.get(getAddr(), Part);
9774 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9775 MaskPart);
9776 } else {
9777 if (Reverse) {
9778 // If we store to reverse consecutive memory locations, then we need
9779 // to reverse the order of elements in the stored value.
9780 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9781 // We don't want to update the value in the map as it might be used in
9782 // another expression. So don't call resetVectorValue(StoredVal).
9783 }
9784 auto *VecPtr =
9785 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9786 if (isMaskRequired)
9787 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9788 BlockInMaskParts[Part]);
9789 else
9790 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9791 }
9792 State.addMetadata(NewSI, SI);
9793 }
9794 return;
9795 }
9796
9797 // Handle loads.
9798 assert(LI && "Must have a load instruction");
9799 State.setDebugLocFromInst(LI);
9800 for (unsigned Part = 0; Part < State.UF; ++Part) {
9801 Value *NewLI;
9802 if (CreateGatherScatter) {
9803 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9804 Value *VectorGep = State.get(getAddr(), Part);
9805 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9806 nullptr, "wide.masked.gather");
9807 State.addMetadata(NewLI, LI);
9808 } else {
9809 auto *VecPtr =
9810 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9811 if (isMaskRequired)
9812 NewLI = Builder.CreateMaskedLoad(
9813 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9814 PoisonValue::get(DataTy), "wide.masked.load");
9815 else
9816 NewLI =
9817 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9818
9819 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9820 State.addMetadata(NewLI, LI);
9821 if (Reverse)
9822 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9823 }
9824
9825 State.set(getVPSingleValue(), NewLI, Part);
9826 }
9827}
9828
9829// Determine how to lower the scalar epilogue, which depends on 1) optimising
9830// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9831// predication, and 4) a TTI hook that analyses whether the loop is suitable
9832// for predication.
9833static ScalarEpilogueLowering getScalarEpilogueLowering(
9834 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9835 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9836 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9837 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9838 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9839 // don't look at hints or options, and don't request a scalar epilogue.
9840 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9841 // LoopAccessInfo (due to code dependency and not being able to reliably get
9842 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9843 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9844 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9845 // back to the old way and vectorize with versioning when forced. See D81345.)
9846 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9847 PGSOQueryType::IRPass) &&
9848 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9849 return CM_ScalarEpilogueNotAllowedOptSize;
9850
9851 // 2) If set, obey the directives
9852 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9853 switch (PreferPredicateOverEpilogue) {
9854 case PreferPredicateTy::ScalarEpilogue:
9855 return CM_ScalarEpilogueAllowed;
9856 case PreferPredicateTy::PredicateElseScalarEpilogue:
9857 return CM_ScalarEpilogueNotNeededUsePredicate;
9858 case PreferPredicateTy::PredicateOrDontVectorize:
9859 return CM_ScalarEpilogueNotAllowedUsePredicate;
9860 };
9861 }
9862
9863 // 3) If set, obey the hints
9864 switch (Hints.getPredicate()) {
9865 case LoopVectorizeHints::FK_Enabled:
9866 return CM_ScalarEpilogueNotNeededUsePredicate;
9867 case LoopVectorizeHints::FK_Disabled:
9868 return CM_ScalarEpilogueAllowed;
9869 };
9870
9871 // 4) if the TTI hook indicates this is profitable, request predication.
9872 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9873 return CM_ScalarEpilogueNotNeededUsePredicate;
9874
9875 return CM_ScalarEpilogueAllowed;
9876}
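// Illustrative reading of the precedence above (assumed scenarios, not taken
// from the code): a function built with -Os takes case 1) and returns
// CM_ScalarEpilogueNotAllowedOptSize regardless of hints, while a loop that
// carries a predicate hint such as
// "#pragma clang loop vectorize_predicate(enable)" (assumed pragma spelling)
// reaches case 3) and maps to CM_ScalarEpilogueNotNeededUsePredicate.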
9877
9878Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9879 // If Values have been set for this Def return the one relevant for \p Part.
9880 if (hasVectorValue(Def, Part))
9881 return Data.PerPartOutput[Def][Part];
9882
9883 if (!hasScalarValue(Def, {Part, 0})) {
9884 Value *IRV = Def->getLiveInIRValue();
9885 Value *B = ILV->getBroadcastInstrs(IRV);
9886 set(Def, B, Part);
9887 return B;
9888 }
9889
9890 Value *ScalarValue = get(Def, {Part, 0});
9891 // If we aren't vectorizing, we can just copy the scalar map values over
9892 // to the vector map.
9893 if (VF.isScalar()) {
9894 set(Def, ScalarValue, Part);
9895 return ScalarValue;
9896 }
9897
9898 bool IsUniform = vputils::isUniformAfterVectorization(Def);
9899
9900 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9901 // Check if there is a scalar value for the selected lane.
9902 if (!hasScalarValue(Def, {Part, LastLane})) {
9903 // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9904 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9905 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9906 "unexpected recipe found to be invariant");
9907 IsUniform = true;
9908 LastLane = 0;
9909 }
9910
9911 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9912 // Set the insert point after the last scalarized instruction or after the
9913 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9914 // will directly follow the scalar definitions.
9915 auto OldIP = Builder.saveIP();
9916 auto NewIP =
9917 isa<PHINode>(LastInst)
9918 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9919 : std::next(BasicBlock::iterator(LastInst));
9920 Builder.SetInsertPoint(&*NewIP);
9921
9922 // However, if we are vectorizing, we need to construct the vector values.
9923 // If the value is known to be uniform after vectorization, we can just
9924 // broadcast the scalar value corresponding to lane zero for each unroll
9925 // iteration. Otherwise, we construct the vector values using
9926 // insertelement instructions. Since the resulting vectors are stored in
9927 // State, we will only generate the insertelements once.
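// Illustrative example (assumed values): with a fixed VF of 4 and a
// non-uniform Def, the else-branch below emits four insertelement
// instructions that pack lanes 0..3 of this Part into one vector; a uniform
// Def instead takes the broadcast branch and reuses the lane-0 scalar.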
9928 Value *VectorValue = nullptr;
9929 if (IsUniform) {
9930 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9931 set(Def, VectorValue, Part);
9932 } else {
9933 // Initialize packing with insertelements to start from undef.
9934 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9935 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9936 set(Def, Undef, Part);
9937 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9938 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9939 VectorValue = get(Def, Part);
9940 }
9941 Builder.restoreIP(OldIP);
9942 return VectorValue;
9943}
9944
9945// Process the loop in the VPlan-native vectorization path. This path builds
9946 // VPlan upfront in the vectorization pipeline, which allows applying
9947// VPlan-to-VPlan transformations from the very beginning without modifying the
9948// input LLVM IR.
9949static bool processLoopInVPlanNativePath(
9950 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9951 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9952 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9953 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9954 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9955 LoopVectorizationRequirements &Requirements) {
9956
9957 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9958 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9959 return false;
9960 }
9961 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9962 Function *F = L->getHeader()->getParent();
9963 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9964
9965 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9966 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9967
9968 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9969 &Hints, IAI);
9970 // Use the planner for outer loop vectorization.
9971 // TODO: CM is not used at this point inside the planner. Turn CM into an
9972 // optional argument if we don't need it in the future.
9973 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9974
9975 // Get user vectorization factor.
9976 ElementCount UserVF = Hints.getWidth();
9977
9978 CM.collectElementTypesForWidening();
9979
9980 // Plan how to best vectorize, return the best VF and its cost.
9981 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9982
9983 // If we are stress testing VPlan builds, do not attempt to generate vector
9984 // code. Masked vector code generation support will follow soon.
9985 // Also, do not attempt to vectorize if no vector code will be produced.
9986 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9987 return false;
9988
9989 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9990
9991 {
9992 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9993 F->getParent()->getDataLayout());
9994 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9995 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9996 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9997 << L->getHeader()->getParent()->getName() << "\"\n");
9998 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9999 }
10000
10001 // Mark the loop as already vectorized to avoid vectorizing again.
10002 Hints.setAlreadyVectorized();
10003 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10004 return true;
10005}
10006
10007// Emit a remark if there are stores to floats that required a floating point
10008// extension. If the vectorized loop was generated with floating point there
10009// will be a performance penalty from the conversion overhead and the change in
10010// the vector width.
10011static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10012 SmallVector<Instruction *, 4> Worklist;
10013 for (BasicBlock *BB : L->getBlocks()) {
10014 for (Instruction &Inst : *BB) {
10015 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10016 if (S->getValueOperand()->getType()->isFloatTy())
10017 Worklist.push_back(S);
10018 }
10019 }
10020 }
10021
10022 // Traverse the floating point stores upwards, searching for floating point
10023 // conversions.
10024 SmallPtrSet<const Instruction *, 4> Visited;
10025 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10026 while (!Worklist.empty()) {
10027 auto *I = Worklist.pop_back_val();
10028 if (!L->contains(I))
10029 continue;
10030 if (!Visited.insert(I).second)
10031 continue;
10032
10033 // Emit a remark if the floating point store required a floating
10034 // point conversion.
10035 // TODO: More work could be done to identify the root cause such as a
10036 // constant or a function return type and point the user to it.
10037 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10038 ORE->emit([&]() {
10039 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10040 I->getDebugLoc(), L->getHeader())
10041 << "floating point conversion changes vector width. "
10042 << "Mixed floating point precision requires an up/down "
10043 << "cast that will negatively impact performance.";
10044 });
10045
10046 for (Use &Op : I->operands())
10047 if (auto *OpI = dyn_cast<Instruction>(Op))
10048 Worklist.push_back(OpI);
10049 }
10050}
10051
10052static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10053 VectorizationFactor &VF,
10054 std::optional<unsigned> VScale, Loop *L,
10055 ScalarEvolution &SE) {
10056 InstructionCost CheckCost = Checks.getCost();
10057 if (!CheckCost.isValid())
10058 return false;
10059
10060 // When interleaving only, scalar and vector cost will be equal, which in turn
10061 // would lead to a divide by 0. Fall back to hard threshold.
10062 if (VF.Width.isScalar()) {
10063 if (CheckCost > VectorizeMemoryCheckThreshold) {
10064 LLVM_DEBUG(
10065 dbgs()
10066 << "LV: Interleaving only is not profitable due to runtime checks\n");
10067 return false;
10068 }
10069 return true;
10070 }
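// Illustrative example (assumed numbers): with a CheckCost of 150 and
// VectorizeMemoryCheckThreshold at its assumed default of 128, the check
// above rejects an interleave-only plan; a CheckCost of 100 would let it
// proceed.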
10071
10072 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10073 double ScalarC = *VF.ScalarCost.getValue();
10074 if (ScalarC == 0)
10075 return true;
10076
10077 // First, compute the minimum iteration count required so that the vector
10078 // loop outperforms the scalar loop.
10079 // The total cost of the scalar loop is
10080 // ScalarC * TC
10081 // where
10082 // * TC is the actual trip count of the loop.
10083 // * ScalarC is the cost of a single scalar iteration.
10084 //
10085 // The total cost of the vector loop is
10086 // RtC + VecC * (TC / VF) + EpiC
10087 // where
10088 // * RtC is the cost of the generated runtime checks
10089 // * VecC is the cost of a single vector iteration.
10090 // * TC is the actual trip count of the loop
10091 // * VF is the vectorization factor
10092 // * EpiC is the cost of the generated epilogue, including the cost
10093 // of the remaining scalar operations.
10094 //
10095 // Vectorization is profitable once the total vector cost is less than the
10096 // total scalar cost:
10097 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10098 //
10099 // Now we can compute the minimum required trip count TC as
10100 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10101 //
10102 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10103 // the computations are performed on doubles, not integers and the result
10104 // is rounded up, hence we get an upper estimate of the TC.
10105 unsigned IntVF = VF.Width.getKnownMinValue();
10106 if (VF.Width.isScalable()) {
10107 unsigned AssumedMinimumVscale = 1;
10108 if (VScale)
10109 AssumedMinimumVscale = *VScale;
10110 IntVF *= AssumedMinimumVscale;
10111 }
10112 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10113 double RtC = *CheckCost.getValue();
10114 double MinTC1 = RtC / (ScalarC - VecCOverVF);
10115
10116 // Second, compute a minimum iteration count so that the cost of the
10117 // runtime checks is only a fraction of the total scalar loop cost. This
10118 // adds a loop-dependent bound on the overhead incurred if the runtime
10119 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10120 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10121 // cost, compute
10122 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
10123 double MinTC2 = RtC * 10 / ScalarC;
10124
10125 // Now pick the larger minimum. If it is not a multiple of VF, choose the
10126 // next closest multiple of VF. This should partly compensate for ignoring
10127 // the epilogue cost.
10128 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10129 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
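// Worked example (assumed costs, not derived from this code): with
// ScalarC = 4, VecC = 10, IntVF = 4 and RtC = 30, VecCOverVF is 2.5,
// MinTC1 = 30 / (4 - 2.5) = 20 and MinTC2 = 30 * 10 / 4 = 75; the larger
// value, rounded up and aligned to the VF, gives a MinProfitableTripCount
// of 76.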
10130
10131 LLVM_DEBUG(
10132 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10133 << VF.MinProfitableTripCount << "\n");
10134
10135 // Skip vectorization if the expected trip count is less than the minimum
10136 // required trip count.
10137 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10138 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10139 VF.MinProfitableTripCount)) {
10140 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10141 "trip count < minimum profitable VF ("
10142 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10143 << ")\n");
10144
10145 return false;
10146 }
10147 }
10148 return true;
10149}
10150
10151LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10152 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10153 !EnableLoopInterleaving),
10154 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10155 !EnableLoopVectorization) {}
10156
10157bool LoopVectorizePass::processLoop(Loop *L) {
10158 assert((EnableVPlanNativePath || L->isInnermost()) &&
10159 "VPlan-native path is not enabled. Only process inner loops.");
10160
10161#ifndef NDEBUG
10162 const std::string DebugLocStr = getDebugLocString(L);
10163#endif /* NDEBUG */
10164
10165 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10166 << L->getHeader()->getParent()->getName() << "' from "
10167 << DebugLocStr << "\n");
10168
10169 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10170
10171 LLVM_DEBUG(
10172 dbgs() << "LV: Loop hints:"
10173 << " force="
10174 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10175 ? "disabled"
10176 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10177 ? "enabled"
10178 : "?"))
10179 << " width=" << Hints.getWidth()
10180 << " interleave=" << Hints.getInterleave() << "\n");
10181
10182 // Function containing loop
10183 Function *F = L->getHeader()->getParent();
10184
10185 // Looking at the diagnostic output is the only way to determine if a loop
10186 // was vectorized (other than looking at the IR or machine code), so it
10187 // is important to generate an optimization remark for each loop. Most of
10188 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10189 // generated as OptimizationRemark and OptimizationRemarkMissed are
10190 // less verbose, reporting vectorized loops and unvectorized loops that may
10191 // benefit from vectorization, respectively.
10192
10193 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10194 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10195 return false;
10196 }
10197
10198 PredicatedScalarEvolution PSE(*SE, *L);
10199
10200 // Check if it is legal to vectorize the loop.
10201 LoopVectorizationRequirements Requirements;
10202 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10203 &Requirements, &Hints, DB, AC, BFI, PSI);
10204 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10205 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10206 Hints.emitRemarkWithHints();
10207 return false;
10208 }
10209
10210 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10211 // here. They may require CFG and instruction level transformations before
10212 // even evaluating whether vectorization is profitable. Since we cannot modify
10213 // the incoming IR, we need to build VPlan upfront in the vectorization
10214 // pipeline.
10215 if (!L->isInnermost())
10216 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10217 ORE, BFI, PSI, Hints, Requirements);
10218
10219 assert(L->isInnermost() && "Inner loop expected.");
10220
10221 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10222 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10223
10224 // If an override option has been passed in for interleaved accesses, use it.
10225 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10226 UseInterleaved = EnableInterleavedMemAccesses;
10227
10228 // Analyze interleaved memory accesses.
10229 if (UseInterleaved)
10230 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10231
10232 // Check the function attributes and profiles to find out if this function
10233 // should be optimized for size.
10234 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10235 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10236
10237 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10238 // count by optimizing for size, to minimize overheads.
10239 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10240 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10241 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10242 << "This loop is worth vectorizing only if no scalar "
10243 << "iteration overheads are incurred.");
10244 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10245 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10246 else {
10247 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10248 LLVM_DEBUG(dbgs() << "\n");
10249 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10250 } else {
10251 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10252 "small to consider vectorizing.\n");
10253 reportVectorizationFailure(
10254 "The trip count is below the minial threshold value.",
10255 "loop trip count is too low, avoiding vectorization",
10256 "LowTripCount", ORE, L);
10257 Hints.emitRemarkWithHints();
10258 return false;
10259 }
10260 }
10261 }
10262
10263 // Check the function attributes to see if implicit floats or vectors are
10264 // allowed.
10265 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10266 reportVectorizationFailure(
10267 "Can't vectorize when the NoImplicitFloat attribute is used",
10268 "loop not vectorized due to NoImplicitFloat attribute",
10269 "NoImplicitFloat", ORE, L);
10270 Hints.emitRemarkWithHints();
10271 return false;
10272 }
10273
10274 // Check if the target supports potentially unsafe FP vectorization.
10275 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10276 // for the target we're vectorizing for, to make sure none of the
10277 // additional fp-math flags can help.
10278 if (Hints.isPotentiallyUnsafe() &&
10279 TTI->isFPVectorizationPotentiallyUnsafe()) {
10280 reportVectorizationFailure(
10281 "Potentially unsafe FP op prevents vectorization",
10282 "loop not vectorized due to unsafe FP support.",
10283 "UnsafeFP", ORE, L);
10284 Hints.emitRemarkWithHints();
10285 return false;
10286 }
10287
10288 bool AllowOrderedReductions;
10289 // If the flag is set, use that instead and override the TTI behaviour.
10290 if (ForceOrderedReductions.getNumOccurrences() > 0)
10291 AllowOrderedReductions = ForceOrderedReductions;
10292 else
10293 AllowOrderedReductions = TTI->enableOrderedReductions();
10294 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10295 ORE->emit([&]() {
10296 auto *ExactFPMathInst = Requirements.getExactFPInst();
10297 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10298 ExactFPMathInst->getDebugLoc(),
10299 ExactFPMathInst->getParent())
10300 << "loop not vectorized: cannot prove it is safe to reorder "
10301 "floating-point operations";
10302 });
10303 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10304 "reorder floating-point operations\n");
10305 Hints.emitRemarkWithHints();
10306 return false;
10307 }
10308
10309 // Use the cost model.
10310 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10311 F, &Hints, IAI);
10312 CM.collectValuesToIgnore();
10313 CM.collectElementTypesForWidening();
10314
10315 // Use the planner for vectorization.
10316 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10317
10318 // Get user vectorization factor and interleave count.
10319 ElementCount UserVF = Hints.getWidth();
10320 unsigned UserIC = Hints.getInterleave();
10321
10322 // Plan how to best vectorize, return the best VF and its cost.
10323 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10324
10325 VectorizationFactor VF = VectorizationFactor::Disabled();
10326 unsigned IC = 1;
10327
10328 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10329 F->getParent()->getDataLayout());
10330 if (MaybeVF) {
10331 VF = *MaybeVF;
10332 // Select the interleave count.
10333 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10334
10335 unsigned SelectedIC = std::max(IC, UserIC);
10336 // Optimistically generate runtime checks if they are needed. Drop them if
10337 // they turn out to not be profitable.
10338 if (VF.Width.isVector() || SelectedIC > 1)
10339 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10340
10341 // Check if it is profitable to vectorize with runtime checks.
10342 bool ForceVectorization =
10343 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10344 if (!ForceVectorization &&
10345 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10346 *PSE.getSE())) {
10347 ORE->emit([&]() {
10348 return OptimizationRemarkAnalysisAliasing(
10349 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10350 L->getHeader())
10351 << "loop not vectorized: cannot prove it is safe to reorder "
10352 "memory operations";
10353 });
10354 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10355 Hints.emitRemarkWithHints();
10356 return false;
10357 }
10358 }
10359
10360 // Identify the diagnostic messages that should be produced.
10361 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10362 bool VectorizeLoop = true, InterleaveLoop = true;
10363 if (VF.Width.isScalar()) {
10364 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10365 VecDiagMsg = std::make_pair(
10366 "VectorizationNotBeneficial",
10367 "the cost-model indicates that vectorization is not beneficial");
10368 VectorizeLoop = false;
10369 }
10370
10371 if (!MaybeVF && UserIC > 1) {
10372 // Tell the user interleaving was avoided up-front, despite being explicitly
10373 // requested.
10374 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10375 "interleaving should be avoided up front\n");
10376 IntDiagMsg = std::make_pair(
10377 "InterleavingAvoided",
10378 "Ignoring UserIC, because interleaving was avoided up front");
10379 InterleaveLoop = false;
10380 } else if (IC == 1 && UserIC <= 1) {
10381 // Tell the user interleaving is not beneficial.
10382 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10383 IntDiagMsg = std::make_pair(
10384 "InterleavingNotBeneficial",
10385 "the cost-model indicates that interleaving is not beneficial");
10386 InterleaveLoop = false;
10387 if (UserIC == 1) {
10388 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10389 IntDiagMsg.second +=
10390 " and is explicitly disabled or interleave count is set to 1";
10391 }
10392 } else if (IC > 1 && UserIC == 1) {
10393 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10394 LLVM_DEBUG(
10395 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10396 IntDiagMsg = std::make_pair(
10397 "InterleavingBeneficialButDisabled",
10398 "the cost-model indicates that interleaving is beneficial "
10399 "but is explicitly disabled or interleave count is set to 1");
10400 InterleaveLoop = false;
10401 }
10402
10403 // Override IC if user provided an interleave count.
10404 IC = UserIC > 0 ? UserIC : IC;
10405
10406 // Emit diagnostic messages, if any.
10407 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10408 if (!VectorizeLoop && !InterleaveLoop) {
10409 // Do not vectorize or interleave the loop.
10410 ORE->emit([&]() {
10411 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10412 L->getStartLoc(), L->getHeader())
10413 << VecDiagMsg.second;
10414 });
10415 ORE->emit([&]() {
10416 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10417 L->getStartLoc(), L->getHeader())
10418 << IntDiagMsg.second;
10419 });
10420 return false;
10421 } else if (!VectorizeLoop && InterleaveLoop) {
10422 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10423 ORE->emit([&]() {
10424 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10425 L->getStartLoc(), L->getHeader())
10426 << VecDiagMsg.second;
10427 });
10428 } else if (VectorizeLoop && !InterleaveLoop) {
10429 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10430 << ") in " << DebugLocStr << '\n');
10431 ORE->emit([&]() {
10432 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10433 L->getStartLoc(), L->getHeader())
10434 << IntDiagMsg.second;
10435 });
10436 } else if (VectorizeLoop && InterleaveLoop) {
10437 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10438 << ") in " << DebugLocStr << '\n');
10439 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10440 }
10441
10442 bool DisableRuntimeUnroll = false;
10443 MDNode *OrigLoopID = L->getLoopID();
10444 {
10445 using namespace ore;
10446 if (!VectorizeLoop) {
10447 assert(IC > 1 && "interleave count should not be 1 or 0");
10448 // If we decided that it is not profitable to vectorize the loop, then
10449 // interleave it.
10450 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10451 &CM, BFI, PSI, Checks);
10452
10453 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10454 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10455
10456 ORE->emit([&]() {
10457 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10458 L->getHeader())
10459 << "interleaved loop (interleaved count: "
10460 << NV("InterleaveCount", IC) << ")";
10461 });
10462 } else {
10463 // If we decided that it is *legal* to vectorize the loop, then do it.
10464
10465 // Consider vectorizing the epilogue too if it's profitable.
10466 VectorizationFactor EpilogueVF =
10467 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10468 if (EpilogueVF.Width.isVector()) {
10469
10470 // The first pass vectorizes the main loop and creates a scalar epilogue
10471 // to be vectorized by executing the plan (potentially with a different
10472 // factor) again shortly afterwards.
10473 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10474 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10475 EPI, &LVL, &CM, BFI, PSI, Checks);
10476
10477 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10478 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10479 DT, true);
10480 ++LoopsVectorized;
10481
10482 // Second pass vectorizes the epilogue and adjusts the control flow
10483 // edges from the first pass.
10484 EPI.MainLoopVF = EPI.EpilogueVF;
10485 EPI.MainLoopUF = EPI.EpilogueUF;
10486 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10487 ORE, EPI, &LVL, &CM, BFI, PSI,
10488 Checks);
10489
10490 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10491 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10492 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10493 Header->setName("vec.epilog.vector.body");
10494
10495 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10496 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10497 // before vectorizing the epilogue loop.
10498 for (VPRecipeBase &R : Header->phis()) {
10499 if (isa<VPCanonicalIVPHIRecipe>(&R))
10500 continue;
10501
10502 Value *ResumeV = nullptr;
10503 // TODO: Move setting of resume values to prepareToExecute.
10504 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10505 ResumeV = MainILV.getReductionResumeValue(
10506 ReductionPhi->getRecurrenceDescriptor());
10507 } else {
10508 // Create induction resume values for both widened pointer and
10509 // integer/fp inductions and update the start value of the induction
10510 // recipes to use the resume value.
10511 PHINode *IndPhi = nullptr;
10512 const InductionDescriptor *ID;
10513 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10514 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10515 ID = &Ind->getInductionDescriptor();
10516 } else {
10517 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10518 IndPhi = WidenInd->getPHINode();
10519 ID = &WidenInd->getInductionDescriptor();
10520 }
10521
10522 ResumeV = MainILV.createInductionResumeValue(
10523 IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
10524 }
10525 assert(ResumeV && "Must have a resume value");
10526 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
10527 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10528 }
10529
10530 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10531 DT, true);
10532 ++LoopsEpilogueVectorized;
10533
10534 if (!MainILV.areSafetyChecksAdded())
10535 DisableRuntimeUnroll = true;
10536 } else {
10537 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10538 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10539 PSI, Checks);
10540
10541 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10542 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10543 ++LoopsVectorized;
10544
10545 // Add metadata to disable runtime unrolling a scalar loop when there
10546 // are no runtime checks about strides and memory. A scalar loop that is
10547 // rarely used is not worth unrolling.
10548 if (!LB.areSafetyChecksAdded())
10549 DisableRuntimeUnroll = true;
10550 }
10551 // Report the vectorization decision.
10552 ORE->emit([&]() {
10553 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10554 L->getHeader())
10555 << "vectorized loop (vectorization width: "
10556 << NV("VectorizationFactor", VF.Width)
10557 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10558 });
10559 }
10560
10561 if (ORE->allowExtraAnalysis(LV_NAME))
10562 checkMixedPrecision(L, ORE);
10563 }
10564
10565 std::optional<MDNode *> RemainderLoopID =
10566 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10567 LLVMLoopVectorizeFollowupEpilogue});
10568 if (RemainderLoopID) {
10569 L->setLoopID(*RemainderLoopID);
10570 } else {
10571 if (DisableRuntimeUnroll)
10572 AddRuntimeUnrollDisableMetaData(L);
10573
10574 // Mark the loop as already vectorized to avoid vectorizing again.
10575 Hints.setAlreadyVectorized();
10576 }
10577
10578 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10579 return true;
10580}
10581
10582LoopVectorizeResult LoopVectorizePass::runImpl(
10583 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10584 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10585 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10586 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10587 SE = &SE_;
10588 LI = &LI_;
10589 TTI = &TTI_;
10590 DT = &DT_;
10591 BFI = &BFI_;
10592 TLI = TLI_;
10593 AC = &AC_;
10594 LAIs = &LAIs_;
10595 DB = &DB_;
10596 ORE = &ORE_;
10597 PSI = PSI_;
10598
10599 // Don't attempt if
10600 // 1. the target claims to have no vector registers, and
10601 // 2. interleaving won't help ILP.
10602 //
10603 // The second condition is necessary because, even if the target has no
10604 // vector registers, loop vectorization may still enable scalar
10605 // interleaving.
10606 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10607 TTI->getMaxInterleaveFactor(1) < 2)
10608 return LoopVectorizeResult(false, false);
10609
10610 bool Changed = false, CFGChanged = false;
10611
10612 // The vectorizer requires loops to be in simplified form.
10613 // Since simplification may add new inner loops, it has to run before the
10614 // legality and profitability checks. This means running the loop vectorizer
10615 // will simplify all loops, regardless of whether anything ends up being
10616 // vectorized.
10617 for (const auto &L : *LI)
10618 Changed |= CFGChanged |=
10619 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10620
10621 // Build up a worklist of inner-loops to vectorize. This is necessary as
10622 // the act of vectorizing or partially unrolling a loop creates new loops
10623 // and can invalidate iterators across the loops.
10624 SmallVector<Loop *, 8> Worklist;
10625
10626 for (Loop *L : *LI)
10627 collectSupportedLoops(*L, LI, ORE, Worklist);
10628
10629 LoopsAnalyzed += Worklist.size();
10630
10631 // Now walk the identified inner loops.
10632 while (!Worklist.empty()) {
10633 Loop *L = Worklist.pop_back_val();
10634
10635 // For the inner loops we actually process, form LCSSA to simplify the
10636 // transform.
10637 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10638
10639 Changed |= CFGChanged |= processLoop(L);
10640
10641 if (Changed)
10642 LAIs->clear();
10643 }
10644
10645 // Process each loop nest in the function.
10646 return LoopVectorizeResult(Changed, CFGChanged);
10647}
10648
10649PreservedAnalyses LoopVectorizePass::run(Function &F,
10650 FunctionAnalysisManager &AM) {
10651 auto &LI = AM.getResult<LoopAnalysis>(F);
10652 // There are no loops in the function. Return before computing other expensive
10653 // analyses.
10654 if (LI.empty())
10655 return PreservedAnalyses::all();
10656 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10657 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10658 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10659 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10660 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10661 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10662 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10663 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10664
10665 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10666 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10667 ProfileSummaryInfo *PSI =
10668 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10669 LoopVectorizeResult Result =
10670 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10671 if (!Result.MadeAnyChange)
10672 return PreservedAnalyses::all();
10673 PreservedAnalyses PA;
10674
10675 // We currently do not preserve loopinfo/dominator analyses with outer loop
10676 // vectorization. Until this is addressed, mark these analyses as preserved
10677 // only for non-VPlan-native path.
10678 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10679 if (!EnableVPlanNativePath) {
10680 PA.preserve<LoopAnalysis>();
10681 PA.preserve<DominatorTreeAnalysis>();
10682 }
10683
10684 if (Result.MadeCFGChange) {
10685 // Making CFG changes likely means a loop got vectorized. Indicate that
10686 // extra simplification passes should be run.
10687 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10688 // be run if runtime checks have been added.
10689 AM.getResult<ShouldRunExtraVectorPasses>(F);
10690 PA.preserve<ShouldRunExtraVectorPasses>();
10691 } else {
10692 PA.preserveSet<CFGAnalyses>();
10693 }
10694 return PA;
10695}
10696
10697void LoopVectorizePass::printPipeline(
10698 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10699 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10700 OS, MapClassName2PassName);
10701
10702 OS << "<";
10703 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10704 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10705 OS << ">";
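// Illustrative output (assuming both members are at their default false
// values): "<no-interleave-forced-only;no-vectorize-forced-only;>", appended
// to whatever the base printPipeline emitted above.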
10706}

/build/source/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides a LoopVectorizationPlanner class.
11/// InnerLoopVectorizer vectorizes loops which contain only one basic block.
12/// LoopVectorizationPlanner - drives the vectorization process after having
13/// passed Legality checks.
14/// The planner builds and optimizes the Vectorization Plans which record the
15/// decisions on how to vectorize the given loop. In particular, they represent the
16/// control-flow of the vectorized version, the replication of instructions that
17/// are to be scalarized, and interleave access groups.
18///
19/// Also provides a VPlan-based builder utility analogous to IRBuilder.
20/// It provides an instruction-level API for generating VPInstructions while
21/// abstracting away the Recipe manipulation details.
22//===----------------------------------------------------------------------===//
23
24#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
25#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
26
27#include "VPlan.h"
28#include "llvm/Support/InstructionCost.h"
29
30namespace llvm {
31
32class LoopInfo;
33class LoopVectorizationLegality;
34class LoopVectorizationCostModel;
35class PredicatedScalarEvolution;
36class LoopVectorizeHints;
37class OptimizationRemarkEmitter;
38class TargetTransformInfo;
39class TargetLibraryInfo;
40class VPRecipeBuilder;
41
42/// VPlan-based builder utility analogous to IRBuilder.
43class VPBuilder {
44 VPBasicBlock *BB = nullptr;
45 VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
46
47 VPInstruction *createInstruction(unsigned Opcode,
48 ArrayRef<VPValue *> Operands, DebugLoc DL,
49 const Twine &Name = "") {
50 VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name);
17: Memory is allocated
51 if (BB)
18: Assuming field 'BB' is null
19: Taking false branch
52 BB->insert(Instr, InsertPt);
53 return Instr;
54 }
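Editor's note on the path the analyzer is annotating here: with no insert point set, the pattern reduces to the following sketch (X and Y are placeholder operands, not from the source):

  VPBuilder B;                               // BB == nullptr, as assumed in step 18
  VPValue *V = B.createOr(X, Y, DebugLoc()); // allocates (step 17) but inserts nothing
  // Steps 20-21: the allocation is handed back to the caller un-inserted, so the
  // caller alone owns it; discarding or overwriting V without inserting or
  // deleting the instruction leaves the allocation unreachable.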
55
56 VPInstruction *createInstruction(unsigned Opcode,
57 std::initializer_list<VPValue *> Operands,
58 DebugLoc DL, const Twine &Name = "") {
59 return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name);
16: Calling 'VPBuilder::createInstruction'
20: Returned allocated memory
60 }
61
62public:
63 VPBuilder() = default;
64
65 /// Clear the insertion point: created instructions will not be inserted into
66 /// a block.
67 void clearInsertionPoint() {
68 BB = nullptr;
69 InsertPt = VPBasicBlock::iterator();
70 }
71
72 VPBasicBlock *getInsertBlock() const { return BB; }
73 VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
74
75 /// InsertPoint - A saved insertion point.
76 class VPInsertPoint {
77 VPBasicBlock *Block = nullptr;
78 VPBasicBlock::iterator Point;
79
80 public:
81 /// Creates a new insertion point which doesn't point to anything.
82 VPInsertPoint() = default;
83
84 /// Creates a new insertion point at the given location.
85 VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
86 : Block(InsertBlock), Point(InsertPoint) {}
87
88 /// Returns true if this insert point is set.
89 bool isSet() const { return Block != nullptr; }
90
91 VPBasicBlock *getBlock() const { return Block; }
92 VPBasicBlock::iterator getPoint() const { return Point; }
93 };
94
95 /// Sets the current insert point to a previously-saved location.
96 void restoreIP(VPInsertPoint IP) {
97 if (IP.isSet())
98 setInsertPoint(IP.getBlock(), IP.getPoint());
99 else
100 clearInsertionPoint();
101 }
102
103 /// This specifies that created VPInstructions should be appended to the end
104 /// of the specified block.
105 void setInsertPoint(VPBasicBlock *TheBB) {
106    assert(TheBB && "Attempting to set a null insert point");
107 BB = TheBB;
108 InsertPt = BB->end();
109 }
110
111 /// This specifies that created instructions should be inserted at the
112 /// specified point.
113 void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
114 BB = TheBB;
115 InsertPt = IP;
116 }
117
118 /// Insert and return the specified instruction.
119 VPInstruction *insert(VPInstruction *I) const {
120 BB->insert(I, InsertPt);
121 return I;
122 }
123
124 /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
125 /// its underlying Instruction.
126 VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
127 Instruction *Inst = nullptr, const Twine &Name = "") {
128 DebugLoc DL;
129 if (Inst)
130 DL = Inst->getDebugLoc();
131 VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
132 NewVPInst->setUnderlyingValue(Inst);
133 return NewVPInst;
134 }
135 VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
136 DebugLoc DL, const Twine &Name = "") {
137 return createInstruction(Opcode, Operands, DL, Name);
138 }
139
140 VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
141 return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
142 }
143
144 VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL,
145 const Twine &Name = "") {
146 return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
147 }
148
149 VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL,
150 const Twine &Name = "") {
151 return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name);
15: Calling 'VPBuilder::createInstruction'
21: Returned allocated memory
152 }
153
154 VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
155 DebugLoc DL, const Twine &Name = "") {
156 return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
157 Name);
158 }
159
160 //===--------------------------------------------------------------------===//
161 // RAII helpers.
162 //===--------------------------------------------------------------------===//
163
164 /// RAII object that stores the current insertion point and restores it when
165 /// the object is destroyed.
166 class InsertPointGuard {
167 VPBuilder &Builder;
168 VPBasicBlock *Block;
169 VPBasicBlock::iterator Point;
170
171 public:
172 InsertPointGuard(VPBuilder &B)
173 : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
174
175 InsertPointGuard(const InsertPointGuard &) = delete;
176 InsertPointGuard &operator=(const InsertPointGuard &) = delete;
177
178 ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
179 };
180};
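A minimal usage sketch of the builder above (illustrative only; the block and operand values are assumed to exist elsewhere):

  void buildMask(VPBasicBlock *VPBB, VPValue *EdgeA, VPValue *EdgeB) {
    VPBuilder Builder;
    Builder.setInsertPoint(VPBB);                 // append new recipes to VPBB
    VPValue *Mask = Builder.createOr(EdgeA, EdgeB, DebugLoc(), "block.mask");
    {
      // Temporarily move the insert point; the guard restores it on scope exit.
      VPBuilder::InsertPointGuard Guard(Builder);
      Builder.setInsertPoint(VPBB, VPBB->begin());
      Builder.createNot(Mask, DebugLoc(), "not.mask");
    }
    // If no insert point had been set, the created instructions would be
    // returned un-inserted and the caller would have to take ownership of them.
  }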
181
182/// TODO: The following VectorizationFactor was pulled out of
183/// LoopVectorizationCostModel class. LV also deals with
184/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
185/// We need to streamline them.
186
187/// Information about vectorization costs.
188struct VectorizationFactor {
189 /// Vector width with best cost.
190 ElementCount Width;
191
192 /// Cost of the loop with that width.
193 InstructionCost Cost;
194
195 /// Cost of the scalar loop.
196 InstructionCost ScalarCost;
197
198 /// The minimum trip count required to make vectorization profitable, e.g. due
199 /// to runtime checks.
200 ElementCount MinProfitableTripCount;
201
202 VectorizationFactor(ElementCount Width, InstructionCost Cost,
203 InstructionCost ScalarCost)
204 : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
205
206 /// Width 1 means no vectorization, cost 0 means uncomputed cost.
207 static VectorizationFactor Disabled() {
208 return {ElementCount::getFixed(1), 0, 0};
209 }
210
211 bool operator==(const VectorizationFactor &rhs) const {
212 return Width == rhs.Width && Cost == rhs.Cost;
213 }
214
215 bool operator!=(const VectorizationFactor &rhs) const {
216 return !(*this == rhs);
217 }
218};
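An illustrative comparison using the struct above (all numbers are made up):

  VectorizationFactor VF8(ElementCount::getFixed(8),
                          /*Cost=*/InstructionCost(20),
                          /*ScalarCost=*/InstructionCost(48));
  VectorizationFactor Off = VectorizationFactor::Disabled(); // width 1, cost 0
  bool Different = (VF8 != Off); // true; operator== compares only Width and Cost,
                                 // ScalarCost and MinProfitableTripCount are ignored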
219
220/// A class that represents two vectorization factors (initialized with 0 by
221/// default). One for fixed-width vectorization and one for scalable
222/// vectorization. This can be used by the vectorizer to choose from a range of
223/// fixed and/or scalable VFs in order to find the most cost-effective VF to
224/// vectorize with.
225struct FixedScalableVFPair {
226 ElementCount FixedVF;
227 ElementCount ScalableVF;
228
229 FixedScalableVFPair()
230 : FixedVF(ElementCount::getFixed(0)),
231 ScalableVF(ElementCount::getScalable(0)) {}
232 FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
233 *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
234 }
235 FixedScalableVFPair(const ElementCount &FixedVF,
236 const ElementCount &ScalableVF)
237 : FixedVF(FixedVF), ScalableVF(ScalableVF) {
238    assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
239           "Invalid scalable properties");
240 }
241
242 static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
243
244 /// \return true if either fixed- or scalable VF is non-zero.
245 explicit operator bool() const { return FixedVF || ScalableVF; }
246
247 /// \return true if either fixed- or scalable VF is a valid vector VF.
248 bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
249};
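An illustrative example of how the single-argument constructor routes a maximum VF (values are made up):

  FixedScalableVFPair A(ElementCount::getFixed(8));    // FixedVF = 8, ScalableVF = 0
  FixedScalableVFPair B(ElementCount::getScalable(4)); // FixedVF = 0, ScalableVF = vscale x 4
  FixedScalableVFPair None = FixedScalableVFPair::getNone();
  // bool(A) and bool(B) are true, bool(None) is false; A.hasVector() and
  // B.hasVector() are true because each holds a genuine vector width.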
250
251/// Planner drives the vectorization process after having passed
252/// Legality checks.
253class LoopVectorizationPlanner {
254 /// The loop that we evaluate.
255 Loop *OrigLoop;
256
257 /// Loop Info analysis.
258 LoopInfo *LI;
259
260 /// Target Library Info.
261 const TargetLibraryInfo *TLI;
262
263 /// Target Transform Info.
264 const TargetTransformInfo *TTI;
265
266 /// The legality analysis.
267 LoopVectorizationLegality *Legal;
268
269 /// The profitability analysis.
270 LoopVectorizationCostModel &CM;
271
272 /// The interleaved access analysis.
273 InterleavedAccessInfo &IAI;
274
275 PredicatedScalarEvolution &PSE;
276
277 const LoopVectorizeHints &Hints;
278
279 OptimizationRemarkEmitter *ORE;
280
281 SmallVector<VPlanPtr, 4> VPlans;
282
283 /// A builder used to construct the current plan.
284 VPBuilder Builder;
285
286public:
287 LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
288 const TargetTransformInfo *TTI,
289 LoopVectorizationLegality *Legal,
290 LoopVectorizationCostModel &CM,
291 InterleavedAccessInfo &IAI,
292 PredicatedScalarEvolution &PSE,
293 const LoopVectorizeHints &Hints,
294 OptimizationRemarkEmitter *ORE)
295 : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
296 PSE(PSE), Hints(Hints), ORE(ORE) {}
297
298 /// Plan how to best vectorize, return the best VF and its cost, or
299 /// std::nullopt if vectorization and interleaving should be avoided up front.
300 std::optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
301
302 /// Use the VPlan-native path to plan how to best vectorize, return the best
303 /// VF and its cost.
304 VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
305
306 /// Return the best VPlan for \p VF.
307 VPlan &getBestPlanFor(ElementCount VF) const;
308
309 /// Generate the IR code for the body of the vectorized loop according to the
310 /// best selected \p VF, \p UF and VPlan \p BestPlan.
311 /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
312 /// vectorization re-using plans for both the main and epilogue vector loops.
313 /// It should be removed once the re-use issue has been fixed.
314 void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
315 InnerLoopVectorizer &LB, DominatorTree *DT,
316 bool IsEpilogueVectorization);
317
318#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
319 void printPlans(raw_ostream &O);
320#endif
321
322 /// Look through the existing plans and return true if we have one with all
323 /// the vectorization factors in question.
324 bool hasPlanWithVF(ElementCount VF) const {
325 return any_of(VPlans,
326 [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
327 }
328
329 /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
330 /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
331 /// returned value holds for the entire \p Range.
332 static bool
333 getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
334 VFRange &Range);
335
336 /// Check if the number of runtime checks exceeds the threshold.
337 bool requiresTooManyRuntimeChecks() const;
338
339protected:
340 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
341 /// according to the information gathered by Legal when it checked if it is
342 /// legal to vectorize the loop.
343 void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
344
345private:
346 /// Build a VPlan according to the information gathered by Legal. \return a
347 /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
348 /// exclusive, possibly decreasing \p Range.End.
349 VPlanPtr buildVPlan(VFRange &Range);
350
351  /// Build a VPlan using VPRecipes according to the information gathered by
352 /// Legal. This method is only used for the legacy inner loop vectorizer.
353 VPlanPtr buildVPlanWithVPRecipes(
354 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
355 const MapVector<Instruction *, Instruction *> &SinkAfter);
356
357 /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
358 /// according to the information gathered by Legal when it checked if it is
359 /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
360 void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
361
362  // Adjust the recipes for reductions. For in-loop reductions, the chain of
363  // instructions leading from the loop exit instr to the phi needs to be
364 // converted to reductions, with one operand being vector and the other being
365 // the scalar reduction chain. For other reductions, a select is introduced
366 // between the phi and live-out recipes when folding the tail.
367 void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan,
368 VPRecipeBuilder &RecipeBuilder,
369 ElementCount MinVF);
370};
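A rough sketch of the driver flow this interface suggests, based only on the declarations above (LVP, LB, DT, UserVF, UserIC and IC are assumed to be provided by the surrounding vectorizer):

  std::optional<VectorizationFactor> VF = LVP.plan(UserVF, UserIC);
  if (VF && !LVP.requiresTooManyRuntimeChecks()) {
    VPlan &BestPlan = LVP.getBestPlanFor(VF->Width);
    LVP.executePlan(VF->Width, /*UF=*/IC, BestPlan, LB, DT,
                    /*IsEpilogueVectorization=*/false);
  }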
371
372} // namespace llvm
373
374#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H