Bug Summary

File: build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8973, column 3
Use of memory after it is freed
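
Note: the flagged statement (line 8973, column 3) lies outside the portion of the source reproduced below. For context, a minimal hypothetical sketch of the kind of pattern the "Use of memory after it is freed" checker reports is shown here; it is illustrative only and is not the code at LoopVectorize.cpp:8973:

    // Hypothetical minimal example of a use-after-free the analyzer would flag;
    // the type and values are made up and unrelated to LoopVectorize.cpp:8973.
    struct PlanInfo { int VF; };

    int useAfterFree() {
      PlanInfo *P = new PlanInfo{4};
      delete P;       // memory released here
      return P->VF;   // analyzer: use of memory after it is freed
    }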

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize -I include -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-10-03-140002-15933-1 -x c++ /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
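
(The line above is the internal clang -cc1 invocation recorded by the analyzer. As a rough, hedged equivalent for reproducing such a report locally, the build is typically wrapped with scan-build, for example: scan-build -o /tmp/lv-reports ninja LLVMVectorize, run from the build directory; the output directory and build target named here are illustrative assumptions, not taken from this report.)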

/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/None.h"
69#include "llvm/ADT/Optional.h"
70#include "llvm/ADT/STLExtras.h"
71#include "llvm/ADT/SmallPtrSet.h"
72#include "llvm/ADT/SmallSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/OptimizationRemarkEmitter.h"
90#include "llvm/Analysis/ProfileSummaryInfo.h"
91#include "llvm/Analysis/ScalarEvolution.h"
92#include "llvm/Analysis/ScalarEvolutionExpressions.h"
93#include "llvm/Analysis/TargetLibraryInfo.h"
94#include "llvm/Analysis/TargetTransformInfo.h"
95#include "llvm/Analysis/ValueTracking.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/Metadata.h"
116#include "llvm/IR/Module.h"
117#include "llvm/IR/Operator.h"
118#include "llvm/IR/PatternMatch.h"
119#include "llvm/IR/Type.h"
120#include "llvm/IR/Use.h"
121#include "llvm/IR/User.h"
122#include "llvm/IR/Value.h"
123#include "llvm/IR/ValueHandle.h"
124#include "llvm/IR/Verifier.h"
125#include "llvm/InitializePasses.h"
126#include "llvm/Pass.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/InstructionCost.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cmath>
146#include <cstdint>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <map>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
178static cl::opt<bool> EnableEpilogueVectorization(
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
182static cl::opt<unsigned> EpilogueVectorizationForceVF(
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
188static cl::opt<unsigned> EpilogueVectorizationMinVF(
189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
195static cl::opt<unsigned> TinyTripCountVectorThreshold(
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
201static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks"));
204
205// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206// that predication is preferred, and this lists all options. I.e., the
207// vectorizer will try to fold the tail-loop (epilogue) into the vector body
208// and predicate the instructions accordingly. If tail-folding fails, there are
209// different fallback strategies depending on these values:
210namespace PreferPredicateTy {
211 enum Option {
212 ScalarEpilogue = 0,
213 PredicateElseScalarEpilogue,
214 PredicateOrDontVectorize
215 };
216} // namespace PreferPredicateTy
217
218static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219 "prefer-predicate-over-epilogue",
220 cl::init(PreferPredicateTy::ScalarEpilogue),
221 cl::Hidden,
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225 "scalar-epilogue",
226 "Don't tail-predicate loops, create scalar epilogue"),
227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228 "predicate-else-scalar-epilogue",
229 "prefer tail-folding, create scalar epilogue if tail "
230 "folding fails."),
231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232 "predicate-dont-vectorize",
233 "prefers tail-folding, don't attempt vectorization if "
234 "tail-folding fails.")));
235
236static cl::opt<bool> MaximizeBandwidth(
237 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
238 cl::desc("Maximize bandwidth when selecting vectorization factor which "
239 "will be determined by the smallest type in loop."));
240
241static cl::opt<bool> EnableInterleavedMemAccesses(
242 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
243 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
244
245/// An interleave-group may need masking if it resides in a block that needs
246/// predication, or in order to mask away gaps.
247static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
248 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
249 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
250
251static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
252 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
253 cl::desc("We don't interleave loops with an estimated constant trip count "
254 "below this number"));
255
256static cl::opt<unsigned> ForceTargetNumScalarRegs(
257 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
258 cl::desc("A flag that overrides the target's number of scalar registers."));
259
260static cl::opt<unsigned> ForceTargetNumVectorRegs(
261 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
262 cl::desc("A flag that overrides the target's number of vector registers."));
263
264static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
265 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
266 cl::desc("A flag that overrides the target's max interleave factor for "
267 "scalar loops."));
268
269static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
270 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's max interleave factor for "
272 "vectorized loops."));
273
274static cl::opt<unsigned> ForceTargetInstructionCost(
275 "force-target-instruction-cost", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's expected cost for "
277 "an instruction to a single constant value. Mostly "
278 "useful for getting consistent testing."));
279
280static cl::opt<bool> ForceTargetSupportsScalableVectors(
281 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
282 cl::desc(
283 "Pretend that scalable vectors are supported, even if the target does "
284 "not support them. This flag should only be used for testing."));
285
286static cl::opt<unsigned> SmallLoopCost(
287 "small-loop-cost", cl::init(20), cl::Hidden,
288 cl::desc(
289 "The cost of a loop that is considered 'small' by the interleaver."));
290
291static cl::opt<bool> LoopVectorizeWithBlockFrequency(
292 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
293 cl::desc("Enable the use of the block frequency analysis to access PGO "
294 "heuristics minimizing code growth in cold regions and being more "
295 "aggressive in hot regions."));
296
297// Runtime interleave loops for load/store throughput.
298static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
299 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
300 cl::desc(
301 "Enable runtime interleaving until load/store ports are saturated"));
302
303/// Interleave small loops with scalar reductions.
304static cl::opt<bool> InterleaveSmallLoopScalarReduction(
305 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
306 cl::desc("Enable interleaving for loops with small iteration counts that "
307 "contain scalar reductions to expose ILP."));
308
309/// The number of stores in a loop that are allowed to need predication.
310static cl::opt<unsigned> NumberOfStoresToPredicate(
311 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
312 cl::desc("Max number of stores to be predicated behind an if."));
313
314static cl::opt<bool> EnableIndVarRegisterHeur(
315 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
316 cl::desc("Count the induction variable only once when interleaving"));
317
318static cl::opt<bool> EnableCondStoresVectorization(
319 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
320 cl::desc("Enable if predication of stores during vectorization."));
321
322static cl::opt<unsigned> MaxNestedScalarReductionIC(
323 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
324 cl::desc("The maximum interleave count to use when interleaving a scalar "
325 "reduction in a nested loop."));
326
327static cl::opt<bool>
328 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
329 cl::Hidden,
330 cl::desc("Prefer in-loop vector reductions, "
331 "overriding the target's preference."));
332
333static cl::opt<bool> ForceOrderedReductions(
334 "force-ordered-reductions", cl::init(false), cl::Hidden,
335 cl::desc("Enable the vectorisation of loops with in-order (strict) "
336 "FP reductions"));
337
338static cl::opt<bool> PreferPredicatedReductionSelect(
339 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
340 cl::desc(
341 "Prefer predicating a reduction operation over an after loop select."));
342
343cl::opt<bool> EnableVPlanNativePath(
344 "enable-vplan-native-path", cl::init(false), cl::Hidden,
345 cl::desc("Enable VPlan-native vectorization path with "
346 "support for outer loop vectorization."));
347
348// This flag enables the stress testing of the VPlan H-CFG construction in the
349// VPlan-native vectorization path. It must be used in conjunction with
350// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
351// verification of the H-CFGs built.
352static cl::opt<bool> VPlanBuildStressTest(
353 "vplan-build-stress-test", cl::init(false), cl::Hidden,
354 cl::desc(
355 "Build VPlan for every supported loop nest in the function and bail "
356 "out right after the build (stress test the VPlan H-CFG construction "
357 "in the VPlan-native vectorization path)."));
358
359cl::opt<bool> llvm::EnableLoopInterleaving(
360 "interleave-loops", cl::init(true), cl::Hidden,
361 cl::desc("Enable loop interleaving in Loop vectorization passes"));
362cl::opt<bool> llvm::EnableLoopVectorization(
363 "vectorize-loops", cl::init(true), cl::Hidden,
364 cl::desc("Run the Loop vectorization passes"));
365
366cl::opt<bool> PrintVPlansInDotFormat(
367 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
368 cl::desc("Use dot format instead of plain text when dumping VPlans"));
369
370cl::opt<cl::boolOrDefault> ForceSafeDivisor(
371 "force-widen-divrem-via-safe-divisor", cl::Hidden,
372 cl::desc("Override cost based safe divisor widening for div/rem instructions"));
373
374/// A helper function that returns true if the given type is irregular. The
375/// type is irregular if its allocated size doesn't equal the store size of an
376/// element of the corresponding vector type.
377static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
378 // Determine if an array of N elements of type Ty is "bitcast compatible"
379 // with a <N x Ty> vector.
380 // This is only true if there is no padding between the array elements.
381 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382}
383
384/// A helper function that returns the reciprocal of the block probability of
385/// predicated blocks. If we return X, we are assuming the predicated block
386/// will execute once for every X iterations of the loop header.
387///
388/// TODO: We should use actual block probability here, if available. Currently,
389/// we always assume predicated blocks have a 50% chance of executing.
390static unsigned getReciprocalPredBlockProb() { return 2; }
391
392/// A helper function that returns an integer or floating-point constant with
393/// value C.
394static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
395 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
396 : ConstantFP::get(Ty, C);
397}
398
399/// Returns "best known" trip count for the specified loop \p L as defined by
400/// the following procedure:
401/// 1) Returns exact trip count if it is known.
402/// 2) Returns expected trip count according to profile data if any.
403/// 3) Returns upper bound estimate if it is known.
404/// 4) Returns None if all of the above failed.
405static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return None;
420}
421
422// Forward declare GeneratedRTChecks.
423class GeneratedRTChecks;
424
425namespace llvm {
426
427AnalysisKey ShouldRunExtraVectorPasses::Key;
428
429/// InnerLoopVectorizer vectorizes loops which contain only one basic
430/// block to a specified vectorization factor (VF).
431/// This class performs the widening of scalars into vectors, or multiple
432/// scalars. This class also implements the following features:
433/// * It inserts an epilogue loop for handling loops that don't have iteration
434/// counts that are known to be a multiple of the vectorization factor.
435/// * It handles the code generation for reduction variables.
436/// * Scalarization (implementation using scalars) of un-vectorizable
437/// instructions.
438/// InnerLoopVectorizer does not perform any vectorization-legality
439/// checks, and relies on the caller to check for the different legality
440/// aspects. The InnerLoopVectorizer relies on the
441/// LoopVectorizationLegality class to provide information about the induction
442/// and reduction variables that were found to a given vectorization factor.
443class InnerLoopVectorizer {
444public:
445 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
446 LoopInfo *LI, DominatorTree *DT,
447 const TargetLibraryInfo *TLI,
448 const TargetTransformInfo *TTI, AssumptionCache *AC,
449 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
450 ElementCount MinProfitableTripCount,
451 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
452 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
453 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
454 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
455 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
456 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
457 PSI(PSI), RTChecks(RTChecks) {
458 // Query this against the original loop and save it here because the profile
459 // of the original loop header may change as the transformation happens.
460 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
461 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
462
463 if (MinProfitableTripCount.isZero())
464 this->MinProfitableTripCount = VecWidth;
465 else
466 this->MinProfitableTripCount = MinProfitableTripCount;
467 }
468
469 virtual ~InnerLoopVectorizer() = default;
470
471 /// Create a new empty loop that will contain vectorized instructions later
472 /// on, while the old loop will be used as the scalar remainder. Control flow
473 /// is generated around the vectorized (and scalar epilogue) loops consisting
474 /// of various checks and bypasses. Return the pre-header block of the new
475 /// loop and the start value for the canonical induction, if it is != 0. The
476 /// latter is the case when vectorizing the epilogue loop. In the case of
477 /// epilogue vectorization, this function is overridden to handle the more
478 /// complex control flow around the loops.
479 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
480
481 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
482 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
483
484 // Return true if any runtime check is added.
485 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486
487 /// A type for vectorized values in the new loop. Each value from the
488 /// original loop, when vectorized, is represented by UF vector values in the
489 /// new unrolled loop, where UF is the unroll factor.
490 using VectorParts = SmallVector<Value *, 2>;
491
492 /// A helper function to scalarize a single Instruction in the innermost loop.
493 /// Generates a sequence of scalar instances for each lane between \p MinLane
494 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
495 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
496 /// Instr's operands.
497 void scalarizeInstruction(const Instruction *Instr,
498 VPReplicateRecipe *RepRecipe,
499 const VPIteration &Instance, bool IfPredicateInstr,
500 VPTransformState &State);
501
502 /// Construct the vector value of a scalarized value \p V one lane at a time.
503 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
504 VPTransformState &State);
505
506 /// Try to vectorize interleaved access group \p Group with the base address
507 /// given in \p Addr, optionally masking the vector operations if \p
508 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
509 /// values in the vectorized loop.
510 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
511 ArrayRef<VPValue *> VPDefs,
512 VPTransformState &State, VPValue *Addr,
513 ArrayRef<VPValue *> StoredValues,
514 VPValue *BlockInMask = nullptr);
515
516 /// Fix the non-induction PHIs in \p Plan.
517 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
518
519 /// Returns true if the reordering of FP operations is not allowed, but we are
520 /// able to vectorize with strict in-order reductions for the given RdxDesc.
521 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
522
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
529
530 // Returns the resume value (bc.merge.rdx) for a reduction as
531 // generated by fixReduction.
532 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
533
534 /// Create a new phi node for the induction variable \p OrigPhi to resume
535 /// iteration count in the scalar epilogue, from where the vectorized loop
536 /// left off. In cases where the loop skeleton is more complicated (eg.
537 /// epilogue vectorization) and the resume values can come from an additional
538 /// bypass block, the \p AdditionalBypass pair provides information about the
539 /// bypass block and the end value on the edge from bypass to this loop.
540 PHINode *createInductionResumeValue(
541 PHINode *OrigPhi, const InductionDescriptor &ID,
542 ArrayRef<BasicBlock *> BypassBlocks,
543 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
544
545protected:
546 friend class LoopVectorizationPlanner;
547
548 /// A small list of PHINodes.
549 using PhiVector = SmallVector<PHINode *, 4>;
550
551 /// A type for scalarized values in the new loop. Each value from the
552 /// original loop, when scalarized, is represented by UF x VF scalar values
553 /// in the new unrolled loop, where UF is the unroll factor and VF is the
554 /// vectorization factor.
555 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
556
557 /// Set up the values of the IVs correctly when exiting the vector loop.
558 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
559 Value *VectorTripCount, Value *EndValue,
560 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
561 VPlan &Plan);
562
563 /// Handle all cross-iteration phis in the header.
564 void fixCrossIterationPHIs(VPTransformState &State);
565
566 /// Create the exit value of first order recurrences in the middle block and
567 /// update their users.
568 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
569 VPTransformState &State);
570
571 /// Create code for the loop exit value of the reduction.
572 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
573
574 /// Clear NSW/NUW flags from reduction instructions if necessary.
575 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
576 VPTransformState &State);
577
578 /// Iteratively sink the scalarized operands of a predicated instruction into
579 /// the block that was created for it.
580 void sinkScalarOperands(Instruction *PredInst);
581
582 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
583 /// represented as.
584 void truncateToMinimalBitwidths(VPTransformState &State);
585
586 /// Returns (and creates if needed) the original loop trip count.
587 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
588
589 /// Returns (and creates if needed) the trip count of the widened loop.
590 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
591
592 /// Returns a bitcasted value to the requested vector type.
593 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
594 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
595 const DataLayout &DL);
596
597 /// Emit a bypass check to see if the vector trip count is zero, including if
598 /// it overflows.
599 void emitIterationCountCheck(BasicBlock *Bypass);
600
601 /// Emit a bypass check to see if all of the SCEV assumptions we've
602 /// had to make are correct. Returns the block containing the checks or
603 /// nullptr if no checks have been added.
604 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
605
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 /// Returns the block containing the checks or nullptr if no checks have been
608 /// added.
609 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
610
611 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
612 /// vector loop preheader, middle block and scalar preheader.
613 void createVectorLoopSkeleton(StringRef Prefix);
614
615 /// Create new phi nodes for the induction variables to resume iteration count
616 /// in the scalar epilogue, from where the vectorized loop left off.
617 /// In cases where the loop skeleton is more complicated (eg. epilogue
618 /// vectorization) and the resume values can come from an additional bypass
619 /// block, the \p AdditionalBypass pair provides information about the bypass
620 /// block and the end value on the edge from bypass to this loop.
621 void createInductionResumeValues(
622 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
623
624 /// Complete the loop skeleton by adding debug MDs, creating appropriate
625 /// conditional branches in the middle block, preparing the builder and
626 /// running the verifier. Return the preheader of the completed vector loop.
627 BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
628
629 /// Collect poison-generating recipes that may generate a poison value that is
630 /// used after vectorization, even when their operands are not poison. Those
631 /// recipes meet the following conditions:
632 /// * Contribute to the address computation of a recipe generating a widen
633 /// memory load/store (VPWidenMemoryInstructionRecipe or
634 /// VPInterleaveRecipe).
635 /// * Such a widen memory load/store has at least one underlying Instruction
636 /// that is in a basic block that needs predication and after vectorization
637 /// the generated instruction won't be predicated.
638 void collectPoisonGeneratingRecipes(VPTransformState &State);
639
640 /// Allow subclasses to override and print debug traces before/after vplan
641 /// execution, when trace information is requested.
642 virtual void printDebugTracesAtStart(){};
643 virtual void printDebugTracesAtEnd(){};
644
645 /// The original loop.
646 Loop *OrigLoop;
647
648 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
649 /// dynamic knowledge to simplify SCEV expressions and converts them to a
650 /// more usable form.
651 PredicatedScalarEvolution &PSE;
652
653 /// Loop Info.
654 LoopInfo *LI;
655
656 /// Dominator Tree.
657 DominatorTree *DT;
658
659 /// Alias Analysis.
660 AAResults *AA;
661
662 /// Target Library Info.
663 const TargetLibraryInfo *TLI;
664
665 /// Target Transform Info.
666 const TargetTransformInfo *TTI;
667
668 /// Assumption Cache.
669 AssumptionCache *AC;
670
671 /// Interface to emit optimization remarks.
672 OptimizationRemarkEmitter *ORE;
673
674 /// The vectorization SIMD factor to use. Each vector will have this many
675 /// vector elements.
676 ElementCount VF;
677
678 ElementCount MinProfitableTripCount;
679
680 /// The vectorization unroll factor to use. Each scalar is vectorized to this
681 /// many different vector instructions.
682 unsigned UF;
683
684 /// The builder that we use
685 IRBuilder<> Builder;
686
687 // --- Vectorization state ---
688
689 /// The vector-loop preheader.
690 BasicBlock *LoopVectorPreHeader;
691
692 /// The scalar-loop preheader.
693 BasicBlock *LoopScalarPreHeader;
694
695 /// Middle Block between the vector and the scalar.
696 BasicBlock *LoopMiddleBlock;
697
698 /// The unique ExitBlock of the scalar loop if one exists. Note that
699 /// there can be multiple exiting edges reaching this block.
700 BasicBlock *LoopExitBlock;
701
702 /// The scalar loop body.
703 BasicBlock *LoopScalarBody;
704
705 /// A list of all bypass blocks. The first block is the entry of the loop.
706 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
707
708 /// Store instructions that were predicated.
709 SmallVector<Instruction *, 4> PredicatedInstructions;
710
711 /// Trip count of the original loop.
712 Value *TripCount = nullptr;
713
714 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
715 Value *VectorTripCount = nullptr;
716
717 /// The legality analysis.
718 LoopVectorizationLegality *Legal;
719
720 /// The profitability analysis.
721 LoopVectorizationCostModel *Cost;
722
723 // Record whether runtime checks are added.
724 bool AddedSafetyChecks = false;
725
726 // Holds the end values for each induction variable. We save the end values
727 // so we can later fix-up the external users of the induction variables.
728 DenseMap<PHINode *, Value *> IVEndValues;
729
730 /// BFI and PSI are used to check for profile guided size optimizations.
731 BlockFrequencyInfo *BFI;
732 ProfileSummaryInfo *PSI;
733
734 // Whether this loop should be optimized for size based on profile guided size
735 // optimizations.
736 bool OptForSizeBasedOnProfile;
737
738 /// Structure to hold information about generated runtime checks, responsible
739 /// for cleaning the checks, if vectorization turns out unprofitable.
740 GeneratedRTChecks &RTChecks;
741
742 // Holds the resume values for reductions in the loops, used to set the
743 // correct start value of reduction PHIs when vectorizing the epilogue.
744 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
745 ReductionResumeValues;
746};
747
748class InnerLoopUnroller : public InnerLoopVectorizer {
749public:
750 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
751 LoopInfo *LI, DominatorTree *DT,
752 const TargetLibraryInfo *TLI,
753 const TargetTransformInfo *TTI, AssumptionCache *AC,
754 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
755 LoopVectorizationLegality *LVL,
756 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
757 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
758 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
759 ElementCount::getFixed(1),
760 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
761 BFI, PSI, Check) {}
762
763private:
764 Value *getBroadcastInstrs(Value *V) override;
765};
766
767/// Encapsulate information regarding vectorization of a loop and its epilogue.
768/// This information is meant to be updated and used across two stages of
769/// epilogue vectorization.
770struct EpilogueLoopVectorizationInfo {
771 ElementCount MainLoopVF = ElementCount::getFixed(0);
772 unsigned MainLoopUF = 0;
773 ElementCount EpilogueVF = ElementCount::getFixed(0);
774 unsigned EpilogueUF = 0;
775 BasicBlock *MainLoopIterationCountCheck = nullptr;
776 BasicBlock *EpilogueIterationCountCheck = nullptr;
777 BasicBlock *SCEVSafetyCheck = nullptr;
778 BasicBlock *MemSafetyCheck = nullptr;
779 Value *TripCount = nullptr;
780 Value *VectorTripCount = nullptr;
781
782 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
783 ElementCount EVF, unsigned EUF)
784 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
785 assert(EUF == 1 &&
786 "A high UF for the epilogue loop is likely not beneficial.");
787 }
788};
789
790/// An extension of the inner loop vectorizer that creates a skeleton for a
791/// vectorized loop that has its epilogue (residual) also vectorized.
792/// The idea is to run the vplan on a given loop twice, firstly to set up the
793/// skeleton and vectorize the main loop, and secondly to complete the skeleton
794/// from the first step and vectorize the epilogue. This is achieved by
795/// deriving two concrete strategy classes from this base class and invoking
796/// them in succession from the loop vectorizer planner.
797class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
798public:
799 InnerLoopAndEpilogueVectorizer(
800 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801 DominatorTree *DT, const TargetLibraryInfo *TLI,
802 const TargetTransformInfo *TTI, AssumptionCache *AC,
803 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806 GeneratedRTChecks &Checks)
807 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
809 CM, BFI, PSI, Checks),
810 EPI(EPI) {}
811
812 // Override this function to handle the more complex control flow around the
813 // three loops.
814 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
815 return createEpilogueVectorizedLoopSkeleton();
816 }
817
818 /// The interface for creating a vectorized skeleton using one of two
819 /// different strategies, each corresponding to one execution of the vplan
820 /// as described above.
821 virtual std::pair<BasicBlock *, Value *>
822 createEpilogueVectorizedLoopSkeleton() = 0;
823
824 /// Holds and updates state information required to vectorize the main loop
825 /// and its epilogue in two separate passes. This setup helps us avoid
826 /// regenerating and recomputing runtime safety checks. It also helps us to
827 /// shorten the iteration-count-check path length for the cases where the
828 /// iteration count of the loop is so small that the main vector loop is
829 /// completely skipped.
830 EpilogueLoopVectorizationInfo &EPI;
831};
832
833/// A specialized derived class of inner loop vectorizer that performs
834/// vectorization of *main* loops in the process of vectorizing loops and their
835/// epilogues.
836class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
837public:
838 EpilogueVectorizerMainLoop(
839 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
840 DominatorTree *DT, const TargetLibraryInfo *TLI,
841 const TargetTransformInfo *TTI, AssumptionCache *AC,
842 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
843 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
844 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
845 GeneratedRTChecks &Check)
846 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
847 EPI, LVL, CM, BFI, PSI, Check) {}
848 /// Implements the interface for creating a vectorized skeleton using the
849 /// *main loop* strategy (ie the first pass of vplan execution).
850 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
851
852protected:
853 /// Emits an iteration count bypass check once for the main loop (when \p
854 /// ForEpilogue is false) and once for the epilogue loop (when \p
855 /// ForEpilogue is true).
856 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
857 void printDebugTracesAtStart() override;
858 void printDebugTracesAtEnd() override;
859};
860
861// A specialized derived class of inner loop vectorizer that performs
862// vectorization of *epilogue* loops in the process of vectorizing loops and
863// their epilogues.
864class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
865public:
866 EpilogueVectorizerEpilogueLoop(
867 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
868 DominatorTree *DT, const TargetLibraryInfo *TLI,
869 const TargetTransformInfo *TTI, AssumptionCache *AC,
870 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
871 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
872 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
873 GeneratedRTChecks &Checks)
874 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
875 EPI, LVL, CM, BFI, PSI, Checks) {
876 TripCount = EPI.TripCount;
877 }
878 /// Implements the interface for creating a vectorized skeleton using the
879 /// *epilogue loop* strategy (ie the second pass of vplan execution).
880 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
881
882protected:
883 /// Emits an iteration count bypass check after the main vector loop has
884 /// finished to see if there are any iterations left to execute by either
885 /// the vector epilogue or the scalar epilogue.
886 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
887 BasicBlock *Bypass,
888 BasicBlock *Insert);
889 void printDebugTracesAtStart() override;
890 void printDebugTracesAtEnd() override;
891};
892} // end namespace llvm
893
894/// Look for a meaningful debug location on the instruction or its
895/// operands.
896static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
897 if (!I)
898 return I;
899
900 DebugLoc Empty;
901 if (I->getDebugLoc() != Empty)
902 return I;
903
904 for (Use &Op : I->operands()) {
905 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
906 if (OpInst->getDebugLoc() != Empty)
907 return OpInst;
908 }
909
910 return I;
911}
912
913/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
914/// is passed, the message relates to that particular instruction.
915#ifndef NDEBUG
916static void debugVectorizationMessage(const StringRef Prefix,
917 const StringRef DebugMsg,
918 Instruction *I) {
919 dbgs() << "LV: " << Prefix << DebugMsg;
920 if (I != nullptr)
921 dbgs() << " " << *I;
922 else
923 dbgs() << '.';
924 dbgs() << '\n';
925}
926#endif
927
928/// Create an analysis remark that explains why vectorization failed
929///
930/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
931/// RemarkName is the identifier for the remark. If \p I is passed it is an
932/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
933/// the location of the remark. \return the remark object that can be
934/// streamed to.
935static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
936 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
937 Value *CodeRegion = TheLoop->getHeader();
938 DebugLoc DL = TheLoop->getStartLoc();
939
940 if (I) {
941 CodeRegion = I->getParent();
942 // If there is no debug location attached to the instruction, revert back to
943 // using the loop's.
944 if (I->getDebugLoc())
945 DL = I->getDebugLoc();
946 }
947
948 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
949}
950
951namespace llvm {
952
953/// Return a value for Step multiplied by VF.
954Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
955 int64_t Step) {
956 assert(Ty->isIntegerTy() && "Expected an integer step");
957 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
958 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
959}
960
961/// Return the runtime value for VF.
962Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
963 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
964 return VF.isScalable() ? B.CreateVScale(EC) : EC;
965}
966
967static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
968 ElementCount VF) {
969 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
970 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
971 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
972 return B.CreateUIToFP(RuntimeVF, FTy);
973}
974
975void reportVectorizationFailure(const StringRef DebugMsg,
976 const StringRef OREMsg, const StringRef ORETag,
977 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
978 Instruction *I) {
979 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
980 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
981 ORE->emit(
982 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
983 << "loop not vectorized: " << OREMsg);
984}
985
986void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
987 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
988 Instruction *I) {
989 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
990 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
991 ORE->emit(
992 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
993 << Msg);
994}
995
996} // end namespace llvm
997
998#ifndef NDEBUG
999/// \return string containing a file name and a line # for the given loop.
1000static std::string getDebugLocString(const Loop *L) {
1001 std::string Result;
1002 if (L) {
1003 raw_string_ostream OS(Result);
1004 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1005 LoopDbgLoc.print(OS);
1006 else
1007 // Just print the module name.
1008 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1009 OS.flush();
1010 }
1011 return Result;
1012}
1013#endif
1014
1015void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1016 VPTransformState &State) {
1017
1018 // Collect recipes in the backward slice of `Root` that may generate a poison
1019 // value that is used after vectorization.
1020 SmallPtrSet<VPRecipeBase *, 16> Visited;
1021 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1022 SmallVector<VPRecipeBase *, 16> Worklist;
1023 Worklist.push_back(Root);
1024
1025 // Traverse the backward slice of Root through its use-def chain.
1026 while (!Worklist.empty()) {
1027 VPRecipeBase *CurRec = Worklist.back();
1028 Worklist.pop_back();
1029
1030 if (!Visited.insert(CurRec).second)
1031 continue;
1032
1033 // Prune search if we find another recipe generating a widen memory
1034 // instruction. Widen memory instructions involved in address computation
1035 // will lead to gather/scatter instructions, which don't need to be
1036 // handled.
1037 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1038 isa<VPInterleaveRecipe>(CurRec) ||
1039 isa<VPScalarIVStepsRecipe>(CurRec) ||
1040 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1041 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1042 continue;
1043
1044 // This recipe contributes to the address computation of a widen
1045 // load/store. Collect recipe if its underlying instruction has
1046 // poison-generating flags.
1047 Instruction *Instr = CurRec->getUnderlyingInstr();
1048 if (Instr && Instr->hasPoisonGeneratingFlags())
1049 State.MayGeneratePoisonRecipes.insert(CurRec);
1050
1051 // Add new definitions to the worklist.
1052 for (VPValue *operand : CurRec->operands())
1053 if (VPDef *OpDef = operand->getDef())
1054 Worklist.push_back(cast<VPRecipeBase>(OpDef));
1055 }
1056 });
1057
1058 // Traverse all the recipes in the VPlan and collect the poison-generating
1059 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1060 // VPInterleaveRecipe.
1061 auto Iter = depth_first(
1062 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1063 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1064 for (VPRecipeBase &Recipe : *VPBB) {
1065 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1066 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1067 VPDef *AddrDef = WidenRec->getAddr()->getDef();
1068 if (AddrDef && WidenRec->isConsecutive() &&
1069 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1070 collectPoisonGeneratingInstrsInBackwardSlice(
1071 cast<VPRecipeBase>(AddrDef));
1072 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1073 VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1074 if (AddrDef) {
1075 // Check if any member of the interleave group needs predication.
1076 const InterleaveGroup<Instruction> *InterGroup =
1077 InterleaveRec->getInterleaveGroup();
1078 bool NeedPredication = false;
1079 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1080 I < NumMembers; ++I) {
1081 Instruction *Member = InterGroup->getMember(I);
1082 if (Member)
1083 NeedPredication |=
1084 Legal->blockNeedsPredication(Member->getParent());
1085 }
1086
1087 if (NeedPredication)
1088 collectPoisonGeneratingInstrsInBackwardSlice(
1089 cast<VPRecipeBase>(AddrDef));
1090 }
1091 }
1092 }
1093 }
1094}
1095
1096PHINode *InnerLoopVectorizer::getReductionResumeValue(
1097 const RecurrenceDescriptor &RdxDesc) {
1098 auto It = ReductionResumeValues.find(&RdxDesc);
1099 assert(It != ReductionResumeValues.end() &&
1100 "Expected to find a resume value for the reduction.");
1101 return It->second;
1102}
1103
1104namespace llvm {
1105
1106// Loop vectorization cost-model hints how the scalar epilogue loop should be
1107// lowered.
1108enum ScalarEpilogueLowering {
1109
1110 // The default: allowing scalar epilogues.
1111 CM_ScalarEpilogueAllowed,
1112
1113 // Vectorization with OptForSize: don't allow epilogues.
1114 CM_ScalarEpilogueNotAllowedOptSize,
1115
1116 // A special case of vectorisation with OptForSize: loops with a very small
1117 // trip count are considered for vectorization under OptForSize, thereby
1118 // making sure the cost of their loop body is dominant, free of runtime
1119 // guards and scalar iteration overheads.
1120 CM_ScalarEpilogueNotAllowedLowTripLoop,
1121
1122 // Loop hint predicate indicating an epilogue is undesired.
1123 CM_ScalarEpilogueNotNeededUsePredicate,
1124
1125 // Directive indicating we must either tail fold or not vectorize
1126 CM_ScalarEpilogueNotAllowedUsePredicate
1127};
1128
1129/// ElementCountComparator creates a total ordering for ElementCount
1130/// for the purposes of using it in a set structure.
1131struct ElementCountComparator {
1132 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1133 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1134 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1135 }
1136};
1137using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
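For illustration (an editor's sketch, not part of the listing), the total order defined by ElementCountComparator puts fixed-width factors before scalable ones, with the known minimum lane count breaking ties within each group:

  ElementCountComparator Less;
  Less(ElementCount::getFixed(2), ElementCount::getFixed(8));    // true: 2 orders before 8
  Less(ElementCount::getFixed(8), ElementCount::getScalable(2)); // true: fixed before scalable
  Less(ElementCount::getScalable(4), ElementCount::getFixed(8)); // false: scalable after fixed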
1138
1139/// LoopVectorizationCostModel - estimates the expected speedups due to
1140/// vectorization.
1141/// In many cases vectorization is not profitable. This can happen because of
1142/// a number of reasons. In this class we mainly attempt to predict the
1143/// expected speedup/slowdowns due to the supported instruction set. We use the
1144/// TargetTransformInfo to query the different backends for the cost of
1145/// different operations.
1146class LoopVectorizationCostModel {
1147public:
1148 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1149 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1150 LoopVectorizationLegality *Legal,
1151 const TargetTransformInfo &TTI,
1152 const TargetLibraryInfo *TLI, DemandedBits *DB,
1153 AssumptionCache *AC,
1154 OptimizationRemarkEmitter *ORE, const Function *F,
1155 const LoopVectorizeHints *Hints,
1156 InterleavedAccessInfo &IAI)
1157 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1158 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1159 Hints(Hints), InterleaveInfo(IAI) {}
1160
1161 /// \return An upper bound for the vectorization factors (both fixed and
1162 /// scalable). If the factors are 0, vectorization and interleaving should be
1163 /// avoided up front.
1164 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1165
1166 /// \return True if runtime checks are required for vectorization, and false
1167 /// otherwise.
1168 bool runtimeChecksRequired();
1169
1170 /// \return The most profitable vectorization factor and the cost of that VF.
1171 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1172 /// then this vectorization factor will be selected if vectorization is
1173 /// possible.
1174 VectorizationFactor
1175 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1176
1177 VectorizationFactor
1178 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1179 const LoopVectorizationPlanner &LVP);
1180
1181 /// Setup cost-based decisions for user vectorization factor.
1182 /// \return true if the UserVF is a feasible VF to be chosen.
1183 bool selectUserVectorizationFactor(ElementCount UserVF) {
1184 collectUniformsAndScalars(UserVF);
1185 collectInstsToScalarize(UserVF);
1186 return expectedCost(UserVF).first.isValid();
1187 }
1188
1189 /// \return The size (in bits) of the smallest and widest types in the code
1190 /// that needs to be vectorized. We ignore values that remain scalar such as
1191 /// 64 bit loop indices.
1192 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1193
1194 /// \return The desired interleave count.
1195 /// If interleave count has been specified by metadata it will be returned.
1196 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1197 /// are the selected vectorization factor and the cost of the selected VF.
1198 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1199
1200 /// Memory access instruction may be vectorized in more than one way.
1201 /// Form of instruction after vectorization depends on cost.
1202 /// This function takes cost-based decisions for Load/Store instructions
1203 /// and collects them in a map. This decisions map is used for building
1204 /// the lists of loop-uniform and loop-scalar instructions.
1205 /// The calculated cost is saved with widening decision in order to
1206 /// avoid redundant calculations.
1207 void setCostBasedWideningDecision(ElementCount VF);
1208
1209 /// A struct that represents some properties of the register usage
1210 /// of a loop.
1211 struct RegisterUsage {
1212 /// Holds the number of loop invariant values that are used in the loop.
1213 /// The key is ClassID of target-provided register class.
1214 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1215 /// Holds the maximum number of concurrent live intervals in the loop.
1216 /// The key is ClassID of target-provided register class.
1217 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1218 };
1219
1220 /// \return Returns information about the register usages of the loop for the
1221 /// given vectorization factors.
1222 SmallVector<RegisterUsage, 8>
1223 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1224
1225 /// Collect values we want to ignore in the cost model.
1226 void collectValuesToIgnore();
1227
1228 /// Collect all element types in the loop for which widening is needed.
1229 void collectElementTypesForWidening();
1230
1231 /// Split reductions into those that happen in the loop, and those that happen
1232 /// outside. In-loop reductions are collected into InLoopReductionChains.
1233 void collectInLoopReductions();
1234
1235 /// Returns true if we should use strict in-order reductions for the given
1236 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1237 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1238 /// of FP operations.
1239 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1240 return !Hints->allowReordering() && RdxDesc.isOrdered();
1241 }
1242
1243 /// \returns The smallest bitwidth each instruction can be represented with.
1244 /// The vector equivalents of these instructions should be truncated to this
1245 /// type.
1246 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1247 return MinBWs;
1248 }
1249
1250 /// \returns True if it is more profitable to scalarize instruction \p I for
1251 /// vectorization factor \p VF.
1252 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1253 assert(VF.isVector() &&
1254 "Profitable to scalarize relevant only for VF > 1.");
1255
1256 // Cost model is not run in the VPlan-native path - return conservative
1257 // result until this changes.
1258 if (EnableVPlanNativePath)
1259 return false;
1260
1261 auto Scalars = InstsToScalarize.find(VF);
1262 assert(Scalars != InstsToScalarize.end() &&
1263 "VF not yet analyzed for scalarization profitability");
1264 return Scalars->second.find(I) != Scalars->second.end();
1265 }
1266
1267 /// Returns true if \p I is known to be uniform after vectorization.
1268 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1269 if (VF.isScalar())
1270 return true;
1271
1272 // Cost model is not run in the VPlan-native path - return conservative
1273 // result until this changes.
1274 if (EnableVPlanNativePath)
1275 return false;
1276
1277 auto UniformsPerVF = Uniforms.find(VF);
1278 assert(UniformsPerVF != Uniforms.end() &&
1279 "VF not yet analyzed for uniformity");
1280 return UniformsPerVF->second.count(I);
1281 }
1282
1283 /// Returns true if \p I is known to be scalar after vectorization.
1284 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1285 if (VF.isScalar())
1286 return true;
1287
1288 // Cost model is not run in the VPlan-native path - return conservative
1289 // result until this changes.
1290 if (EnableVPlanNativePath)
1291 return false;
1292
1293 auto ScalarsPerVF = Scalars.find(VF);
1294 assert(ScalarsPerVF != Scalars.end() &&
1295 "Scalar values are not calculated for VF");
1296 return ScalarsPerVF->second.count(I);
1297 }
1298
1299 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1300 /// for vectorization factor \p VF.
1301 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1302 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1303 !isProfitableToScalarize(I, VF) &&
1304 !isScalarAfterVectorization(I, VF);
1305 }
1306
1307 /// Decision that was taken during cost calculation for memory instruction.
1308 enum InstWidening {
1309 CM_Unknown,
1310 CM_Widen, // For consecutive accesses with stride +1.
1311 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1312 CM_Interleave,
1313 CM_GatherScatter,
1314 CM_Scalarize
1315 };
1316
1317 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1318 /// instruction \p I and vector width \p VF.
1319 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1320 InstructionCost Cost) {
1321 assert(VF.isVector() && "Expected VF >=2");
1322 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1323 }
1324
1325 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1326 /// interleaving group \p Grp and vector width \p VF.
1327 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1328 ElementCount VF, InstWidening W,
1329 InstructionCost Cost) {
1330 assert(VF.isVector() && "Expected VF >=2");
1331 /// Broadcast this decision to all instructions inside the group.
1332 /// But the cost will be assigned to one instruction only.
1333 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1334 if (auto *I = Grp->getMember(i)) {
1335 if (Grp->getInsertPos() == I)
1336 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1337 else
1338 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1339 }
1340 }
1341 }
1342
1343 /// Return the cost model decision for the given instruction \p I and vector
1344 /// width \p VF. Return CM_Unknown if this instruction did not pass
1345 /// through the cost modeling.
1346 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1347 assert(VF.isVector() && "Expected VF to be a vector VF");
1348 // Cost model is not run in the VPlan-native path - return conservative
1349 // result until this changes.
1350 if (EnableVPlanNativePath)
1351 return CM_GatherScatter;
1352
1353 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1354 auto Itr = WideningDecisions.find(InstOnVF);
1355 if (Itr == WideningDecisions.end())
1356 return CM_Unknown;
1357 return Itr->second.first;
1358 }
1359
1360 /// Return the vectorization cost for the given instruction \p I and vector
1361 /// width \p VF.
1362 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1363 assert(VF.isVector() && "Expected VF >=2");
1364 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1365 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1366 "The cost is not calculated");
1367 return WideningDecisions[InstOnVF].second;
1368 }
1369
1370 /// Return True if instruction \p I is an optimizable truncate whose operand
1371 /// is an induction variable. Such a truncate will be removed by adding a new
1372 /// induction variable with the destination type.
1373 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1374 // If the instruction is not a truncate, return false.
1375 auto *Trunc = dyn_cast<TruncInst>(I);
1376 if (!Trunc)
1377 return false;
1378
1379 // Get the source and destination types of the truncate.
1380 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1381 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1382
1383 // If the truncate is free for the given types, return false. Replacing a
1384 // free truncate with an induction variable would add an induction variable
1385 // update instruction to each iteration of the loop. We exclude from this
1386 // check the primary induction variable since it will need an update
1387 // instruction regardless.
1388 Value *Op = Trunc->getOperand(0);
1389 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1390 return false;
1391
1392 // If the truncated value is not an induction variable, return false.
1393 return Legal->isInductionPhi(Op);
1394 }
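A hedged illustration of the pattern this helper targets (editor's example, not from the listing): a 64-bit primary induction truncated to 32 bits inside the loop, where the truncate can be replaced by a new 32-bit induction variable of the destination type instead of being widened:

  void scale(int32_t *A, int64_t N) {
    for (int64_t I = 0; I < N; ++I) {
      int32_t J = (int32_t)I; // trunc of the induction variable
      A[J] = 2 * J;           // only the truncated value is used
    }
  }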
1395
1396 /// Collects the instructions to scalarize for each predicated instruction in
1397 /// the loop.
1398 void collectInstsToScalarize(ElementCount VF);
1399
1400 /// Collect Uniform and Scalar values for the given \p VF.
1401 /// The sets depend on CM decision for Load/Store instructions
1402 /// that may be vectorized as interleave, gather-scatter or scalarized.
1403 void collectUniformsAndScalars(ElementCount VF) {
1404 // Do the analysis once.
1405 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1406 return;
1407 setCostBasedWideningDecision(VF);
1408 collectLoopUniforms(VF);
1409 collectLoopScalars(VF);
1410 }
1411
1412 /// Returns true if the target machine supports masked store operation
1413 /// for the given \p DataType and kind of access to \p Ptr.
1414 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1415 return Legal->isConsecutivePtr(DataType, Ptr) &&
1416 TTI.isLegalMaskedStore(DataType, Alignment);
1417 }
1418
1419 /// Returns true if the target machine supports masked load operation
1420 /// for the given \p DataType and kind of access to \p Ptr.
1421 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1422 return Legal->isConsecutivePtr(DataType, Ptr) &&
1423 TTI.isLegalMaskedLoad(DataType, Alignment);
1424 }
1425
1426 /// Returns true if the target machine can represent \p V as a masked gather
1427 /// or scatter operation.
1428 bool isLegalGatherOrScatter(Value *V,
1429 ElementCount VF = ElementCount::getFixed(1)) {
1430 bool LI = isa<LoadInst>(V);
1431 bool SI = isa<StoreInst>(V);
1432 if (!LI && !SI)
1433 return false;
1434 auto *Ty = getLoadStoreType(V);
1435 Align Align = getLoadStoreAlignment(V);
1436 if (VF.isVector())
1437 Ty = VectorType::get(Ty, VF);
1438 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1439 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1440 }
1441
1442 /// Returns true if the target machine supports all of the reduction
1443 /// variables found for the given VF.
1444 bool canVectorizeReductions(ElementCount VF) const {
1445 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1446 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1447 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1448 }));
1449 }
1450
1451 /// Given costs for both strategies, return true if the scalar predication
1452 /// lowering should be used for div/rem. This incorporates an override
1453 /// option so it is not simply a cost comparison.
1454 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1455 InstructionCost SafeDivisorCost) const {
1456 switch (ForceSafeDivisor) {
1457 case cl::BOU_UNSET:
1458 return ScalarCost < SafeDivisorCost;
1459 case cl::BOU_TRUE:
1460 return false;
1461 case cl::BOU_FALSE:
1462 return true;
1463 };
1464 llvm_unreachable("impossible case value")::llvm::llvm_unreachable_internal("impossible case value", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1464)
;
1465 }
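A brief worked example (editor's note): with the override unset, ScalarCost = 4 and SafeDivisorCost = 6 give 4 < 6, so the function returns true and the div/rem is scalarized with predication; forcing the option to true always selects the safe-divisor strategy (returns false), and forcing it to false always selects scalarization, regardless of the costs.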
1466
1467 /// Returns true if \p I is an instruction which requires predication and
1468 /// for which our chosen predication strategy is scalarization (i.e. we
1469 /// don't have an alternate strategy such as masking available).
1470 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1471 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1472
1473 /// Returns true if \p I is an instruction that needs to be predicated
1474 /// at runtime. The result is independent of the predication mechanism.
1475 /// Superset of instructions that return true for isScalarWithPredication.
1476 bool isPredicatedInst(Instruction *I) const;
1477
1478 /// Return the costs for our two available strategies for lowering a
1479 /// div/rem operation which requires speculating at least one lane.
1480 /// First result is for scalarization (will be invalid for scalable
1481 /// vectors); second is for the safe-divisor strategy.
1482 std::pair<InstructionCost, InstructionCost>
1483 getDivRemSpeculationCost(Instruction *I,
1484 ElementCount VF) const;
1485
1486 /// Returns true if \p I is a memory instruction with consecutive memory
1487 /// access that can be widened.
1488 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1489
1490 /// Returns true if \p I is a memory instruction in an interleaved-group
1491 /// of memory accesses that can be vectorized with wide vector loads/stores
1492 /// and shuffles.
1493 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1494
1495 /// Check if \p Instr belongs to any interleaved access group.
1496 bool isAccessInterleaved(Instruction *Instr) {
1497 return InterleaveInfo.isInterleaved(Instr);
1498 }
1499
1500 /// Get the interleaved access group that \p Instr belongs to.
1501 const InterleaveGroup<Instruction> *
1502 getInterleavedAccessGroup(Instruction *Instr) {
1503 return InterleaveInfo.getInterleaveGroup(Instr);
1504 }
1505
1506 /// Returns true if we're required to use a scalar epilogue for at least
1507 /// the final iteration of the original loop.
1508 bool requiresScalarEpilogue(ElementCount VF) const {
1509 if (!isScalarEpilogueAllowed())
1510 return false;
1511 // If we might exit from anywhere but the latch, must run the exiting
1512 // iteration in scalar form.
1513 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1514 return true;
1515 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1516 }
1517
1518 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1519 /// loop hint annotation.
1520 bool isScalarEpilogueAllowed() const {
1521 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1522 }
1523
1524 /// Returns true if all loop blocks should be masked to fold tail loop.
1525 bool foldTailByMasking() const { return FoldTailByMasking; }
1526
1527 /// Returns true if we're tail-folding and want to use the active lane mask
1528 /// for vector loop control flow.
1529 bool useActiveLaneMaskForControlFlow() const {
1530 return FoldTailByMasking &&
1531 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1532 }
1533
1534 /// Returns true if the instructions in this block requires predication
1535 /// for any reason, e.g. because tail folding now requires a predicate
1536 /// or because the block in the original loop was predicated.
1537 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1538 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1539 }
1540
1541 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1542 /// nodes to the chain of instructions representing the reductions. Uses a
1543 /// MapVector to ensure deterministic iteration order.
1544 using ReductionChainMap =
1545 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1546
1547 /// Return the chain of instructions representing an inloop reduction.
1548 const ReductionChainMap &getInLoopReductionChains() const {
1549 return InLoopReductionChains;
1550 }
1551
1552 /// Returns true if the Phi is part of an inloop reduction.
1553 bool isInLoopReduction(PHINode *Phi) const {
1554 return InLoopReductionChains.count(Phi);
1555 }
1556
1557 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1558 /// with factor VF. Return the cost of the instruction, including
1559 /// scalarization overhead if it's needed.
1560 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1561
1562 /// Estimate cost of a call instruction CI if it were vectorized with factor
1563 /// VF. Return the cost of the instruction, including scalarization overhead
1564 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1565 /// scalarized -
1566 /// i.e. either vector version isn't available, or is too expensive.
1567 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1568 bool &NeedToScalarize) const;
1569
1570 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1571 /// that of B.
1572 bool isMoreProfitable(const VectorizationFactor &A,
1573 const VectorizationFactor &B) const;
1574
1575 /// Invalidates decisions already taken by the cost model.
1576 void invalidateCostModelingDecisions() {
1577 WideningDecisions.clear();
1578 Uniforms.clear();
1579 Scalars.clear();
1580 }
1581
1582 /// Convenience function that returns the value of vscale_range iff
1583 /// vscale_range.min == vscale_range.max or otherwise returns the value
1584 /// returned by the corresponding TLI method.
1585 Optional<unsigned> getVScaleForTuning() const;
1586
1587private:
1588 unsigned NumPredStores = 0;
1589
1590 /// \return An upper bound for the vectorization factors for both
1591 /// fixed and scalable vectorization, where the minimum-known number of
1592 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1593 /// disabled or unsupported, then the scalable part will be equal to
1594 /// ElementCount::getScalable(0).
1595 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1596 ElementCount UserVF,
1597 bool FoldTailByMasking);
1598
1599 /// \return the maximized element count based on the targets vector
1600 /// registers and the loop trip-count, but limited to a maximum safe VF.
1601 /// This is a helper function of computeFeasibleMaxVF.
1602 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1603 unsigned SmallestType,
1604 unsigned WidestType,
1605 ElementCount MaxSafeVF,
1606 bool FoldTailByMasking);
1607
1608 /// \return the maximum legal scalable VF, based on the safe max number
1609 /// of elements.
1610 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1611
1612 /// The vectorization cost is a combination of the cost itself and a boolean
1613 /// indicating whether any of the contributing operations will actually
1614 /// operate on vector values after type legalization in the backend. If this
1615 /// latter value is false, then all operations will be scalarized (i.e. no
1616 /// vectorization has actually taken place).
1617 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1618
1619 /// Returns the expected execution cost. The unit of the cost does
1620 /// not matter because we use the 'cost' units to compare different
1621 /// vector widths. The cost that is returned is *not* normalized by
1622 /// the factor width. If \p Invalid is not nullptr, this function
1623 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1624 /// each instruction that has an Invalid cost for the given VF.
1625 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1626 VectorizationCostTy
1627 expectedCost(ElementCount VF,
1628 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1629
1630 /// Returns the execution time cost of an instruction for a given vector
1631 /// width. Vector width of one means scalar.
1632 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1633
1634 /// The cost-computation logic from getInstructionCost which provides
1635 /// the vector type as an output parameter.
1636 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1637 Type *&VectorTy);
1638
1639 /// Return the cost of instructions in an inloop reduction pattern, if I is
1640 /// part of that pattern.
1641 Optional<InstructionCost>
1642 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1643 TTI::TargetCostKind CostKind);
1644
1645 /// Calculate vectorization cost of memory instruction \p I.
1646 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1647
1648 /// The cost computation for scalarized memory instruction.
1649 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost computation for interleaving group of memory instructions.
1652 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1653
1654 /// The cost computation for Gather/Scatter instruction.
1655 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1656
1657 /// The cost computation for widening instruction \p I with consecutive
1658 /// memory access.
1659 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1660
1661 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1662 /// Load: scalar load + broadcast.
1663 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1664 /// element)
1665 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1666
1667 /// Estimate the overhead of scalarizing an instruction. This is a
1668 /// convenience wrapper for the type-based getScalarizationOverhead API.
1669 InstructionCost getScalarizationOverhead(Instruction *I,
1670 ElementCount VF) const;
1671
1672 /// Returns true if an artificially high cost for emulated masked memrefs
1673 /// should be used.
1674 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1675
1676 /// Map of scalar integer values to the smallest bitwidth they can be legally
1677 /// represented as. The vector equivalents of these values should be truncated
1678 /// to this type.
1679 MapVector<Instruction *, uint64_t> MinBWs;
1680
1681 /// A type representing the costs for instructions if they were to be
1682 /// scalarized rather than vectorized. The entries are Instruction-Cost
1683 /// pairs.
1684 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1685
1686 /// A set containing all BasicBlocks that are known to be present after
1687 /// vectorization as a predicated block.
1688 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1689 PredicatedBBsAfterVectorization;
1690
1691 /// Records whether it is allowed to have the original scalar loop execute at
1692 /// least once. This may be needed as a fallback loop in case runtime
1693 /// aliasing/dependence checks fail, or to handle the tail/remainder
1694 /// iterations when the trip count is unknown or doesn't divide by the VF,
1695 /// or as a peel-loop to handle gaps in interleave-groups.
1696 /// Under optsize and when the trip count is very small we don't allow any
1697 /// iterations to execute in the scalar loop.
1698 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1699
1700 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1701 bool FoldTailByMasking = false;
1702
1703 /// A map holding scalar costs for different vectorization factors. The
1704 /// presence of a cost for an instruction in the mapping indicates that the
1705 /// instruction will be scalarized when vectorizing with the associated
1706 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1707 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1708
1709 /// Holds the instructions known to be uniform after vectorization.
1710 /// The data is collected per VF.
1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1712
1713 /// Holds the instructions known to be scalar after vectorization.
1714 /// The data is collected per VF.
1715 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1716
1717 /// Holds the instructions (address computations) that are forced to be
1718 /// scalarized.
1719 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1720
1721 /// PHINodes of the reductions that should be expanded in-loop along with
1722 /// their associated chains of reduction operations, in program order from top
1723 /// (PHI) to bottom
1724 ReductionChainMap InLoopReductionChains;
1725
1726 /// A Map of inloop reduction operations and their immediate chain operand.
1727 /// FIXME: This can be removed once reductions can be costed correctly in
1728 /// vplan. This was added to allow quick lookup to the inloop operations,
1729 /// without having to loop through InLoopReductionChains.
1730 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1731
1732 /// Returns the expected difference in cost from scalarizing the expression
1733 /// feeding a predicated instruction \p PredInst. The instructions to
1734 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1735 /// non-negative return value implies the expression will be scalarized.
1736 /// Currently, only single-use chains are considered for scalarization.
1737 InstructionCost computePredInstDiscount(Instruction *PredInst,
1738 ScalarCostsTy &ScalarCosts,
1739 ElementCount VF);
1740
1741 /// Collect the instructions that are uniform after vectorization. An
1742 /// instruction is uniform if we represent it with a single scalar value in
1743 /// the vectorized loop corresponding to each vector iteration. Examples of
1744 /// uniform instructions include pointer operands of consecutive or
1745 /// interleaved memory accesses. Note that although uniformity implies an
1746 /// instruction will be scalar, the reverse is not true. In general, a
1747 /// scalarized instruction will be represented by VF scalar values in the
1748 /// vectorized loop, each corresponding to an iteration of the original
1749 /// scalar loop.
1750 void collectLoopUniforms(ElementCount VF);
1751
1752 /// Collect the instructions that are scalar after vectorization. An
1753 /// instruction is scalar if it is known to be uniform or will be scalarized
1754 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1755 /// to the list if they are used by a load/store instruction that is marked as
1756 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1757 /// VF values in the vectorized loop, each corresponding to an iteration of
1758 /// the original scalar loop.
1759 void collectLoopScalars(ElementCount VF);
1760
1761 /// Keeps cost model vectorization decision and cost for instructions.
1762 /// Right now it is used for memory instructions only.
1763 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1764 std::pair<InstWidening, InstructionCost>>;
1765
1766 DecisionList WideningDecisions;
1767
1768 /// Returns true if \p V is expected to be vectorized and it needs to be
1769 /// extracted.
1770 bool needsExtract(Value *V, ElementCount VF) const {
1771 Instruction *I = dyn_cast<Instruction>(V);
1772 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1773 TheLoop->isLoopInvariant(I))
1774 return false;
1775
1776 // Assume we can vectorize V (and hence we need extraction) if the
1777 // scalars are not computed yet. This can happen, because it is called
1778 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1779 // the scalars are collected. That should be a safe assumption in most
1780 // cases, because we check if the operands have vectorizable types
1781 // beforehand in LoopVectorizationLegality.
1782 return Scalars.find(VF) == Scalars.end() ||
1783 !isScalarAfterVectorization(I, VF);
1784 };
1785
1786 /// Returns a range containing only operands needing to be extracted.
1787 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1788 ElementCount VF) const {
1789 return SmallVector<Value *, 4>(make_filter_range(
1790 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1791 }
1792
1793 /// Determines if we have the infrastructure to vectorize loop \p L and its
1794 /// epilogue, assuming the main loop is vectorized by \p VF.
1795 bool isCandidateForEpilogueVectorization(const Loop &L,
1796 const ElementCount VF) const;
1797
1798 /// Returns true if epilogue vectorization is considered profitable, and
1799 /// false otherwise.
1800 /// \p VF is the vectorization factor chosen for the original loop.
1801 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1802
1803public:
1804 /// The loop that we evaluate.
1805 Loop *TheLoop;
1806
1807 /// Predicated scalar evolution analysis.
1808 PredicatedScalarEvolution &PSE;
1809
1810 /// Loop Info analysis.
1811 LoopInfo *LI;
1812
1813 /// Vectorization legality.
1814 LoopVectorizationLegality *Legal;
1815
1816 /// Vector target information.
1817 const TargetTransformInfo &TTI;
1818
1819 /// Target Library Info.
1820 const TargetLibraryInfo *TLI;
1821
1822 /// Demanded bits analysis.
1823 DemandedBits *DB;
1824
1825 /// Assumption cache.
1826 AssumptionCache *AC;
1827
1828 /// Interface to emit optimization remarks.
1829 OptimizationRemarkEmitter *ORE;
1830
1831 const Function *TheFunction;
1832
1833 /// Loop Vectorize Hint.
1834 const LoopVectorizeHints *Hints;
1835
1836 /// The interleave access information contains groups of interleaved accesses
1837 /// with the same stride and close to each other.
1838 InterleavedAccessInfo &InterleaveInfo;
1839
1840 /// Values to ignore in the cost model.
1841 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1842
1843 /// Values to ignore in the cost model when VF > 1.
1844 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1845
1846 /// All element types found in the loop.
1847 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1848
1849 /// Profitable vector factors.
1850 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1851};
1852} // end namespace llvm
1853
1854/// Helper struct to manage generating runtime checks for vectorization.
1855///
1856/// The runtime checks are created up-front in temporary blocks to allow better
1857/// estimating the cost and un-linked from the existing IR. After deciding to
1858/// vectorize, the checks are moved back. If deciding not to vectorize, the
1859/// temporary blocks are completely removed.
1860class GeneratedRTChecks {
1861 /// Basic block which contains the generated SCEV checks, if any.
1862 BasicBlock *SCEVCheckBlock = nullptr;
1863
1864 /// The value representing the result of the generated SCEV checks. If it is
1865 /// nullptr, either no SCEV checks have been generated or they have been used.
1866 Value *SCEVCheckCond = nullptr;
1867
1868 /// Basic block which contains the generated memory runtime checks, if any.
1869 BasicBlock *MemCheckBlock = nullptr;
1870
1871 /// The value representing the result of the generated memory runtime checks.
1872 /// If it is nullptr, either no memory runtime checks have been generated or
1873 /// they have been used.
1874 Value *MemRuntimeCheckCond = nullptr;
1875
1876 DominatorTree *DT;
1877 LoopInfo *LI;
1878 TargetTransformInfo *TTI;
1879
1880 SCEVExpander SCEVExp;
1881 SCEVExpander MemCheckExp;
1882
1883 bool CostTooHigh = false;
1884
1885public:
1886 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1887 TargetTransformInfo *TTI, const DataLayout &DL)
1888 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1889 MemCheckExp(SE, DL, "scev.check") {}
1890
1891 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1892 /// accurately estimate the cost of the runtime checks. The blocks are
1893 /// un-linked from the IR and are added back during vector code generation. If
1894 /// there is no vector code generation, the check blocks are removed
1895 /// completely.
1896 void Create(Loop *L, const LoopAccessInfo &LAI,
1897 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1898
1899 // Hard cutoff to limit compile-time increase in case a very large number of
1900 // runtime checks needs to be generated.
1901 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1902 // profile info.
1903 CostTooHigh =
1904 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1905 if (CostTooHigh)
1906 return;
1907
1908 BasicBlock *LoopHeader = L->getHeader();
1909 BasicBlock *Preheader = L->getLoopPreheader();
1910
1911 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1912 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1913 // may be used by SCEVExpander. The blocks will be un-linked from their
1914 // predecessors and removed from LI & DT at the end of the function.
1915 if (!UnionPred.isAlwaysTrue()) {
1916 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1917 nullptr, "vector.scevcheck");
1918
1919 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1920 &UnionPred, SCEVCheckBlock->getTerminator());
1921 }
1922
1923 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1924 if (RtPtrChecking.Need) {
1925 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1926 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1927 "vector.memcheck");
1928
1929 auto DiffChecks = RtPtrChecking.getDiffChecks();
1930 if (DiffChecks) {
1931 Value *RuntimeVF = nullptr;
1932 MemRuntimeCheckCond = addDiffRuntimeChecks(
1933 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1934 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1935 if (!RuntimeVF)
1936 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1937 return RuntimeVF;
1938 },
1939 IC);
1940 } else {
1941 MemRuntimeCheckCond =
1942 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1943 RtPtrChecking.getChecks(), MemCheckExp);
1944 }
1945 assert(MemRuntimeCheckCond &&
1946 "no RT checks generated although RtPtrChecking "
1947 "claimed checks are required");
1948 }
1949
1950 if (!MemCheckBlock && !SCEVCheckBlock)
1951 return;
1952
1953 // Unhook the temporary block with the checks, update various places
1954 // accordingly.
1955 if (SCEVCheckBlock)
1956 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1957 if (MemCheckBlock)
1958 MemCheckBlock->replaceAllUsesWith(Preheader);
1959
1960 if (SCEVCheckBlock) {
1961 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1962 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1963 Preheader->getTerminator()->eraseFromParent();
1964 }
1965 if (MemCheckBlock) {
1966 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1967 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1968 Preheader->getTerminator()->eraseFromParent();
1969 }
1970
1971 DT->changeImmediateDominator(LoopHeader, Preheader);
1972 if (MemCheckBlock) {
1973 DT->eraseNode(MemCheckBlock);
1974 LI->removeBlock(MemCheckBlock);
1975 }
1976 if (SCEVCheckBlock) {
1977 DT->eraseNode(SCEVCheckBlock);
1978 LI->removeBlock(SCEVCheckBlock);
1979 }
1980 }
1981
1982 InstructionCost getCost() {
1983 if (SCEVCheckBlock || MemCheckBlock)
1984 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Calculating cost of runtime checks:\n"
; } } while (false)
;
1985
1986 if (CostTooHigh) {
1987 InstructionCost Cost;
1988 Cost.setInvalid();
1989 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " number of checks exceeded threshold\n"
; } } while (false)
;
1990 return Cost;
1991 }
1992
1993 InstructionCost RTCheckCost = 0;
1994 if (SCEVCheckBlock)
1995 for (Instruction &I : *SCEVCheckBlock) {
1996 if (SCEVCheckBlock->getTerminator() == &I)
1997 continue;
1998 InstructionCost C =
1999 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2000 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " " << C <<
" for " << I << "\n"; } } while (false)
;
2001 RTCheckCost += C;
2002 }
2003 if (MemCheckBlock)
2004 for (Instruction &I : *MemCheckBlock) {
2005 if (MemCheckBlock->getTerminator() == &I)
2006 continue;
2007 InstructionCost C =
2008 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2009 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " " << C <<
" for " << I << "\n"; } } while (false)
;
2010 RTCheckCost += C;
2011 }
2012
2013 if (SCEVCheckBlock || MemCheckBlock)
2014 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCostdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Total cost of runtime checks: "
<< RTCheckCost << "\n"; } } while (false)
2015 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Total cost of runtime checks: "
<< RTCheckCost << "\n"; } } while (false)
;
2016
2017 return RTCheckCost;
2018 }
2019
2020 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2021 /// unused.
2022 ~GeneratedRTChecks() {
2023 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2024 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2025 if (!SCEVCheckCond)
2026 SCEVCleaner.markResultUsed();
2027
2028 if (!MemRuntimeCheckCond)
2029 MemCheckCleaner.markResultUsed();
2030
2031 if (MemRuntimeCheckCond) {
2032 auto &SE = *MemCheckExp.getSE();
2033 // Memory runtime check generation creates compares that use expanded
2034 // values. Remove them before running the SCEVExpanderCleaners.
2035 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2036 if (MemCheckExp.isInsertedInstruction(&I))
2037 continue;
2038 SE.forgetValue(&I);
2039 I.eraseFromParent();
2040 }
2041 }
2042 MemCheckCleaner.cleanup();
2043 SCEVCleaner.cleanup();
2044
2045 if (SCEVCheckCond)
2046 SCEVCheckBlock->eraseFromParent();
2047 if (MemRuntimeCheckCond)
2048 MemCheckBlock->eraseFromParent();
2049 }
2050
2051 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2052 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2053 /// depending on the generated condition.
2054 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2055 BasicBlock *LoopVectorPreHeader,
2056 BasicBlock *LoopExitBlock) {
2057 if (!SCEVCheckCond)
2058 return nullptr;
2059
2060 Value *Cond = SCEVCheckCond;
2061 // Mark the check as used, to prevent it from being removed during cleanup.
2062 SCEVCheckCond = nullptr;
2063 if (auto *C = dyn_cast<ConstantInt>(Cond))
2064 if (C->isZero())
2065 return nullptr;
2066
2067 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2068
2069 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2070 // Create new preheader for vector loop.
2071 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2072 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2073
2074 SCEVCheckBlock->getTerminator()->eraseFromParent();
2075 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2076 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2077 SCEVCheckBlock);
2078
2079 DT->addNewBlock(SCEVCheckBlock, Pred);
2080 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2081
2082 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2083 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2084 return SCEVCheckBlock;
2085 }
2086
2087 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2088 /// the branches to branch to the vector preheader or \p Bypass, depending on
2089 /// the generated condition.
2090 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2091 BasicBlock *LoopVectorPreHeader) {
2092 // Check if we generated code that checks in runtime if arrays overlap.
2093 if (!MemRuntimeCheckCond)
2094 return nullptr;
2095
2096 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2097 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2098 MemCheckBlock);
2099
2100 DT->addNewBlock(MemCheckBlock, Pred);
2101 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2102 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2103
2104 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2105 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2106
2107 ReplaceInstWithInst(
2108 MemCheckBlock->getTerminator(),
2109 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2110 MemCheckBlock->getTerminator()->setDebugLoc(
2111 Pred->getTerminator()->getDebugLoc());
2112
2113 // Mark the check as used, to prevent it from being removed during cleanup.
2114 MemRuntimeCheckCond = nullptr;
2115 return MemCheckBlock;
2116 }
2117};
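A hedged sketch of the intended lifecycle of this helper (editor's illustration; names such as L, LAI, UnionPred, Bypass, VecPreheader and ExitBB are placeholders, not taken from the listing):

  GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
  Checks.Create(L, LAI, UnionPred, VF, IC);    // build checks up-front, unlinked from the IR
  InstructionCost RTCost = Checks.getCost();   // fold the check cost into the VF decision
  if (DecidedToVectorize) {
    Checks.emitSCEVChecks(Bypass, VecPreheader, ExitBB);
    Checks.emitMemRuntimeChecks(Bypass, VecPreheader);
  }
  // Otherwise the destructor erases the temporary check blocks.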
2118
2119// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2120// vectorization. The loop needs to be annotated with #pragma omp simd
2121// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2122// vector length information is not provided, vectorization is not considered
2123// explicit. Interleave hints are not allowed either. These limitations will be
2124// relaxed in the future.
2125// Please, note that we are currently forced to abuse the pragma 'clang
2126// vectorize' semantics. This pragma provides *auto-vectorization hints*
2127// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2128// provides *explicit vectorization hints* (LV can bypass legal checks and
2129// assume that vectorization is legal). However, both hints are implemented
2130// using the same metadata (llvm.loop.vectorize, processed by
2131// LoopVectorizeHints). This will be fixed in the future when the native IR
2132// representation for pragma 'omp simd' is introduced.
2133static bool isExplicitVecOuterLoop(Loop *OuterLp,
2134 OptimizationRemarkEmitter *ORE) {
2135 assert(!OuterLp->isInnermost() && "This is not an outer loop")(static_cast <bool> (!OuterLp->isInnermost() &&
"This is not an outer loop") ? void (0) : __assert_fail ("!OuterLp->isInnermost() && \"This is not an outer loop\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2135, __extension__
__PRETTY_FUNCTION__))
;
2136 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2137
2138 // Only outer loops with an explicit vectorization hint are supported.
2139 // Unannotated outer loops are ignored.
2140 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2141 return false;
2142
2143 Function *Fn = OuterLp->getHeader()->getParent();
2144 if (!Hints.allowVectorization(Fn, OuterLp,
2145 true /*VectorizeOnlyWhenForced*/)) {
2146 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false)
;
2147 return false;
2148 }
2149
2150 if (Hints.getInterleave() > 1) {
2151 // TODO: Interleave support is future work.
2152 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
2153 "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
;
2154 Hints.emitRemarkWithHints();
2155 return false;
2156 }
2157
2158 return true;
2159}
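A hedged illustration of the kind of annotation the comment above requires on an outer loop (editor's example; outer-loop vectorization additionally needs the VPlan-native path to be enabled):

  // Explicit vectorization hint with a vector length on the outer loop:
  #pragma clang loop vectorize(enable) vectorize_width(4)
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < M; ++j)
      A[i][j] += B[i][j];
  // Equivalent OpenMP form: #pragma omp simd simdlen(4)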
2160
2161static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2162 OptimizationRemarkEmitter *ORE,
2163 SmallVectorImpl<Loop *> &V) {
2164 // Collect inner loops and outer loops without irreducible control flow. For
2165 // now, only collect outer loops that have explicit vectorization hints. If we
2166 // are stress testing the VPlan H-CFG construction, we collect the outermost
2167 // loop of every loop nest.
2168 if (L.isInnermost() || VPlanBuildStressTest ||
2169 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2170 LoopBlocksRPO RPOT(&L);
2171 RPOT.perform(LI);
2172 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2173 V.push_back(&L);
2174 // TODO: Collect inner loops inside marked outer loops in case
2175 // vectorization fails for the outer loop. Do not invoke
2176 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2177 // already known to be reducible. We can use an inherited attribute for
2178 // that.
2179 return;
2180 }
2181 }
2182 for (Loop *InnerL : L)
2183 collectSupportedLoops(*InnerL, LI, ORE, V);
2184}
2185
2186namespace {
2187
2188/// The LoopVectorize Pass.
2189struct LoopVectorize : public FunctionPass {
2190 /// Pass identification, replacement for typeid
2191 static char ID;
2192
2193 LoopVectorizePass Impl;
2194
2195 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2196 bool VectorizeOnlyWhenForced = false)
2197 : FunctionPass(ID),
2198 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2199 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2200 }
2201
2202 bool runOnFunction(Function &F) override {
2203 if (skipFunction(F))
2204 return false;
2205
2206 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2207 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2208 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2209 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2210 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2211 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2212 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2213 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2214 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2215 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2216 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2217 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2218 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2219
2220 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2221 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2222
2223 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2224 GetLAA, *ORE, PSI).MadeAnyChange;
2225 }
2226
2227 void getAnalysisUsage(AnalysisUsage &AU) const override {
2228 AU.addRequired<AssumptionCacheTracker>();
2229 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2230 AU.addRequired<DominatorTreeWrapperPass>();
2231 AU.addRequired<LoopInfoWrapperPass>();
2232 AU.addRequired<ScalarEvolutionWrapperPass>();
2233 AU.addRequired<TargetTransformInfoWrapperPass>();
2234 AU.addRequired<AAResultsWrapperPass>();
2235 AU.addRequired<LoopAccessLegacyAnalysis>();
2236 AU.addRequired<DemandedBitsWrapperPass>();
2237 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2238 AU.addRequired<InjectTLIMappingsLegacy>();
2239
2240 // We currently do not preserve loopinfo/dominator analyses with outer loop
2241 // vectorization. Until this is addressed, mark these analyses as preserved
2242 // only for non-VPlan-native path.
2243 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2244 if (!EnableVPlanNativePath) {
2245 AU.addPreserved<LoopInfoWrapperPass>();
2246 AU.addPreserved<DominatorTreeWrapperPass>();
2247 }
2248
2249 AU.addPreserved<BasicAAWrapperPass>();
2250 AU.addPreserved<GlobalsAAWrapperPass>();
2251 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2252 }
2253};
2254
2255} // end anonymous namespace
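For orientation (a hedged usage note, not part of the listing): the struct above is the legacy pass-manager wrapper; the same transformation is exposed to the new pass manager as LoopVectorizePass and can be exercised standalone with, for example, opt -S -passes=loop-vectorize input.ll.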
2256
2257//===----------------------------------------------------------------------===//
2258// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2259// LoopVectorizationCostModel and LoopVectorizationPlanner.
2260//===----------------------------------------------------------------------===//
2261
2262Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2263 // We need to place the broadcast of invariant variables outside the loop,
2264 // but only if it's proven safe to do so. Else, broadcast will be inside
2265 // vector loop body.
2266 Instruction *Instr = dyn_cast<Instruction>(V);
2267 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2268 (!Instr ||
2269 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2270 // Place the code for broadcasting invariant variables in the new preheader.
2271 IRBuilder<>::InsertPointGuard Guard(Builder);
2272 if (SafeToHoist)
2273 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2274
2275 // Broadcast the scalar into all locations in the vector.
2276 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2277
2278 return Shuf;
2279}
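// Illustrative sketch (assuming VF = 4 and an invariant i32 value %x; value
// names are illustrative): CreateVectorSplat above emits IR equivalent to
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer
// and, when SafeToHoist holds, places it in the vector preheader rather than
// in the vector loop body.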
2280
2281/// This function adds
2282/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2283/// to each vector element of Val. The sequence starts at StartIdx.
2284/// \p BinOp is only relevant for FP induction variables.
2285static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2286 Instruction::BinaryOps BinOp, ElementCount VF,
2287 IRBuilderBase &Builder) {
2288 assert(VF.isVector() && "only vector VFs are supported");
2289
2290 // Create and check the types.
2291 auto *ValVTy = cast<VectorType>(Val->getType());
2292 ElementCount VLen = ValVTy->getElementCount();
2293
2294 Type *STy = Val->getType()->getScalarType();
2295 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2296        "Induction Step must be an integer or FP");
2297 assert(Step->getType() == STy && "Step has wrong type");
2298
2299 SmallVector<Constant *, 8> Indices;
2300
2301 // Create a vector of consecutive numbers from zero to VF.
2302 VectorType *InitVecValVTy = ValVTy;
2303 if (STy->isFloatingPointTy()) {
2304 Type *InitVecValSTy =
2305 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2306 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2307 }
2308 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2309
2310 // Splat the StartIdx
2311 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2312
2313 if (STy->isIntegerTy()) {
2314 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2315 Step = Builder.CreateVectorSplat(VLen, Step);
2316 assert(Step->getType() == Val->getType() && "Invalid step vec");
2317 // FIXME: The newly created binary instructions should contain nsw/nuw
2318 // flags, which can be found from the original scalar operations.
2319 Step = Builder.CreateMul(InitVec, Step);
2320 return Builder.CreateAdd(Val, Step, "induction");
2321 }
2322
2323 // Floating point induction.
2324 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2325        "Binary Opcode should be specified for FP induction");
2326 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2327 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2328
2329 Step = Builder.CreateVectorSplat(VLen, Step);
2330 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2331 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2332}
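// Worked example (illustrative, assuming an integer induction with VF = 4):
// with Val = <%iv, %iv, %iv, %iv>, StartIdx = 0 and scalar step %s, the code
// above builds InitVec = <0, 1, 2, 3> and a splatted step <%s, %s, %s, %s>,
// and returns
//   <%iv + 0*%s, %iv + 1*%s, %iv + 2*%s, %iv + 3*%s>
// For an FP induction the same shape is produced via uitofp/fmul and the
// recorded FAdd/FSub opcode.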
2333
2334/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2335/// variable on which to base the steps; \p Step is the size of the step.
2336static void buildScalarSteps(Value *ScalarIV, Value *Step,
2337 const InductionDescriptor &ID, VPValue *Def,
2338 VPTransformState &State) {
2339 IRBuilderBase &Builder = State.Builder;
2340 // We shouldn't have to build scalar steps if we aren't vectorizing.
2341 assert(State.VF.isVector() && "VF should be greater than one");
2342 // Get the value type and ensure it and the step have the same integer type.
2343 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2344 assert(ScalarIVTy == Step->getType() &&
2345        "Val and Step should have the same type");
2346
2347 // We build scalar steps for both integer and floating-point induction
2348 // variables. Here, we determine the kind of arithmetic we will perform.
2349 Instruction::BinaryOps AddOp;
2350 Instruction::BinaryOps MulOp;
2351 if (ScalarIVTy->isIntegerTy()) {
2352 AddOp = Instruction::Add;
2353 MulOp = Instruction::Mul;
2354 } else {
2355 AddOp = ID.getInductionOpcode();
2356 MulOp = Instruction::FMul;
2357 }
2358
2359 // Determine the number of scalars we need to generate for each unroll
2360 // iteration.
2361 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2362 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2363 // Compute the scalar steps and save the results in State.
2364 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2365 ScalarIVTy->getScalarSizeInBits());
2366 Type *VecIVTy = nullptr;
2367 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2368 if (!FirstLaneOnly && State.VF.isScalable()) {
2369 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2370 UnitStepVec =
2371 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2372 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2373 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2374 }
2375
2376 for (unsigned Part = 0; Part < State.UF; ++Part) {
2377 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2378
2379 if (!FirstLaneOnly && State.VF.isScalable()) {
2380 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2381 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2382 if (ScalarIVTy->isFloatingPointTy())
2383 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2384 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2385 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2386 State.set(Def, Add, Part);
2387 // It's useful to record the lane values too for the known minimum number
2388 // of elements so we do those below. This improves the code quality when
2389 // trying to extract the first element, for example.
2390 }
2391
2392 if (ScalarIVTy->isFloatingPointTy())
2393 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2394
2395 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2396 Value *StartIdx = Builder.CreateBinOp(
2397 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2398 // The step returned by `createStepForVF` is a runtime-evaluated value
2399 // when VF is scalable. Otherwise, it should be folded into a Constant.
2400 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2401        "Expected StartIdx to be folded to a constant when VF is not "
2402        "scalable");
2403 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2404 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2405 State.set(Def, Add, VPIteration(Part, Lane));
2406 }
2407 }
2408}
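// Worked example (illustrative, assuming VF = 4, UF = 2, an integer IV %iv,
// step %s, and all lanes of Def used): Part 0 uses StartIdx0 = 0 and Part 1
// uses StartIdx0 = 4, so the per-lane values recorded in State are
//   Part 0: %iv + 0*%s, %iv + 1*%s, %iv + 2*%s, %iv + 3*%s
//   Part 1: %iv + 4*%s, %iv + 5*%s, %iv + 6*%s, %iv + 7*%s
// When only the first lane of Def is used, a single value per part is
// generated.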
2409
2410// Generate code for the induction step. Note that induction steps are
2411// required to be loop-invariant.
2412static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2413 Instruction *InsertBefore,
2414 Loop *OrigLoop = nullptr) {
2415 const DataLayout &DL = SE.getDataLayout();
2416 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2417        "Induction step should be loop invariant");
2418 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2419 return E->getValue();
2420
2421 SCEVExpander Exp(SE, DL, "induction");
2422 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2423}
2424
2425/// Compute the transformed value of Index at offset StartValue using step
2426/// StepValue.
2427/// For integer induction, returns StartValue + Index * StepValue.
2428/// For pointer induction, returns StartValue[Index * StepValue].
2429/// FIXME: The newly created binary instructions should contain nsw/nuw
2430/// flags, which can be found from the original scalar operations.
2431static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2432 Value *StartValue, Value *Step,
2433 const InductionDescriptor &ID) {
2434 assert(Index->getType()->getScalarType() == Step->getType() &&
2435        "Index scalar type does not match StepValue type");
2436
2437 // Note: the IR at this point is broken. We cannot use SE to create any new
2438 // SCEV and then expand it, hoping that SCEV's simplification will give us
2439 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2440 // lead to various SCEV crashes. So all we can do is to use builder and rely
2441 // on InstCombine for future simplifications. Here we handle some trivial
2442 // cases only.
2443 auto CreateAdd = [&B](Value *X, Value *Y) {
2444 assert(X->getType() == Y->getType() && "Types don't match!");
2445 if (auto *CX = dyn_cast<ConstantInt>(X))
2446 if (CX->isZero())
2447 return Y;
2448 if (auto *CY = dyn_cast<ConstantInt>(Y))
2449 if (CY->isZero())
2450 return X;
2451 return B.CreateAdd(X, Y);
2452 };
2453
2454 // We allow X to be a vector type, in which case Y will potentially be
2455 // splatted into a vector with the same element count.
2456 auto CreateMul = [&B](Value *X, Value *Y) {
2457 assert(X->getType()->getScalarType() == Y->getType() &&
2458        "Types don't match!");
2459 if (auto *CX = dyn_cast<ConstantInt>(X))
2460 if (CX->isOne())
2461 return Y;
2462 if (auto *CY = dyn_cast<ConstantInt>(Y))
2463 if (CY->isOne())
2464 return X;
2465 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2466 if (XVTy && !isa<VectorType>(Y->getType()))
2467 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2468 return B.CreateMul(X, Y);
2469 };
2470
2471 switch (ID.getKind()) {
2472 case InductionDescriptor::IK_IntInduction: {
2473 assert(!isa<VectorType>(Index->getType()) &&
2474        "Vector indices not supported for integer inductions yet");
2475 assert(Index->getType() == StartValue->getType() &&
2476        "Index type does not match StartValue type");
2477 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2478 return B.CreateSub(StartValue, Index);
2479 auto *Offset = CreateMul(Index, Step);
2480 return CreateAdd(StartValue, Offset);
2481 }
2482 case InductionDescriptor::IK_PtrInduction: {
2483 assert(isa<Constant>(Step) &&
2484        "Expected constant step for pointer induction");
2485 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2486 }
2487 case InductionDescriptor::IK_FpInduction: {
2488 assert(!isa<VectorType>(Index->getType()) &&
2489        "Vector indices not supported for FP inductions yet");
2490 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2491 auto InductionBinOp = ID.getInductionBinOp();
2492 assert(InductionBinOp &&
2493        (InductionBinOp->getOpcode() == Instruction::FAdd ||
2494         InductionBinOp->getOpcode() == Instruction::FSub) &&
2495        "Original bin op should be defined for FP induction");
2496
2497 Value *MulExp = B.CreateFMul(Step, Index);
2498 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2499 "induction");
2500 }
2501 case InductionDescriptor::IK_NoInduction:
2502 return nullptr;
2503 }
2504 llvm_unreachable("invalid enum");
2505}
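// Worked example (illustrative): for an integer induction with start value
// %start, step 3 and index %i, the helper above returns %start + %i * 3 (the
// CreateAdd/CreateMul lambdas fold away zero indices and unit steps). For a
// pointer induction it instead emits a GEP of the induction's element type:
//   getelementptr <elem ty>, ptr %start, i64 (%i * Step)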
2506
2507void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2508 const VPIteration &Instance,
2509 VPTransformState &State) {
2510 Value *ScalarInst = State.get(Def, Instance);
2511 Value *VectorValue = State.get(Def, Instance.Part);
2512 VectorValue = Builder.CreateInsertElement(
2513 VectorValue, ScalarInst,
2514 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2515 State.set(Def, VectorValue, Instance.Part);
2516}
2517
2518// Return whether we allow using masked interleave-groups (for dealing with
2519// strided loads/stores that reside in predicated blocks, or for dealing
2520// with gaps).
2521static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2522 // If an override option has been passed in for interleaved accesses, use it.
2523 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2524 return EnableMaskedInterleavedMemAccesses;
2525
2526 return TTI.enableMaskedInterleavedAccessVectorization();
2527}
2528
2529// Try to vectorize the interleave group that \p Instr belongs to.
2530//
2531// E.g. Translate following interleaved load group (factor = 3):
2532// for (i = 0; i < N; i+=3) {
2533// R = Pic[i]; // Member of index 0
2534// G = Pic[i+1]; // Member of index 1
2535// B = Pic[i+2]; // Member of index 2
2536// ... // do something to R, G, B
2537// }
2538// To:
2539// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2540// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2541// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2542// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2543//
2544// Or translate following interleaved store group (factor = 3):
2545// for (i = 0; i < N; i+=3) {
2546// ... do something to R, G, B
2547// Pic[i] = R; // Member of index 0
2548// Pic[i+1] = G; // Member of index 1
2549// Pic[i+2] = B; // Member of index 2
2550// }
2551// To:
2552// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2553// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2554// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2555// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2556// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2557void InnerLoopVectorizer::vectorizeInterleaveGroup(
2558 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2559 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2560 VPValue *BlockInMask) {
2561 Instruction *Instr = Group->getInsertPos();
2562 const DataLayout &DL = Instr->getModule()->getDataLayout();
2563
2564 // Prepare for the vector type of the interleaved load/store.
2565 Type *ScalarTy = getLoadStoreType(Instr);
2566 unsigned InterleaveFactor = Group->getFactor();
2567 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2568 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2569
2570 // Prepare for the new pointers.
2571 SmallVector<Value *, 2> AddrParts;
2572 unsigned Index = Group->getIndex(Instr);
2573
2574 // TODO: extend the masked interleaved-group support to reversed access.
2575 assert((!BlockInMask || !Group->isReverse()) &&
2576        "Reversed masked interleave-group not supported.");
2577
2578 // If the group is reverse, adjust the index to refer to the last vector lane
2579 // instead of the first. We adjust the index from the first vector lane,
2580 // rather than directly getting the pointer for lane VF - 1, because the
2581 // pointer operand of the interleaved access is supposed to be uniform. For
2582 // uniform instructions, we're only required to generate a value for the
2583 // first vector lane in each unroll iteration.
2584 if (Group->isReverse())
2585 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2586
2587 for (unsigned Part = 0; Part < UF; Part++) {
2588 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2589 State.setDebugLocFromInst(AddrPart);
2590
2591 // Note that the current instruction may be at any member index, so the
2592 // address must be adjusted to the member of index 0.
2593 //
2594 // E.g. a = A[i+1]; // Member of index 1 (current instruction)
2595 // b = A[i]; // Member of index 0
2596 // The current pointer points to A[i+1]; adjust it to A[i].
2597 //
2598 // E.g. A[i+1] = a; // Member of index 1
2599 // A[i] = b; // Member of index 0
2600 // A[i+2] = c; // Member of index 2 (current instruction)
2601 // The current pointer points to A[i+2]; adjust it to A[i].
2602
2603 bool InBounds = false;
2604 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2605 InBounds = gep->isInBounds();
2606 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2607 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2608
2609 // Cast to the vector pointer type.
2610 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2611 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2612 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2613 }
2614
2615 State.setDebugLocFromInst(Instr);
2616 Value *PoisonVec = PoisonValue::get(VecTy);
2617
2618 Value *MaskForGaps = nullptr;
2619 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2620 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2621 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2622 }
2623
2624 // Vectorize the interleaved load group.
2625 if (isa<LoadInst>(Instr)) {
2626 // For each unroll part, create a wide load for the group.
2627 SmallVector<Value *, 2> NewLoads;
2628 for (unsigned Part = 0; Part < UF; Part++) {
2629 Instruction *NewLoad;
2630 if (BlockInMask || MaskForGaps) {
2631 assert(useMaskedInterleavedAccesses(*TTI) &&
2632        "masked interleaved groups are not allowed.");
2633 Value *GroupMask = MaskForGaps;
2634 if (BlockInMask) {
2635 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2636 Value *ShuffledMask = Builder.CreateShuffleVector(
2637 BlockInMaskPart,
2638 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2639 "interleaved.mask");
2640 GroupMask = MaskForGaps
2641 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2642 MaskForGaps)
2643 : ShuffledMask;
2644 }
2645 NewLoad =
2646 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2647 GroupMask, PoisonVec, "wide.masked.vec");
2648 }
2649 else
2650 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2651 Group->getAlign(), "wide.vec");
2652 Group->addMetadata(NewLoad);
2653 NewLoads.push_back(NewLoad);
2654 }
2655
2656 // For each member in the group, shuffle out the appropriate data from the
2657 // wide loads.
2658 unsigned J = 0;
2659 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2660 Instruction *Member = Group->getMember(I);
2661
2662 // Skip the gaps in the group.
2663 if (!Member)
2664 continue;
2665
2666 auto StrideMask =
2667 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2668 for (unsigned Part = 0; Part < UF; Part++) {
2669 Value *StridedVec = Builder.CreateShuffleVector(
2670 NewLoads[Part], StrideMask, "strided.vec");
2671
2672 // If this member has a different type, cast the result to that type.
2673 if (Member->getType() != ScalarTy) {
2674 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2675 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2676 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2677 }
2678
2679 if (Group->isReverse())
2680 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2681
2682 State.set(VPDefs[J], StridedVec, Part);
2683 }
2684 ++J;
2685 }
2686 return;
2687 }
2688
2689 // The sub vector type for current instruction.
2690 auto *SubVT = VectorType::get(ScalarTy, VF);
2691
2692 // Vectorize the interleaved store group.
2693 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2694 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2695        "masked interleaved groups are not allowed.");
2696 assert((!MaskForGaps || !VF.isScalable()) &&
2697        "masking gaps for scalable vectors is not yet supported.");
2698 for (unsigned Part = 0; Part < UF; Part++) {
2699 // Collect the stored vector from each member.
2700 SmallVector<Value *, 4> StoredVecs;
2701 for (unsigned i = 0; i < InterleaveFactor; i++) {
2702 assert((Group->getMember(i) || MaskForGaps) &&
2703        "Fail to get a member from an interleaved store group");
2704 Instruction *Member = Group->getMember(i);
2705
2706 // Skip the gaps in the group.
2707 if (!Member) {
2708 Value *Undef = PoisonValue::get(SubVT);
2709 StoredVecs.push_back(Undef);
2710 continue;
2711 }
2712
2713 Value *StoredVec = State.get(StoredValues[i], Part);
2714
2715 if (Group->isReverse())
2716 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2717
2718 // If this member has a different type, cast it to a unified type.
2719
2720 if (StoredVec->getType() != SubVT)
2721 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2722
2723 StoredVecs.push_back(StoredVec);
2724 }
2725
2726 // Concatenate all vectors into a wide vector.
2727 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2728
2729 // Interleave the elements in the wide vector.
2730 Value *IVec = Builder.CreateShuffleVector(
2731 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2732 "interleaved.vec");
2733
2734 Instruction *NewStoreInstr;
2735 if (BlockInMask || MaskForGaps) {
2736 Value *GroupMask = MaskForGaps;
2737 if (BlockInMask) {
2738 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2739 Value *ShuffledMask = Builder.CreateShuffleVector(
2740 BlockInMaskPart,
2741 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2742 "interleaved.mask");
2743 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2744 ShuffledMask, MaskForGaps)
2745 : ShuffledMask;
2746 }
2747 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2748 Group->getAlign(), GroupMask);
2749 } else
2750 NewStoreInstr =
2751 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2752
2753 Group->addMetadata(NewStoreInstr);
2754 }
2755}
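// Masked case (illustrative, assuming factor = 3 and VF = 4): a per-lane
// block mask <m0, m1, m2, m3> is widened with createReplicatedMask into
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so that all members of a
// tuple share their lane's predicate; when the group also has gaps, this mask
// is AND'ed with the gap mask before the masked load/store is emitted.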
2756
2757void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2758 VPReplicateRecipe *RepRecipe,
2759 const VPIteration &Instance,
2760 bool IfPredicateInstr,
2761 VPTransformState &State) {
2762 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2763
2764 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2765 // the first lane and part.
2766 if (isa<NoAliasScopeDeclInst>(Instr))
2767 if (!Instance.isFirstIteration())
2768 return;
2769
2770 // Does this instruction return a value?
2771 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2772
2773 Instruction *Cloned = Instr->clone();
2774 if (!IsVoidRetTy)
2775 Cloned->setName(Instr->getName() + ".cloned");
2776
2777 // If the scalarized instruction contributes to the address computation of a
2778 // widened masked load/store which was in a basic block that needed
2779 // predication and is not predicated after vectorization, we can't propagate
2780 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2781 // instruction could feed a poison value to the base address of the widened
2782 // load/store.
2783 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2784 Cloned->dropPoisonGeneratingFlags();
2785
2786 if (Instr->getDebugLoc())
2787 State.setDebugLocFromInst(Instr);
2788
2789 // Replace the operands of the cloned instructions with their scalar
2790 // equivalents in the new loop.
2791 for (const auto &I : enumerate(RepRecipe->operands())) {
2792 auto InputInstance = Instance;
2793 VPValue *Operand = I.value();
2794 if (vputils::isUniformAfterVectorization(Operand))
2795 InputInstance.Lane = VPLane::getFirstLane();
2796 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2797 }
2798 State.addNewMetadata(Cloned, Instr);
2799
2800 // Place the cloned scalar in the new loop.
2801 State.Builder.Insert(Cloned);
2802
2803 State.set(RepRecipe, Cloned, Instance);
2804
2805 // If we just cloned a new assumption, add it to the assumption cache.
2806 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2807 AC->registerAssumption(II);
2808
2809 // End if-block.
2810 if (IfPredicateInstr)
2811 PredicatedInstructions.push_back(Cloned);
2812}
2813
2814Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2815 if (TripCount)
2816 return TripCount;
2817
2818 assert(InsertBlock);
2819 IRBuilder<> Builder(InsertBlock->getTerminator());
2820 // Find the loop boundaries.
2821 ScalarEvolution *SE = PSE.getSE();
2822 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2823 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2824        "Invalid loop count");
2825
2826 Type *IdxTy = Legal->getWidestInductionType();
2827 assert(IdxTy && "No type for induction");
2828
2829 // The exit count might have the type of i64 while the phi is i32. This can
2830 // happen if we have an induction variable that is sign extended before the
2831 // compare. The only way we can get a backedge-taken count in that case is if
2832 // the induction variable was signed and therefore will not overflow, so the
2833 // truncation is legal.
2834 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2835 IdxTy->getPrimitiveSizeInBits())
2836 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2837 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2838
2839 // Get the total trip count from the count by adding 1.
2840 const SCEV *ExitCount = SE->getAddExpr(
2841 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2842
2843 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2844
2845 // Expand the trip count and place the new instructions in the preheader.
2846 // Notice that the pre-header does not change, only the loop body.
2847 SCEVExpander Exp(*SE, DL, "induction");
2848
2849 // Count holds the overall loop count (N).
2850 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2851 InsertBlock->getTerminator());
2852
2853 if (TripCount->getType()->isPointerTy())
2854 TripCount =
2855 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2856 InsertBlock->getTerminator());
2857
2858 return TripCount;
2859}
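// Illustrative example: if SCEV reports a backedge-taken count of 9, the
// count is truncated or zero-extended to the widest induction type and the
// expanded trip count is 9 + 1 = 10, materialized in the preheader. The
// wrap-to-zero case (backedge-taken count == UINT_MAX) is handled later by
// the minimum-iteration check.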
2860
2861Value *
2862InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2863 if (VectorTripCount)
2864 return VectorTripCount;
2865
2866 Value *TC = getOrCreateTripCount(InsertBlock);
2867 IRBuilder<> Builder(InsertBlock->getTerminator());
2868
2869 Type *Ty = TC->getType();
2870 // This is where we can make the step a runtime constant.
2871 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2872
2873 // If the tail is to be folded by masking, round the number of iterations N
2874 // up to a multiple of Step instead of rounding down. This is done by first
2875 // adding Step-1 and then rounding down. Note that it's ok if this addition
2876 // overflows: the vector induction variable will eventually wrap to zero given
2877 // that it starts at zero and its Step is a power of two; the loop will then
2878 // exit, with the last early-exit vector comparison also producing all-true.
2879 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2880 // is accounted for in emitIterationCountCheck that adds an overflow check.
2881 if (Cost->foldTailByMasking()) {
2882 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2883        "VF*UF must be a power of 2 when folding tail by masking");
2884 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2885 TC = Builder.CreateAdd(
2886 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2887 }
2888
2889 // Now we need to generate the expression for the part of the loop that the
2890 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2891 // iterations are not required for correctness, or N - Step, otherwise. Step
2892 // is equal to the vectorization factor (number of SIMD elements) times the
2893 // unroll factor (number of SIMD instructions).
2894 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2895
2896 // There are cases where we *must* run at least one iteration in the remainder
2897 // loop. See the cost model for when this can happen. If the step evenly
2898 // divides the trip count, we set the remainder to be equal to the step. If
2899 // the step does not evenly divide the trip count, no adjustment is necessary
2900 // since there will already be scalar iterations. Note that the minimum
2901 // iterations check ensures that N >= Step.
2902 if (Cost->requiresScalarEpilogue(VF)) {
2903 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2904 R = Builder.CreateSelect(IsZero, Step, R);
2905 }
2906
2907 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2908
2909 return VectorTripCount;
2910}
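// Worked example (illustrative, assuming Step = VF * UF = 4):
//   - trip count 10, no tail folding, no required epilogue:
//       n.mod.vf = 10 % 4 = 2, n.vec = 8 (two scalar iterations remain);
//   - trip count 10 with tail folding:
//       rounded up to 10 + 3 = 13, n.mod.vf = 1, n.vec = 12 (the last vector
//       iteration runs partially masked);
//   - trip count 12 with a required scalar epilogue:
//       n.mod.vf would be 0, so it is bumped to Step and n.vec = 8.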
2911
2912Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2913 const DataLayout &DL) {
2914 // Verify that V is a vector type with same number of elements as DstVTy.
2915 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2916 unsigned VF = DstFVTy->getNumElements();
2917 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2918 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2919 Type *SrcElemTy = SrcVecTy->getElementType();
2920 Type *DstElemTy = DstFVTy->getElementType();
2921 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2922        "Vector elements must have same size");
2923
2924 // Do a direct cast if element types are castable.
2925 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2926 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2927 }
2928 // V cannot be directly casted to desired vector type.
2929 // May happen when V is a floating point vector but DstVTy is a vector of
2930 // pointers or vice-versa. Handle this using a two-step bitcast using an
2931 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2932 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2933        "Only one type should be a pointer type");
2934 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2935        "Only one type should be a floating point type");
2936 Type *IntTy =
2937 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2938 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2939 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2940 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2941}
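// Illustrative example (assuming 64-bit pointers): a <2 x double> value
// cannot be cast to <2 x ptr> with a single bit-or-pointer cast, so the code
// above routes it through the matching integer vector type:
//   <2 x double>  --bitcast-->  <2 x i64>  --inttoptr-->  <2 x ptr>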
2942
2943void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2944 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2945 // Reuse existing vector loop preheader for TC checks.
2946 // Note that new preheader block is generated for vector loop.
2947 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2948 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2949
2950 // Generate code to check if the loop's trip count is less than VF * UF, or
2951 // equal to it in case a scalar epilogue is required; this implies that the
2952 // vector trip count is zero. This check also covers the case where adding one
2953 // to the backedge-taken count overflowed leading to an incorrect trip count
2954 // of zero. In this case we will also jump to the scalar loop.
2955 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2956 : ICmpInst::ICMP_ULT;
2957
2958 // If tail is to be folded, vector loop takes care of all iterations.
2959 Type *CountTy = Count->getType();
2960 Value *CheckMinIters = Builder.getFalse();
2961 auto CreateStep = [&]() -> Value * {
2962 // Create step with max(MinProTripCount, UF * VF).
2963 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2964 return createStepForVF(Builder, CountTy, VF, UF);
2965
2966 Value *MinProfTC =
2967 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2968 if (!VF.isScalable())
2969 return MinProfTC;
2970 return Builder.CreateBinaryIntrinsic(
2971 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2972 };
2973
2974 if (!Cost->foldTailByMasking())
2975 CheckMinIters =
2976 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2977 else if (VF.isScalable()) {
2978 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2979 // an overflow to zero when updating induction variables and so an
2980 // additional overflow check is required before entering the vector loop.
2981
2982 // Get the maximum unsigned value for the type.
2983 Value *MaxUIntTripCount =
2984 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2985 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2986
2987 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2988 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2989 }
2990
2991 // Create new preheader for vector loop.
2992 LoopVectorPreHeader =
2993 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2994 "vector.ph");
2995
2996 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2997                              DT->getNode(Bypass)->getIDom()) &&
2998        "TC check is expected to dominate Bypass");
2999
3000 // Update dominator for Bypass & LoopExit (if needed).
3001 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3002 if (!Cost->requiresScalarEpilogue(VF))
3003 // If there is an epilogue which must run, there's no edge from the
3004 // middle block to exit blocks and thus no need to update the immediate
3005 // dominator of the exit blocks.
3006 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3007
3008 ReplaceInstWithInst(
3009 TCCheckBlock->getTerminator(),
3010 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3011 LoopBypassBlocks.push_back(TCCheckBlock);
3012}
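// Illustrative shape of the emitted guard (assuming a fixed VF * UF of 8, no
// tail folding and no required scalar epilogue; block names are illustrative):
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// With a required scalar epilogue the predicate becomes ule; for scalable VFs
// under tail folding, an (UMax - %count) < Step overflow check is emitted
// instead.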
3013
3014BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3015 BasicBlock *const SCEVCheckBlock =
3016 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3017 if (!SCEVCheckBlock)
3018 return nullptr;
3019
3020 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3021          (OptForSizeBasedOnProfile &&
3022           Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3023        "Cannot SCEV check stride or overflow when optimizing for size");
3024
3025
3026 // Update dominator only if this is first RT check.
3027 if (LoopBypassBlocks.empty()) {
3028 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3029 if (!Cost->requiresScalarEpilogue(VF))
3030 // If there is an epilogue which must run, there's no edge from the
3031 // middle block to exit blocks and thus no need to update the immediate
3032 // dominator of the exit blocks.
3033 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3034 }
3035
3036 LoopBypassBlocks.push_back(SCEVCheckBlock);
3037 AddedSafetyChecks = true;
3038 return SCEVCheckBlock;
3039}
3040
3041BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3042 // VPlan-native path does not do any analysis for runtime checks currently.
3043 if (EnableVPlanNativePath)
3044 return nullptr;
3045
3046 BasicBlock *const MemCheckBlock =
3047 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3048
3049 // Check if we generated code that checks at runtime whether arrays overlap. We put
3050 // the checks into a separate block to make the more common case of few
3051 // elements faster.
3052 if (!MemCheckBlock)
3053 return nullptr;
3054
3055 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3056 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3057 "Cannot emit memory checks when optimizing for size, unless forced "
3058 "to vectorize.");
3059 ORE->emit([&]() {
3060 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3061 OrigLoop->getStartLoc(),
3062 OrigLoop->getHeader())
3063 << "Code-size may be reduced by not forcing "
3064 "vectorization, or by source-code modifications "
3065 "eliminating the need for runtime checks "
3066 "(e.g., adding 'restrict').";
3067 });
3068 }
3069
3070 LoopBypassBlocks.push_back(MemCheckBlock);
3071
3072 AddedSafetyChecks = true;
3073
3074 return MemCheckBlock;
3075}
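// Conceptually, each pair of possibly-aliasing access groups [StartA, EndA)
// and [StartB, EndB) is guarded by a check of the form (a sketch of the
// intent, not the exact IR produced by RTChecks):
//
//   bool MayConflict = (StartA < EndB) && (StartB < EndA);
//
// The vector loop is entered only if no checked pair may conflict; otherwise
// control falls back to the scalar loop via Bypass.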
3076
3077void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3078 LoopScalarBody = OrigLoop->getHeader();
3079 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3080 assert(LoopVectorPreHeader && "Invalid loop structure");
3081 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3082 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3083 "multiple exit loop without required epilogue?");
3084
3085 LoopMiddleBlock =
3086 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3087 LI, nullptr, Twine(Prefix) + "middle.block");
3088 LoopScalarPreHeader =
3089 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3090 nullptr, Twine(Prefix) + "scalar.ph");
3091
3092 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3093
3094 // Set up the middle block terminator. Two cases:
3095 // 1) If we know that we must execute the scalar epilogue, emit an
3096 // unconditional branch.
3097 // 2) Otherwise, we must have a single unique exit block (due to how we
3098 // implement the multiple exit case). In this case, set up a conditional
3099 // branch from the middle block to the loop scalar preheader, and the
3100 // exit block. completeLoopSkeleton will update the condition to use an
3101 // iteration check, if required to decide whether to execute the remainder.
3102 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3103 BranchInst::Create(LoopScalarPreHeader) :
3104 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3105 Builder.getTrue());
3106 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3107 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3108
3109 // Update dominator for loop exit. During skeleton creation, only the vector
3110 // pre-header and the middle block are created. The vector loop is entirely
3111 // created during VPlan execution.
3112 if (!Cost->requiresScalarEpilogue(VF))
3113 // If there is an epilogue which must run, there's no edge from the
3114 // middle block to exit blocks and thus no need to update the immediate
3115 // dominator of the exit blocks.
3116 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3117}
3118
3119PHINode *InnerLoopVectorizer::createInductionResumeValue(
3120 PHINode *OrigPhi, const InductionDescriptor &II,
3121 ArrayRef<BasicBlock *> BypassBlocks,
3122 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3123 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3124 assert(VectorTripCount && "Expected valid arguments");
3125
3126 Instruction *OldInduction = Legal->getPrimaryInduction();
3127 Value *&EndValue = IVEndValues[OrigPhi];
3128 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3129 if (OrigPhi == OldInduction) {
3130 // We know what the end value is.
3131 EndValue = VectorTripCount;
3132 } else {
3133 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3134
3135 // Fast-math-flags propagate from the original induction instruction.
3136 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3137 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3138
3139 Type *StepType = II.getStep()->getType();
3140 Instruction::CastOps CastOp =
3141 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3142 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3143 Value *Step =
3144 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3145 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3146 EndValue->setName("ind.end");
3147
3148 // Compute the end value for the additional bypass (if applicable).
3149 if (AdditionalBypass.first) {
3150 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3151 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, StepType,
3152 true);
3153 Value *Step =
3154 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3155 VTC = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3156 EndValueFromAdditionalBypass =
3157 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3158 EndValueFromAdditionalBypass->setName("ind.end");
3159 }
3160 }
3161
3162 // Create phi nodes to merge from the backedge-taken check block.
3163 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3164 LoopScalarPreHeader->getTerminator());
3165 // Copy original phi DL over to the new one.
3166 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3167
3168 // The new PHI merges the original incoming value, in case of a bypass,
3169 // or the value at the end of the vectorized loop.
3170 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3171
3172 // Fix the scalar body counter (PHI node).
3173 // The old induction's phi node in the scalar body needs the truncated
3174 // value.
3175 for (BasicBlock *BB : BypassBlocks)
3176 BCResumeVal->addIncoming(II.getStartValue(), BB);
3177
3178 if (AdditionalBypass.first)
3179 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3180 EndValueFromAdditionalBypass);
3181 return BCResumeVal;
3182}
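// For a concrete (hypothetical) example: an induction with start value 0 and
// step 2, with a vector trip count of 8, resumes the scalar loop at
// 0 + 8 * 2 = 16; this is the value emitTransformedIndex materializes above as
// "ind.end", and the bc.resume.val phi selects it when control arrives from
// the middle block rather than from a bypass block.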
3183
3184void InnerLoopVectorizer::createInductionResumeValues(
3185 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3186 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3187 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3188 "Inconsistent information about additional bypass.");
3189 // We are going to resume the execution of the scalar loop.
3190 // Go over all of the induction variables that we found and fix the
3191 // PHIs that are left in the scalar version of the loop.
3192 // The starting values of PHI nodes depend on the counter of the last
3193 // iteration in the vectorized loop.
3194 // If we come from a bypass edge then we need to start from the original
3195 // start value.
3196 for (const auto &InductionEntry : Legal->getInductionVars()) {
3197 PHINode *OrigPhi = InductionEntry.first;
3198 const InductionDescriptor &II = InductionEntry.second;
3199 PHINode *BCResumeVal = createInductionResumeValue(
3200 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3201 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3202 }
3203}
3204
3205BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3206 // The trip counts should be cached by now.
3207 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3208 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3209
3210 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3211
3212 // Add a check in the middle block to see if we have completed
3213 // all of the iterations in the first vector loop. Three cases:
3214 // 1) If we require a scalar epilogue, there is no conditional branch as
3215 // we unconditionally branch to the scalar preheader. Do nothing.
3216 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3217 // Thus if tail is to be folded, we know we don't need to run the
3218 // remainder and we can use the previous value for the condition (true).
3219 // 3) Otherwise, construct a runtime check.
3220 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3221 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3222 Count, VectorTripCount, "cmp.n",
3223 LoopMiddleBlock->getTerminator());
3224
3225 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3226 // of the corresponding compare because they may have ended up with
3227 // different line numbers and we want to avoid awkward line stepping while
3228 // debugging. Eg. if the compare has got a line number inside the loop.
3229 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3230 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3231 }
3232
3233#ifdef EXPENSIVE_CHECKS
3234 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3235#endif
3236
3237 return LoopVectorPreHeader;
3238}
3239
3240std::pair<BasicBlock *, Value *>
3241InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3242 /*
3243 In this function we generate a new loop. The new loop will contain
3244 the vectorized instructions while the old loop will continue to run the
3245 scalar remainder.
3246
3247 [ ] <-- loop iteration number check.
3248 / |
3249 / v
3250 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3251 | / |
3252 | / v
3253 || [ ] <-- vector pre header.
3254 |/ |
3255 | v
3256 | [ ] \
3257 | [ ]_| <-- vector loop (created during VPlan execution).
3258 | |
3259 | v
3260 \ -[ ] <--- middle-block.
3261 \/ |
3262 /\ v
3263 | ->[ ] <--- new preheader.
3264 | |
3265 (opt) v <-- edge from middle to exit iff epilogue is not required.
3266 | [ ] \
3267 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3268 \ |
3269 \ v
3270 >[ ] <-- exit block(s).
3271 ...
3272 */
3273
3274 // Get the metadata of the original loop before it gets modified.
3275 MDNode *OrigLoopID = OrigLoop->getLoopID();
3276
3277 // Workaround! Compute the trip count of the original loop and cache it
3278 // before we start modifying the CFG. This code has a systemic problem
3279 // wherein it tries to run analysis over partially constructed IR; this is
3280 // wrong, and not simply for SCEV. The trip count of the original loop
3281 // simply happens to be prone to hitting this in practice. In theory, we
3282 // can hit the same issue for any SCEV, or ValueTracking query done during
3283 // mutation. See PR49900.
3284 getOrCreateTripCount(OrigLoop->getLoopPreheader());
3285
3286 // Create an empty vector loop, and prepare basic blocks for the runtime
3287 // checks.
3288 createVectorLoopSkeleton("");
3289
3290 // Now, compare the new count to zero. If it is zero skip the vector loop and
3291 // jump to the scalar loop. This check also covers the case where the
3292 // backedge-taken count is uint##_max: adding one to it will overflow leading
3293 // to an incorrect trip count of zero. In this (rare) case we will also jump
3294 // to the scalar loop.
3295 emitIterationCountCheck(LoopScalarPreHeader);
3296
3297 // Generate the code to check any assumptions that we've made for SCEV
3298 // expressions.
3299 emitSCEVChecks(LoopScalarPreHeader);
3300
3301 // Generate the code that checks at runtime whether arrays overlap. We put the
3302 // checks into a separate block to make the more common case of few elements
3303 // faster.
3304 emitMemRuntimeChecks(LoopScalarPreHeader);
3305
3306 // Emit phis for the new starting index of the scalar loop.
3307 createInductionResumeValues();
3308
3309 return {completeLoopSkeleton(OrigLoopID), nullptr};
3310}
3311
3312// Fix up external users of the induction variable. At this point, we are
3313// in LCSSA form, with all external PHIs that use the IV having one input value,
3314// coming from the remainder loop. We need those PHIs to also have a correct
3315// value for the IV when arriving directly from the middle block.
3316void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317 const InductionDescriptor &II,
3318 Value *VectorTripCount, Value *EndValue,
3319 BasicBlock *MiddleBlock,
3320 BasicBlock *VectorHeader, VPlan &Plan) {
3321 // There are two kinds of external IV usages - those that use the value
3322 // computed in the last iteration (the PHI) and those that use the penultimate
3323 // value (the value that feeds into the phi from the loop latch).
3324 // We allow both, but they, obviously, have different values.
3325
3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327
3328 DenseMap<Value *, Value *> MissingVals;
3329
3330 // An external user of the last iteration's value should see the value that
3331 // the remainder loop uses to initialize its own IV.
3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333 for (User *U : PostInc->users()) {
3334 Instruction *UI = cast<Instruction>(U);
3335 if (!OrigLoop->contains(UI)) {
3336 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337 MissingVals[UI] = EndValue;
3338 }
3339 }
3340
3341 // An external user of the penultimate value needs to see EndValue - Step.
3342 // The simplest way to get this is to recompute it from the constituent SCEVs,
3343 // that is Start + (Step * (CRD - 1)).
3344 for (User *U : OrigPhi->users()) {
3345 auto *UI = cast<Instruction>(U);
3346 if (!OrigLoop->contains(UI)) {
3347 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348
3349 IRBuilder<> B(MiddleBlock->getTerminator());
3350
3351 // Fast-math-flags propagate from the original induction instruction.
3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354
3355 Value *CountMinusOne = B.CreateSub(
3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357 Value *CMO =
3358 !II.getStep()->getType()->isIntegerTy()
3359 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3360 II.getStep()->getType())
3361 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3362 CMO->setName("cast.cmo");
3363
3364 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3365 VectorHeader->getTerminator());
3366 Value *Escape =
3367 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3368 Escape->setName("ind.escape");
3369 MissingVals[UI] = Escape;
3370 }
3371 }
3372
3373 for (auto &I : MissingVals) {
3374 PHINode *PHI = cast<PHINode>(I.first);
3375 // One corner case we have to handle is two IVs "chasing" each-other,
3376 // that is %IV2 = phi [...], [ %IV1, %latch ]
3377 // In this case, if IV1 has an external use, we need to avoid adding both
3378 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3379 // don't already have an incoming value for the middle block.
3380 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3381 PHI->addIncoming(I.second, MiddleBlock);
3382 Plan.removeLiveOut(PHI);
3383 }
3384 }
3385}
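// Worked example with hypothetical numbers: for an IV with start 0 and step 2
// and a vector trip count of 8, an external user of the post-increment value
// sees EndValue = 16, while an external user of the phi itself (the
// penultimate value) sees 0 + (8 - 1) * 2 = 14, which is the "ind.escape"
// value computed above.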
3386
3387namespace {
3388
3389struct CSEDenseMapInfo {
3390 static bool canHandle(const Instruction *I) {
3391 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3392 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3393 }
3394
3395 static inline Instruction *getEmptyKey() {
3396 return DenseMapInfo<Instruction *>::getEmptyKey();
3397 }
3398
3399 static inline Instruction *getTombstoneKey() {
3400 return DenseMapInfo<Instruction *>::getTombstoneKey();
3401 }
3402
3403 static unsigned getHashValue(const Instruction *I) {
3404 assert(canHandle(I) && "Unknown instruction!");
3405 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3406 I->value_op_end()));
3407 }
3408
3409 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3410 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3411 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3412 return LHS == RHS;
3413 return LHS->isIdenticalTo(RHS);
3414 }
3415};
3416
3417} // end anonymous namespace
3418
3419/// Perform CSE of induction variable instructions.
3420static void cse(BasicBlock *BB) {
3421 // Perform simple cse.
3422 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3423 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3424 if (!CSEDenseMapInfo::canHandle(&In))
3425 continue;
3426
3427 // Check if we can replace this instruction with any of the
3428 // visited instructions.
3429 if (Instruction *V = CSEMap.lookup(&In)) {
3430 In.replaceAllUsesWith(V);
3431 In.eraseFromParent();
3432 continue;
3433 }
3434
3435 CSEMap[&In] = &In;
3436 }
3437}
3438
3439InstructionCost
3440LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3441 bool &NeedToScalarize) const {
3442 Function *F = CI->getCalledFunction();
3443 Type *ScalarRetTy = CI->getType();
3444 SmallVector<Type *, 4> Tys, ScalarTys;
3445 for (auto &ArgOp : CI->args())
3446 ScalarTys.push_back(ArgOp->getType());
3447
3448 // Estimate cost of scalarized vector call. The source operands are assumed
3449 // to be vectors, so we need to extract individual elements from there,
3450 // execute VF scalar calls, and then gather the result into the vector return
3451 // value.
3452 InstructionCost ScalarCallCost =
3453 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3454 if (VF.isScalar())
3455 return ScalarCallCost;
3456
3457 // Compute corresponding vector type for return value and arguments.
3458 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3459 for (Type *ScalarTy : ScalarTys)
3460 Tys.push_back(ToVectorTy(ScalarTy, VF));
3461
3462 // Compute costs of unpacking argument values for the scalar calls and
3463 // packing the return values to a vector.
3464 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3465
3466 InstructionCost Cost =
3467 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3468
3469 // If we can't emit a vector call for this function, then the currently found
3470 // cost is the cost we need to return.
3471 NeedToScalarize = true;
3472 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3473 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3474
3475 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3476 return Cost;
3477
3478 // If the corresponding vector cost is cheaper, return its cost.
3479 InstructionCost VectorCallCost =
3480 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3481 if (VectorCallCost < Cost) {
3482 NeedToScalarize = false;
3483 Cost = VectorCallCost;
3484 }
3485 return Cost;
3486}
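// Hypothetical numeric example of the trade-off above: with VF = 4, a scalar
// call cost of 10 and a scalarization overhead of 6, the scalarized estimate
// is 4 * 10 + 6 = 46; if a vector variant exists with a call cost of 20, the
// function returns 20 and clears NeedToScalarize, otherwise it returns 46.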
3487
3488static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3489 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3490 return Elt;
3491 return VectorType::get(Elt, VF);
3492}
3493
3494InstructionCost
3495LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3496 ElementCount VF) const {
3497 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3498 assert(ID && "Expected intrinsic call!");
3499 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3500 FastMathFlags FMF;
3501 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3502 FMF = FPMO->getFastMathFlags();
3503
3504 SmallVector<const Value *> Arguments(CI->args());
3505 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3506 SmallVector<Type *> ParamTys;
3507 std::transform(FTy->param_begin(), FTy->param_end(),
3508 std::back_inserter(ParamTys),
3509 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3510
3511 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3512 dyn_cast<IntrinsicInst>(CI));
3513 return TTI.getIntrinsicInstrCost(CostAttrs,
3514 TargetTransformInfo::TCK_RecipThroughput);
3515}
3516
3517static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3518 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3519 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3520 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3521}
3522
3523static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3524 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3525 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3526 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3527}
3528
3529void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3530 // For every instruction `I` in MinBWs, truncate the operands, create a
3531 // truncated version of `I` and reextend its result. InstCombine runs
3532 // later and will remove any ext/trunc pairs.
3533 SmallPtrSet<Value *, 4> Erased;
3534 for (const auto &KV : Cost->getMinimalBitwidths()) {
3535 // If the value wasn't vectorized, we must maintain the original scalar
3536 // type. The absence of the value from State indicates that it
3537 // wasn't vectorized.
3538 // FIXME: Should not rely on getVPValue at this point.
3539 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3540 if (!State.hasAnyVectorValue(Def))
3541 continue;
3542 for (unsigned Part = 0; Part < UF; ++Part) {
3543 Value *I = State.get(Def, Part);
3544 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3545 continue;
3546 Type *OriginalTy = I->getType();
3547 Type *ScalarTruncatedTy =
3548 IntegerType::get(OriginalTy->getContext(), KV.second);
3549 auto *TruncatedTy = VectorType::get(
3550 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3551 if (TruncatedTy == OriginalTy)
3552 continue;
3553
3554 IRBuilder<> B(cast<Instruction>(I));
3555 auto ShrinkOperand = [&](Value *V) -> Value * {
3556 if (auto *ZI = dyn_cast<ZExtInst>(V))
3557 if (ZI->getSrcTy() == TruncatedTy)
3558 return ZI->getOperand(0);
3559 return B.CreateZExtOrTrunc(V, TruncatedTy);
3560 };
3561
3562 // The actual instruction modification depends on the instruction type,
3563 // unfortunately.
3564 Value *NewI = nullptr;
3565 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3566 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3567 ShrinkOperand(BO->getOperand(1)));
3568
3569 // Any wrapping introduced by shrinking this operation shouldn't be
3570 // considered undefined behavior. So, we can't unconditionally copy
3571 // arithmetic wrapping flags to NewI.
3572 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3573 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3574 NewI =
3575 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3576 ShrinkOperand(CI->getOperand(1)));
3577 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3578 NewI = B.CreateSelect(SI->getCondition(),
3579 ShrinkOperand(SI->getTrueValue()),
3580 ShrinkOperand(SI->getFalseValue()));
3581 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3582 switch (CI->getOpcode()) {
3583 default:
3584 llvm_unreachable("Unhandled cast!");
3585 case Instruction::Trunc:
3586 NewI = ShrinkOperand(CI->getOperand(0));
3587 break;
3588 case Instruction::SExt:
3589 NewI = B.CreateSExtOrTrunc(
3590 CI->getOperand(0),
3591 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3592 break;
3593 case Instruction::ZExt:
3594 NewI = B.CreateZExtOrTrunc(
3595 CI->getOperand(0),
3596 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3597 break;
3598 }
3599 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3600 auto Elements0 =
3601 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3602 auto *O0 = B.CreateZExtOrTrunc(
3603 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3604 auto Elements1 =
3605 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3606 auto *O1 = B.CreateZExtOrTrunc(
3607 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3608
3609 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3610 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3611 // Don't do anything with the operands, just extend the result.
3612 continue;
3613 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3614 auto Elements =
3615 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3616 auto *O0 = B.CreateZExtOrTrunc(
3617 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3618 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3619 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3620 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3621 auto Elements =
3622 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3623 auto *O0 = B.CreateZExtOrTrunc(
3624 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3625 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3626 } else {
3627 // If we don't know what to do, be conservative and don't do anything.
3628 continue;
3629 }
3630
3631 // Lastly, extend the result.
3632 NewI->takeName(cast<Instruction>(I));
3633 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3634 I->replaceAllUsesWith(Res);
3635 cast<Instruction>(I)->eraseFromParent();
3636 Erased.insert(I);
3637 State.reset(Def, Res, Part);
3638 }
3639 }
3640
3641 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3642 for (const auto &KV : Cost->getMinimalBitwidths()) {
3643 // If the value wasn't vectorized, we must maintain the original scalar
3644 // type. The absence of the value from State indicates that it
3645 // wasn't vectorized.
3646 // FIXME: Should not rely on getVPValue at this point.
3647 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3648 if (!State.hasAnyVectorValue(Def))
3649 continue;
3650 for (unsigned Part = 0; Part < UF; ++Part) {
3651 Value *I = State.get(Def, Part);
3652 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3653 if (Inst && Inst->use_empty()) {
3654 Value *NewI = Inst->getOperand(0);
3655 Inst->eraseFromParent();
3656 State.reset(Def, NewI, Part);
3657 }
3658 }
3659 }
3660}
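// Shorthand example of the rewrite performed above (assuming MinBWs says the
// result of %a only needs 8 bits, with VF = 4):
//
//   %a = add <4 x i32> %x, %y
//
// becomes, until InstCombine removes the redundant ext/trunc pairs:
//
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
//
// Wrap flags are intentionally not copied to the narrowed operation.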
3661
3662void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3663 VPlan &Plan) {
3664 // Insert truncates and extends for any truncated instructions as hints to
3665 // InstCombine.
3666 if (VF.isVector())
3667 truncateToMinimalBitwidths(State);
3668
3669 // Fix widened non-induction PHIs by setting up the PHI operands.
3670 if (EnableVPlanNativePath)
3671 fixNonInductionPHIs(Plan, State);
3672
3673 // At this point every instruction in the original loop is widened to a
3674 // vector form. Now we need to fix the recurrences in the loop. These PHI
3675 // nodes are currently empty because we did not want to introduce cycles.
3676 // This is the second stage of vectorizing recurrences.
3677 fixCrossIterationPHIs(State);
3678
3679 // Forget the original basic block.
3680 PSE.getSE()->forgetLoop(OrigLoop);
3681
3682 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3683 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3684 if (Cost->requiresScalarEpilogue(VF)) {
3685 // No edge from the middle block to the unique exit block has been inserted
3686 // and there is nothing to fix from vector loop; phis should have incoming
3687 // from scalar loop only.
3688 Plan.clearLiveOuts();
3689 } else {
3690 // If we inserted an edge from the middle block to the unique exit block,
3691 // update uses outside the loop (phis) to account for the newly inserted
3692 // edge.
3693
3694 // Fix-up external users of the induction variables.
3695 for (const auto &Entry : Legal->getInductionVars())
3696 fixupIVUsers(Entry.first, Entry.second,
3697 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3698 IVEndValues[Entry.first], LoopMiddleBlock,
3699 VectorLoop->getHeader(), Plan);
3700 }
3701
3702 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3703 // in the exit block, so update the builder.
3704 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3705 for (const auto &KV : Plan.getLiveOuts())
3706 KV.second->fixPhi(Plan, State);
3707
3708 for (Instruction *PI : PredicatedInstructions)
3709 sinkScalarOperands(&*PI);
3710
3711 // Remove redundant induction instructions.
3712 cse(VectorLoop->getHeader());
3713
3714 // Set/update profile weights for the vector and remainder loops as original
3715 // loop iterations are now distributed among them. Note that original loop
3716 // represented by LoopScalarBody becomes remainder loop after vectorization.
3717 //
3718 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3719 // end up getting a slightly roughened result but that should be OK since
3720 // profile is not inherently precise anyway. Note also possible bypass of
3721 // vector code caused by legality checks is ignored, assigning all the weight
3722 // to the vector loop, optimistically.
3723 //
3724 // For scalable vectorization we can't know at compile time how many iterations
3725 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3726 // vscale of '1'.
3727 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3728 LI->getLoopFor(LoopScalarBody),
3729 VF.getKnownMinValue() * UF);
3730}
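// Hypothetical example of the profile update above: with an estimated original
// trip count of 300 and VF * UF = 8, the vector loop is credited with on the
// order of 300 / 8 = 37 iterations per entry and the scalar remainder loop
// with the few iterations left over; a scalable VF is treated pessimistically
// as its known minimum value.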
3731
3732void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3733 // In order to support recurrences we need to be able to vectorize Phi nodes.
3734 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3735 // stage #2: We now need to fix the recurrences by adding incoming edges to
3736 // the currently empty PHI nodes. At this point every instruction in the
3737 // original loop is widened to a vector form so we can use them to construct
3738 // the incoming edges.
3739 VPBasicBlock *Header =
3740 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3741 for (VPRecipeBase &R : Header->phis()) {
3742 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3743 fixReduction(ReductionPhi, State);
3744 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3745 fixFixedOrderRecurrence(FOR, State);
3746 }
3747}
3748
3749void InnerLoopVectorizer::fixFixedOrderRecurrence(
3750 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3751 // This is the second phase of vectorizing first-order recurrences. An
3752 // overview of the transformation is described below. Suppose we have the
3753 // following loop.
3754 //
3755 // for (int i = 0; i < n; ++i)
3756 // b[i] = a[i] - a[i - 1];
3757 //
3758 // There is a first-order recurrence on "a". For this loop, the shorthand
3759 // scalar IR looks like:
3760 //
3761 // scalar.ph:
3762 // s_init = a[-1]
3763 // br scalar.body
3764 //
3765 // scalar.body:
3766 // i = phi [0, scalar.ph], [i+1, scalar.body]
3767 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3768 // s2 = a[i]
3769 // b[i] = s2 - s1
3770 // br cond, scalar.body, ...
3771 //
3772 // In this example, s1 is a recurrence because its value depends on the
3773 // previous iteration. In the first phase of vectorization, we created a
3774 // vector phi v1 for s1. We now complete the vectorization and produce the
3775 // shorthand vector IR shown below (for VF = 4, UF = 1).
3776 //
3777 // vector.ph:
3778 // v_init = vector(..., ..., ..., a[-1])
3779 // br vector.body
3780 //
3781 // vector.body
3782 // i = phi [0, vector.ph], [i+4, vector.body]
3783 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3784 // v2 = a[i, i+1, i+2, i+3];
3785 // v3 = vector(v1(3), v2(0, 1, 2))
3786 // b[i, i+1, i+2, i+3] = v2 - v3
3787 // br cond, vector.body, middle.block
3788 //
3789 // middle.block:
3790 // x = v2(3)
3791 // br scalar.ph
3792 //
3793 // scalar.ph:
3794 // s_init = phi [x, middle.block], [a[-1], otherwise]
3795 // br scalar.body
3796 //
3797 // After execution completes the vector loop, we extract the next value of
3798 // the recurrence (x) to use as the initial value in the scalar loop.
3799
3800 // Extract the last vector element in the middle block. This will be the
3801 // initial value for the recurrence when jumping to the scalar loop.
3802 VPValue *PreviousDef = PhiR->getBackedgeValue();
3803 Value *Incoming = State.get(PreviousDef, UF - 1);
3804 auto *ExtractForScalar = Incoming;
3805 auto *IdxTy = Builder.getInt32Ty();
3806 if (VF.isVector()) {
3807 auto *One = ConstantInt::get(IdxTy, 1);
3808 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3809 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3810 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3811 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3812 "vector.recur.extract");
3813 }
3814 // Extract the second last element in the middle block if the
3815 // Phi is used outside the loop. We need to extract the phi itself
3816 // and not the last element (the phi update in the current iteration). This
3817 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3818 // when the scalar loop is not run at all.
3819 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3820 if (VF.isVector()) {
3821 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3822 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3823 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3824 Incoming, Idx, "vector.recur.extract.for.phi");
3825 } else if (UF > 1)
3826 // When loop is unrolled without vectorizing, initialize
3827 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3828 // of `Incoming`. This is analogous to the vectorized case above: extracting
3829 // the second last element when VF > 1.
3830 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3831
3832 // Fix the initial value of the original recurrence in the scalar loop.
3833 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3834 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3835 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3836 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3837 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3838 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3839 Start->addIncoming(Incoming, BB);
3840 }
3841
3842 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3843 Phi->setName("scalar.recur");
3844
3845 // Finally, fix users of the recurrence outside the loop. The users will need
3846 // either the last value of the scalar recurrence or the last value of the
3847 // vector recurrence we extracted in the middle block. Since the loop is in
3848 // LCSSA form, we just need to find all the phi nodes for the original scalar
3849 // recurrence in the exit block, and then add an edge for the middle block.
3850 // Note that LCSSA does not imply single entry when the original scalar loop
3851 // had multiple exiting edges (as we always run the last iteration in the
3852 // scalar epilogue); in that case, there is no edge from middle to exit
3853 // and thus no phis which need to be updated.
3854 if (!Cost->requiresScalarEpilogue(VF))
3855 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3856 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3857 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3858 State.Plan->removeLiveOut(&LCSSAPhi);
3859 }
3860}
3861
3862void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3863 VPTransformState &State) {
3864 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3865 // Get its reduction variable descriptor.
3866 assert(Legal->isReductionVariable(OrigPhi) &&
3867 "Unable to find the reduction variable");
3868 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3869
3870 RecurKind RK = RdxDesc.getRecurrenceKind();
3871 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3872 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3873 State.setDebugLocFromInst(ReductionStartValue);
3874
3875 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3876 // This is the vector-clone of the value that leaves the loop.
3877 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3878
3879 // Wrap flags are in general invalid after vectorization, clear them.
3880 clearReductionWrapFlags(PhiR, State);
3881
3882 // Before each round, move the insertion point right between
3883 // the PHIs and the values we are going to write.
3884 // This allows us to write both PHINodes and the extractelement
3885 // instructions.
3886 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3887
3888 State.setDebugLocFromInst(LoopExitInst);
3889
3890 Type *PhiTy = OrigPhi->getType();
3891
3892 VPBasicBlock *LatchVPBB =
3893 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3894 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3895 // If tail is folded by masking, the vector value to leave the loop should be
3896 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3897 // instead of the former. For an inloop reduction the reduction will already
3898 // be predicated, and does not need to be handled here.
3899 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3900 for (unsigned Part = 0; Part < UF; ++Part) {
3901 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3902 SelectInst *Sel = nullptr;
3903 for (User *U : VecLoopExitInst->users()) {
3904 if (isa<SelectInst>(U)) {
3905 assert(!Sel && "Reduction exit feeding two selects");
3906 Sel = cast<SelectInst>(U);
3907 } else
3908 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3909 }
3910 assert(Sel && "Reduction exit feeds no select");
3911 State.reset(LoopExitInstDef, Sel, Part);
3912
3913 if (isa<FPMathOperator>(Sel))
3914 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3915
3916 // If the target can create a predicated operator for the reduction at no
3917 // extra cost in the loop (for example a predicated vadd), it can be
3918 // cheaper for the select to remain in the loop than be sunk out of it,
3919 // and so use the select value for the phi instead of the old
3920 // LoopExitValue.
3921 if (PreferPredicatedReductionSelect ||
3922 TTI->preferPredicatedReductionSelect(
3923 RdxDesc.getOpcode(), PhiTy,
3924 TargetTransformInfo::ReductionFlags())) {
3925 auto *VecRdxPhi =
3926 cast<PHINode>(State.get(PhiR, Part));
3927 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3928 }
3929 }
3930 }
3931
3932 // If the vector reduction can be performed in a smaller type, we truncate
3933 // then extend the loop exit value to enable InstCombine to evaluate the
3934 // entire expression in the smaller type.
3935 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3936 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3937 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3938 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3939 VectorParts RdxParts(UF);
3940 for (unsigned Part = 0; Part < UF; ++Part) {
3941 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3942 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3943 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3944 : Builder.CreateZExt(Trunc, VecTy);
3945 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3946 if (U != Trunc) {
3947 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3948 RdxParts[Part] = Extnd;
3949 }
3950 }
3951 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3952 for (unsigned Part = 0; Part < UF; ++Part) {
3953 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3954 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3955 }
3956 }
3957
3958 // Reduce all of the unrolled parts into a single vector.
3959 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3960 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3961
3962 // The middle block terminator has already been assigned a DebugLoc here (the
3963 // OrigLoop's single latch terminator). We want the whole middle block to
3964 // appear to execute on this line because: (a) it is all compiler generated,
3965 // (b) these instructions are always executed after evaluating the latch
3966 // conditional branch, and (c) other passes may add new predecessors which
3967 // terminate on this line. This is the easiest way to ensure we don't
3968 // accidentally cause an extra step back into the loop while debugging.
3969 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3970 if (PhiR->isOrdered())
3971 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3972 else {
3973 // Floating-point operations should have some FMF to enable the reduction.
3974 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3975 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3976 for (unsigned Part = 1; Part < UF; ++Part) {
3977 Value *RdxPart = State.get(LoopExitInstDef, Part);
3978 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3979 ReducedPartRdx = Builder.CreateBinOp(
3980 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3981 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3982 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3983 ReducedPartRdx, RdxPart);
3984 else
3985 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3986 }
3987 }
3988
3989 // Create the reduction after the loop. Note that inloop reductions create the
3990 // target reduction in the loop using a Reduction recipe.
3991 if (VF.isVector() && !PhiR->isInLoop()) {
3992 ReducedPartRdx =
3993 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3994 // If the reduction can be performed in a smaller type, we need to extend
3995 // the reduction to the wider type before we branch to the original loop.
3996 if (PhiTy != RdxDesc.getRecurrenceType())
3997 ReducedPartRdx = RdxDesc.isSigned()
3998 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3999 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4000 }
4001
4002 PHINode *ResumePhi =
4003 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4004
4005 // Create a phi node that merges control-flow from the backedge-taken check
4006 // block and the middle block.
4007 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4008 LoopScalarPreHeader->getTerminator());
4009
4010 // If we are fixing reductions in the epilogue loop then we should already
4011 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4012 // we carry over the incoming values correctly.
4013 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4014 if (Incoming == LoopMiddleBlock)
4015 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4016 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4017 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4018 Incoming);
4019 else
4020 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4021 }
4022
4023 // Set the resume value for this reduction
4024 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4025
4026 // If there were stores of the reduction value to a uniform memory address
4027 // inside the loop, create the final store here.
4028 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4029 StoreInst *NewSI =
4030 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4031 propagateMetadata(NewSI, SI);
4032
4033 // If the reduction value is used in other places,
4034 // then let the code below create PHI's for that.
4035 }
4036
4037 // Now, we need to fix the users of the reduction variable
4038 // inside and outside of the scalar remainder loop.
4039
4040 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4041 // in the exit blocks. See comment on analogous loop in
4042 // fixFixedOrderRecurrence for a more complete explanation of the logic.
4043 if (!Cost->requiresScalarEpilogue(VF))
4044 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4045 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4046 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4047 State.Plan->removeLiveOut(&LCSSAPhi);
4048 }
4049
4050 // Fix the scalar loop reduction variable with the incoming reduction sum
4051 // from the vector body and from the backedge value.
4052 int IncomingEdgeBlockIdx =
4053 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4054 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4055 // Pick the other block.
4056 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4057 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4058 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4059}
4060
4061void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4062 VPTransformState &State) {
4063 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4064 RecurKind RK = RdxDesc.getRecurrenceKind();
4065 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4066 return;
4067
4068 SmallVector<VPValue *, 8> Worklist;
4069 SmallPtrSet<VPValue *, 8> Visited;
4070 Worklist.push_back(PhiR);
4071 Visited.insert(PhiR);
4072
4073 while (!Worklist.empty()) {
4074 VPValue *Cur = Worklist.pop_back_val();
4075 for (unsigned Part = 0; Part < UF; ++Part) {
4076 Value *V = State.get(Cur, Part);
4077 if (!isa<OverflowingBinaryOperator>(V))
4078 break;
4079 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4080 }
4081
4082 for (VPUser *U : Cur->users()) {
4083 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4084 if (!UserRecipe)
4085 continue;
4086 for (VPValue *V : UserRecipe->definedValues())
4087 if (Visited.insert(V).second)
4088 Worklist.push_back(V);
4089 }
4090 }
4091}
4092
4093void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4094 // The basic block and loop containing the predicated instruction.
4095 auto *PredBB = PredInst->getParent();
4096 auto *VectorLoop = LI->getLoopFor(PredBB);
4097
4098 // Initialize a worklist with the operands of the predicated instruction.
4099 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4100
4101 // Holds instructions that we need to analyze again. An instruction may be
4102 // reanalyzed if we don't yet know if we can sink it or not.
4103 SmallVector<Instruction *, 8> InstsToReanalyze;
4104
4105 // Returns true if a given use occurs in the predicated block. Phi nodes use
4106 // their operands in their corresponding predecessor blocks.
4107 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4108 auto *I = cast<Instruction>(U.getUser());
4109 BasicBlock *BB = I->getParent();
4110 if (auto *Phi = dyn_cast<PHINode>(I))
4111 BB = Phi->getIncomingBlock(
4112 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4113 return BB == PredBB;
4114 };
4115
4116 // Iteratively sink the scalarized operands of the predicated instruction
4117 // into the block we created for it. When an instruction is sunk, its
4118 // operands are then added to the worklist. The algorithm ends after one pass
4119 // through the worklist doesn't sink a single instruction.
4120 bool Changed;
4121 do {
4122 // Add the instructions that need to be reanalyzed to the worklist, and
4123 // reset the changed indicator.
4124 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4125 InstsToReanalyze.clear();
4126 Changed = false;
4127
4128 while (!Worklist.empty()) {
4129 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4130
4131 // We can't sink an instruction if it is a phi node, is not in the loop,
4132 // or may have side effects.
4133 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4134 I->mayHaveSideEffects())
4135 continue;
4136
4137 // If the instruction is already in PredBB, check if we can sink its
4138 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4139 // sinking the scalar instruction I, hence it appears in PredBB; but it
4140 // may have failed to sink I's operands (recursively), which we try
4141 // (again) here.
4142 if (I->getParent() == PredBB) {
4143 Worklist.insert(I->op_begin(), I->op_end());
4144 continue;
4145 }
4146
4147 // It's legal to sink the instruction if all its uses occur in the
4148 // predicated block. Otherwise, there's nothing to do yet, and we may
4149 // need to reanalyze the instruction.
4150 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4151 InstsToReanalyze.push_back(I);
4152 continue;
4153 }
4154
4155 // Move the instruction to the beginning of the predicated block, and add
4156 // its operands to the worklist.
4157 I->moveBefore(&*PredBB->getFirstInsertionPt());
4158 Worklist.insert(I->op_begin(), I->op_end());
4159
4160 // The sinking may have enabled other instructions to be sunk, so we will
4161 // need to iterate.
4162 Changed = true;
4163 }
4164 } while (Changed);
4165}
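// Small illustration of the iteration above (hypothetical): if the predicated
// block contains a scalarized store whose address %gep still sits in the loop
// body, and all of %gep's uses are inside the predicated block, %gep is moved
// there; its operands are then re-queued, so a feeding index computation may
// be sunk on a later pass over the worklist.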
4166
4167void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4168 VPTransformState &State) {
4169 auto Iter = depth_first(
4170 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4171 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4172 for (VPRecipeBase &P : VPBB->phis()) {
4173 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4174 if (!VPPhi)
4175 continue;
4176 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4177 // Make sure the builder has a valid insert point.
4178 Builder.SetInsertPoint(NewPhi);
4179 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4180 VPValue *Inc = VPPhi->getIncomingValue(i);
4181 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4182 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4183 }
4184 }
4185 }
4186}
4187
4188bool InnerLoopVectorizer::useOrderedReductions(
4189 const RecurrenceDescriptor &RdxDesc) {
4190 return Cost->useOrderedReductions(RdxDesc);
4191}
4192
4193void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4194 // We should not collect Scalars more than once per VF. Right now, this
4195 // function is called from collectUniformsAndScalars(), which already does
4196 // this check. Collecting Scalars for VF=1 does not make any sense.
4197 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4198        "This function should not be visited twice for the same VF");
4199
4200 // This avoids any chances of creating a REPLICATE recipe during planning
4201 // since that would result in generation of scalarized code during execution,
4202 // which is not supported for scalable vectors.
4203 if (VF.isScalable()) {
4204 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4205 return;
4206 }
4207
4208 SmallSetVector<Instruction *, 8> Worklist;
4209
4210 // These sets are used to seed the analysis with pointers used by memory
4211 // accesses that will remain scalar.
4212 SmallSetVector<Instruction *, 8> ScalarPtrs;
4213 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4214 auto *Latch = TheLoop->getLoopLatch();
4215
4216 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4217 // The pointer operands of loads and stores will be scalar as long as the
4218 // memory access is not a gather or scatter operation. The value operand of a
4219 // store will remain scalar if the store is scalarized.
4220 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4221 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4222 assert(WideningDecision != CM_Unknown &&
4223        "Widening decision should be ready at this moment");
4224 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4225 if (Ptr == Store->getValueOperand())
4226 return WideningDecision == CM_Scalarize;
4227 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4228        "Ptr is neither a value or pointer operand");
4229 return WideningDecision != CM_GatherScatter;
4230 };
4231
4232 // A helper that returns true if the given value is a bitcast or
4233 // getelementptr instruction contained in the loop.
4234 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4235 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4236 isa<GetElementPtrInst>(V)) &&
4237 !TheLoop->isLoopInvariant(V);
4238 };
4239
4240 // A helper that evaluates a memory access's use of a pointer. If the use will
4241 // be a scalar use and the pointer is only used by memory accesses, we place
4242 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4243 // PossibleNonScalarPtrs.
4244 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4245 // We only care about bitcast and getelementptr instructions contained in
4246 // the loop.
4247 if (!isLoopVaryingBitCastOrGEP(Ptr))
4248 return;
4249
4250 // If the pointer has already been identified as scalar (e.g., if it was
4251 // also identified as uniform), there's nothing to do.
4252 auto *I = cast<Instruction>(Ptr);
4253 if (Worklist.count(I))
4254 return;
4255
4256 // If the use of the pointer will be a scalar use, and all users of the
4257 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4258 // place the pointer in PossibleNonScalarPtrs.
4259 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4260 return isa<LoadInst>(U) || isa<StoreInst>(U);
4261 }))
4262 ScalarPtrs.insert(I);
4263 else
4264 PossibleNonScalarPtrs.insert(I);
4265 };
4266
4267 // We seed the scalars analysis with two classes of instructions: (1)
4268 // instructions marked uniform-after-vectorization and (2) bitcast,
4269 // getelementptr and (pointer) phi instructions used by memory accesses
4270 // requiring a scalar use.
4271 //
4272 // (1) Add to the worklist all instructions that have been identified as
4273 // uniform-after-vectorization.
4274 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4275
4276 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4277 // memory accesses requiring a scalar use. The pointer operands of loads and
4278 // stores will be scalar as long as the memory access is not a gather or
4279 // scatter operation. The value operand of a store will remain scalar if the
4280 // store is scalarized.
4281 for (auto *BB : TheLoop->blocks())
4282 for (auto &I : *BB) {
4283 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4284 evaluatePtrUse(Load, Load->getPointerOperand());
4285 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4286 evaluatePtrUse(Store, Store->getPointerOperand());
4287 evaluatePtrUse(Store, Store->getValueOperand());
4288 }
4289 }
4290 for (auto *I : ScalarPtrs)
4291 if (!PossibleNonScalarPtrs.count(I)) {
4292 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4293 Worklist.insert(I);
4294 }
4295
4296 // Insert the forced scalars.
4297 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4298 // induction variable when the PHI user is scalarized.
4299 auto ForcedScalar = ForcedScalars.find(VF);
4300 if (ForcedScalar != ForcedScalars.end())
4301 for (auto *I : ForcedScalar->second) {
4302 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4303 Worklist.insert(I);
4304 }
4305
4306 // Expand the worklist by looking through any bitcasts and getelementptr
4307 // instructions we've already identified as scalar. This is similar to the
4308 // expansion step in collectLoopUniforms(); however, here we're only
4309 // expanding to include additional bitcasts and getelementptr instructions.
4310 unsigned Idx = 0;
4311 while (Idx != Worklist.size()) {
4312 Instruction *Dst = Worklist[Idx++];
4313 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4314 continue;
4315 auto *Src = cast<Instruction>(Dst->getOperand(0));
4316 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4317 auto *J = cast<Instruction>(U);
4318 return !TheLoop->contains(J) || Worklist.count(J) ||
4319 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4320 isScalarUse(J, Src));
4321 })) {
4322 Worklist.insert(Src);
4323 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4324 }
4325 }
4326
4327 // An induction variable will remain scalar if all users of the induction
4328 // variable and induction variable update remain scalar.
4329 for (const auto &Induction : Legal->getInductionVars()) {
4330 auto *Ind = Induction.first;
4331 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4332
4333 // If tail-folding is applied, the primary induction variable will be used
4334 // to feed a vector compare.
4335 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4336 continue;
4337
4338 // Returns true if \p Indvar is a pointer induction that is used directly by
4339 // load/store instruction \p I.
4340 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4341 Instruction *I) {
4342 return Induction.second.getKind() ==
4343 InductionDescriptor::IK_PtrInduction &&
4344 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4345 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4346 };
4347
4348 // Determine if all users of the induction variable are scalar after
4349 // vectorization.
4350 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4351 auto *I = cast<Instruction>(U);
4352 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4353 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4354 });
4355 if (!ScalarInd)
4356 continue;
4357
4358 // Determine if all users of the induction variable update instruction are
4359 // scalar after vectorization.
4360 auto ScalarIndUpdate =
4361 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4362 auto *I = cast<Instruction>(U);
4363 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4364 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4365 });
4366 if (!ScalarIndUpdate)
4367 continue;
4368
4369 // The induction variable and its update instruction will remain scalar.
4370 Worklist.insert(Ind);
4371 Worklist.insert(IndUpdate);
4372 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4373 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4374                   << "\n");
4375 }
4376
4377 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4378}
4379
4380bool LoopVectorizationCostModel::isScalarWithPredication(
4381 Instruction *I, ElementCount VF) const {
4382 if (!isPredicatedInst(I))
4383 return false;
4384
4385 // Do we have a non-scalar lowering for this predicated
4386 // instruction? No - it is scalar with predication.
4387 switch(I->getOpcode()) {
4388 default:
4389 return true;
4390 case Instruction::Load:
4391 case Instruction::Store: {
4392 auto *Ptr = getLoadStorePointerOperand(I);
4393 auto *Ty = getLoadStoreType(I);
4394 Type *VTy = Ty;
4395 if (VF.isVector())
4396 VTy = VectorType::get(Ty, VF);
4397 const Align Alignment = getLoadStoreAlignment(I);
4398 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4399 TTI.isLegalMaskedGather(VTy, Alignment))
4400 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4401 TTI.isLegalMaskedScatter(VTy, Alignment));
4402 }
4403 case Instruction::UDiv:
4404 case Instruction::SDiv:
4405 case Instruction::SRem:
4406 case Instruction::URem: {
4407 // We have the option to use the safe-divisor idiom to avoid predication.
4408 // The cost based decision here will always select safe-divisor for
4409 // scalable vectors as scalarization isn't legal.
4410 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4411 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4412 }
4413 }
4414}
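// For example, a predicated i32 load on a target with legal masked vector
// loads (or gathers) has a non-scalar lowering and is therefore not considered
// scalar-with-predication, whereas the same load on a target without masked
// memory operations would be scalarized and wrapped in per-lane branches.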
4415
4416bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4417 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4418 return false;
4419
4420 // Can we prove this instruction is safe to unconditionally execute?
4421 // If not, we must use some form of predication.
4422 switch(I->getOpcode()) {
4423 default:
4424 return false;
4425 case Instruction::Load:
4426 case Instruction::Store: {
4427 if (!Legal->isMaskRequired(I))
4428 return false;
4429 // When we know the load's address is loop invariant and the instruction
4430 // in the original scalar loop was unconditionally executed then we
4431 // don't need to mark it as a predicated instruction. Tail folding may
4432 // introduce additional predication, but we're guaranteed to always have
4433 // at least one active lane. We call Legal->blockNeedsPredication here
4434 // because it doesn't query tail-folding. For stores, we need to prove
4435 // not only speculation safety (which follows from the same argument as
4436 // loads), but also that the value being stored is correct. The easiest
4437 // form of the latter is to require that all values stored are the same.
4438 if (Legal->isUniformMemOp(*I) &&
4439 (isa<LoadInst>(I) ||
4440 (isa<StoreInst>(I) &&
4441 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4442 !Legal->blockNeedsPredication(I->getParent()))
4443 return false;
4444 return true;
4445 }
4446 case Instruction::UDiv:
4447 case Instruction::SDiv:
4448 case Instruction::SRem:
4449 case Instruction::URem:
4450 // TODO: We can use the loop-preheader as context point here and get
4451 // context sensitive reasoning
4452 return !isSafeToSpeculativelyExecute(I);
4453 }
4454}
4455
4456std::pair<InstructionCost, InstructionCost>
4457LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4458 ElementCount VF) const {
4459 assert(I->getOpcode() == Instruction::UDiv ||
4460        I->getOpcode() == Instruction::SDiv ||
4461        I->getOpcode() == Instruction::SRem ||
4462        I->getOpcode() == Instruction::URem);
4463 assert(!isSafeToSpeculativelyExecute(I));
4464
4465 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4466
4467 // Scalarization isn't legal for scalable vector types
4468 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4469 if (!VF.isScalable()) {
4470 // Get the scalarization cost and scale this amount by the probability of
4471 // executing the predicated block. If the instruction is not predicated,
4472 // we fall through to the next case.
4473 ScalarizationCost = 0;
4474
4475 // These instructions have a non-void type, so account for the phi nodes
4476 // that we will create. This cost is likely to be zero. The phi node
4477 // cost, if any, should be scaled by the block probability because it
4478 // models a copy at the end of each predicated block.
4479 ScalarizationCost += VF.getKnownMinValue() *
4480 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4481
4482 // The cost of the non-predicated instruction.
4483 ScalarizationCost += VF.getKnownMinValue() *
4484 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4485
4486 // The cost of insertelement and extractelement instructions needed for
4487 // scalarization.
4488 ScalarizationCost += getScalarizationOverhead(I, VF);
4489
4490 // Scale the cost by the probability of executing the predicated blocks.
4491 // This assumes the predicated block for each vector lane is equally
4492 // likely.
4493 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4494 }
4495 InstructionCost SafeDivisorCost = 0;
4496
4497 auto *VecTy = ToVectorTy(I->getType(), VF);
4498
4499 // The cost of the select guard to ensure all lanes are well defined
4500 // after we speculate above any internal control flow.
4501 SafeDivisorCost += TTI.getCmpSelInstrCost(
4502 Instruction::Select, VecTy,
4503 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4504 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4505
4506 // Certain instructions can be cheaper to vectorize if they have a constant
4507 // second vector operand. One example of this are shifts on x86.
4508 Value *Op2 = I->getOperand(1);
4509 auto Op2Info = TTI.getOperandInfo(Op2);
4510 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4511 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4512
4513 SmallVector<const Value *, 4> Operands(I->operand_values());
4514 SafeDivisorCost += TTI.getArithmeticInstrCost(
4515 I->getOpcode(), VecTy, CostKind,
4516 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4517 Op2Info, Operands, I);
4518 return {ScalarizationCost, SafeDivisorCost};
4519}
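// As a rough illustration, for a udiv with VF = 4, a per-lane divide cost of
// 20, zero phi cost, a scalarization (insert/extract) overhead of 12 and a
// predicated-block probability of 1/2 (getReciprocalPredBlockProb() == 2), the
// scalarized estimate is (4*0 + 4*20 + 12) / 2 = 46, which is then weighed
// against the cost of a select plus one vector divide for the safe-divisor
// form.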
4520
4521bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4522 Instruction *I, ElementCount VF) {
4523 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4524 assert(getWideningDecision(I, VF) == CM_Unknown &&
4525        "Decision should not be set yet.");
4526 auto *Group = getInterleavedAccessGroup(I);
4527 assert(Group && "Must have a group.");
4528
4529 // If the instruction's allocated size doesn't equal its type size, it
4530 // requires padding and will be scalarized.
4531 auto &DL = I->getModule()->getDataLayout();
4532 auto *ScalarTy = getLoadStoreType(I);
4533 if (hasIrregularType(ScalarTy, DL))
4534 return false;
4535
4536 // If the group involves a non-integral pointer, we may not be able to
4537 // losslessly cast all values to a common type.
4538 unsigned InterleaveFactor = Group->getFactor();
4539 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4540 for (unsigned i = 0; i < InterleaveFactor; i++) {
4541 Instruction *Member = Group->getMember(i);
4542 if (!Member)
4543 continue;
4544 auto *MemberTy = getLoadStoreType(Member);
4545 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4546 // Don't coerce non-integral pointers to integers or vice versa.
4547 if (MemberNI != ScalarNI) {
4548 // TODO: Consider adding special nullptr value case here
4549 return false;
4550 } else if (MemberNI && ScalarNI &&
4551 ScalarTy->getPointerAddressSpace() !=
4552 MemberTy->getPointerAddressSpace()) {
4553 return false;
4554 }
4555 }
4556
4557 // Check if masking is required.
4558 // A Group may need masking for one of two reasons: it resides in a block that
4559 // needs predication, or it was decided to use masking to deal with gaps
4560 // (either a gap at the end of a load-access that may result in a speculative
4561 // load, or any gaps in a store-access).
4562 bool PredicatedAccessRequiresMasking =
4563 blockNeedsPredicationForAnyReason(I->getParent()) &&
4564 Legal->isMaskRequired(I);
4565 bool LoadAccessWithGapsRequiresEpilogMasking =
4566 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4567 !isScalarEpilogueAllowed();
4568 bool StoreAccessWithGapsRequiresMasking =
4569 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4570 if (!PredicatedAccessRequiresMasking &&
4571 !LoadAccessWithGapsRequiresEpilogMasking &&
4572 !StoreAccessWithGapsRequiresMasking)
4573 return true;
4574
4575 // If masked interleaving is required, we expect that the user/target had
4576 // enabled it, because otherwise it either wouldn't have been created or
4577 // it should have been invalidated by the CostModel.
4578 assert(useMaskedInterleavedAccesses(TTI) &&
4579        "Masked interleave-groups for predicated accesses are not enabled.");
4580
4581 if (Group->isReverse())
4582 return false;
4583
4584 auto *Ty = getLoadStoreType(I);
4585 const Align Alignment = getLoadStoreAlignment(I);
4586 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4587 : TTI.isLegalMaskedStore(Ty, Alignment);
4588}
4589
4590bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4591 Instruction *I, ElementCount VF) {
4592 // Get and ensure we have a valid memory instruction.
4593 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4594
4595 auto *Ptr = getLoadStorePointerOperand(I);
4596 auto *ScalarTy = getLoadStoreType(I);
4597
4598 // In order to be widened, the pointer should be consecutive, first of all.
4599 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4600 return false;
4601
4602 // If the instruction is a store located in a predicated block, it will be
4603 // scalarized.
4604 if (isScalarWithPredication(I, VF))
4605 return false;
4606
4607 // If the instruction's allocated size doesn't equal its type size, it
4608 // requires padding and will be scalarized.
4609 auto &DL = I->getModule()->getDataLayout();
4610 if (hasIrregularType(ScalarTy, DL))
4611 return false;
4612
4613 return true;
4614}
4615
4616void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4617 // We should not collect Uniforms more than once per VF. Right now,
4618 // this function is called from collectUniformsAndScalars(), which
4619 // already does this check. Collecting Uniforms for VF=1 does not make any
4620 // sense.
4621
4622 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4623        "This function should not be visited twice for the same VF");
4624
4625 // Initialize the entry for this VF so we won't analyze it again; even if
4626 // no uniform value is found, Uniforms.count(VF) will return 1.
4627 Uniforms[VF].clear();
4628
4629 // We now know that the loop is vectorizable!
4630 // Collect instructions inside the loop that will remain uniform after
4631 // vectorization.
4632
4633 // Global values, params and instructions outside of current loop are out of
4634 // scope.
4635 auto isOutOfScope = [&](Value *V) -> bool {
4636 Instruction *I = dyn_cast<Instruction>(V);
4637 return (!I || !TheLoop->contains(I));
4638 };
4639
4640 // Worklist containing uniform instructions demanding lane 0.
4641 SetVector<Instruction *> Worklist;
4642 BasicBlock *Latch = TheLoop->getLoopLatch();
4643
4644 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4645 // that are scalar with predication must not be considered uniform after
4646 // vectorization, because that would create an erroneous replicating region
4647 // where only a single instance out of VF should be formed.
4648 // TODO: optimize such seldom cases if found important, see PR40816.
4649 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4650 if (isOutOfScope(I)) {
4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4652                   << *I << "\n");
4653 return;
4654 }
4655 if (isScalarWithPredication(I, VF)) {
4656 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4657                   << *I << "\n");
4658 return;
4659 }
4660 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4661 Worklist.insert(I);
4662 };
4663
4664 // Start with the conditional branch. If the branch condition is an
4665 // instruction contained in the loop that is only used by the branch, it is
4666 // uniform.
4667 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4668 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4669 addToWorklistIfAllowed(Cmp);
4670
4671 // Return true if all lanes perform the same memory operation, and we can
4672 // thus choose to execute only one.
4673 auto isUniformMemOpUse = [&](Instruction *I) {
4674 if (!Legal->isUniformMemOp(*I))
4675 return false;
4676 if (isa<LoadInst>(I))
4677 // Loading the same address always produces the same result - at least
4678 // assuming aliasing and ordering which have already been checked.
4679 return true;
4680 // Storing the same value on every iteration.
4681 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4682 };
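// E.g. a load from a loop-invariant address, or a store of a loop-invariant
// value to a loop-invariant address, performs the same operation in every
// lane, so executing a single scalar copy per iteration is sufficient.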
4683
4684 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4685 InstWidening WideningDecision = getWideningDecision(I, VF);
4686 assert(WideningDecision != CM_Unknown &&
4687        "Widening decision should be ready at this moment");
4688
4689 if (isUniformMemOpUse(I))
4690 return true;
4691
4692 return (WideningDecision == CM_Widen ||
4693 WideningDecision == CM_Widen_Reverse ||
4694 WideningDecision == CM_Interleave);
4695 };
4696
4697
4698 // Returns true if Ptr is the pointer operand of a memory access instruction
4699 // I, and I is known to not require scalarization.
4700 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4701 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4702 };
4703
4704 // Holds a list of values which are known to have at least one uniform use.
4705 // Note that there may be other uses which aren't uniform. A "uniform use"
4706 // here is something which only demands lane 0 of the unrolled iterations;
4707 // it does not imply that all lanes produce the same value (e.g. this is not
4708 // the usual meaning of uniform)
4709 SetVector<Value *> HasUniformUse;
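// For example, the pointer operand of a widened consecutive load only needs
// lane 0 of the GEP that computes it, even though other users of that GEP may
// still demand every lane.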
4710
4711 // Scan the loop for instructions which are either a) known to have only
4712 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4713 for (auto *BB : TheLoop->blocks())
4714 for (auto &I : *BB) {
4715 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4716 switch (II->getIntrinsicID()) {
4717 case Intrinsic::sideeffect:
4718 case Intrinsic::experimental_noalias_scope_decl:
4719 case Intrinsic::assume:
4720 case Intrinsic::lifetime_start:
4721 case Intrinsic::lifetime_end:
4722 if (TheLoop->hasLoopInvariantOperands(&I))
4723 addToWorklistIfAllowed(&I);
4724 break;
4725 default:
4726 break;
4727 }
4728 }
4729
4730 // ExtractValue instructions must be uniform, because the operands are
4731 // known to be loop-invariant.
4732 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4733 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4734        "Expected aggregate value to be loop invariant");
4735 addToWorklistIfAllowed(EVI);
4736 continue;
4737 }
4738
4739 // If there's no pointer operand, there's nothing to do.
4740 auto *Ptr = getLoadStorePointerOperand(&I);
4741 if (!Ptr)
4742 continue;
4743
4744 if (isUniformMemOpUse(&I))
4745 addToWorklistIfAllowed(&I);
4746
4747 if (isUniformDecision(&I, VF)) {
4748 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4749 HasUniformUse.insert(Ptr);
4750 }
4751 }
4752
4753 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4754 // demanding) users. Since loops are assumed to be in LCSSA form, this
4755 // disallows uses outside the loop as well.
4756 for (auto *V : HasUniformUse) {
4757 if (isOutOfScope(V))
4758 continue;
4759 auto *I = cast<Instruction>(V);
4760 auto UsersAreMemAccesses =
4761 llvm::all_of(I->users(), [&](User *U) -> bool {
4762 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4763 });
4764 if (UsersAreMemAccesses)
4765 addToWorklistIfAllowed(I);
4766 }
4767
4768 // Expand Worklist in topological order: whenever a new instruction
4769 // is added, its users should already be inside Worklist. This ensures
4770 // that a uniform instruction will only be used by uniform instructions.
4771 unsigned idx = 0;
4772 while (idx != Worklist.size()) {
4773 Instruction *I = Worklist[idx++];
4774
4775 for (auto *OV : I->operand_values()) {
4776 // isOutOfScope operands cannot be uniform instructions.
4777 if (isOutOfScope(OV))
4778 continue;
4779 // Fixed-order recurrence phis should typically be considered
4780 // non-uniform.
4781 auto *OP = dyn_cast<PHINode>(OV);
4782 if (OP && Legal->isFixedOrderRecurrence(OP))
4783 continue;
4784 // If all the users of the operand are uniform, then add the
4785 // operand into the uniform worklist.
4786 auto *OI = cast<Instruction>(OV);
4787 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4788 auto *J = cast<Instruction>(U);
4789 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4790 }))
4791 addToWorklistIfAllowed(OI);
4792 }
4793 }
4794
4795 // For an instruction to be added into Worklist above, all its users inside
4796 // the loop should also be in Worklist. However, this condition cannot be
4797 // true for phi nodes that form a cyclic dependence. We must process phi
4798 // nodes separately. An induction variable will remain uniform if all users
4799 // of the induction variable and induction variable update remain uniform.
4800 // The code below handles both pointer and non-pointer induction variables.
4801 for (const auto &Induction : Legal->getInductionVars()) {
4802 auto *Ind = Induction.first;
4803 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4804
4805 // Determine if all users of the induction variable are uniform after
4806 // vectorization.
4807 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4808 auto *I = cast<Instruction>(U);
4809 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4810 isVectorizedMemAccessUse(I, Ind);
4811 });
4812 if (!UniformInd)
4813 continue;
4814
4815 // Determine if all users of the induction variable update instruction are
4816 // uniform after vectorization.
4817 auto UniformIndUpdate =
4818 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4819 auto *I = cast<Instruction>(U);
4820 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4821 isVectorizedMemAccessUse(I, IndUpdate);
4822 });
4823 if (!UniformIndUpdate)
4824 continue;
4825
4826 // The induction variable and its update instruction will remain uniform.
4827 addToWorklistIfAllowed(Ind);
4828 addToWorklistIfAllowed(IndUpdate);
4829 }
4830
4831 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4832}
4833
4834bool LoopVectorizationCostModel::runtimeChecksRequired() {
4835 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4836
4837 if (Legal->getRuntimePointerChecking()->Need) {
4838 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4839 "runtime pointer checks needed. Enable vectorization of this "
4840 "loop with '#pragma clang loop vectorize(enable)' when "
4841 "compiling with -Os/-Oz",
4842 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4843 return true;
4844 }
4845
4846 if (!PSE.getPredicate().isAlwaysTrue()) {
4847 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4848 "runtime SCEV checks needed. Enable vectorization of this "
4849 "loop with '#pragma clang loop vectorize(enable)' when "
4850 "compiling with -Os/-Oz",
4851 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4852 return true;
4853 }
4854
4855 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4856 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4857 reportVectorizationFailure("Runtime stride check for small trip count",
4858 "runtime stride == 1 checks needed. Enable vectorization of "
4859 "this loop without such check by compiling with -Os/-Oz",
4860 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4861 return true;
4862 }
4863
4864 return false;
4865}
4866
4867ElementCount
4868LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4869 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4870 return ElementCount::getScalable(0);
4871
4872 if (Hints->isScalableVectorizationDisabled()) {
4873 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4874 "ScalableVectorizationDisabled", ORE, TheLoop);
4875 return ElementCount::getScalable(0);
4876 }
4877
4878 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4879
4880 auto MaxScalableVF = ElementCount::getScalable(
4881 std::numeric_limits<ElementCount::ScalarTy>::max());
4882
4883 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4884 // FIXME: While for scalable vectors this is currently sufficient, this should
4885 // be replaced by a more detailed mechanism that filters out specific VFs,
4886 // instead of invalidating vectorization for a whole set of VFs based on the
4887 // MaxVF.
4888
4889 // Disable scalable vectorization if the loop contains unsupported reductions.
4890 if (!canVectorizeReductions(MaxScalableVF)) {
4891 reportVectorizationInfo(
4892 "Scalable vectorization not supported for the reduction "
4893 "operations found in this loop.",
4894 "ScalableVFUnfeasible", ORE, TheLoop);
4895 return ElementCount::getScalable(0);
4896 }
4897
4898 // Disable scalable vectorization if the loop contains any instructions
4899 // with element types not supported for scalable vectors.
4900 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4901 return !Ty->isVoidTy() &&
4902 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4903 })) {
4904 reportVectorizationInfo("Scalable vectorization is not supported "
4905 "for all element types found in this loop.",
4906 "ScalableVFUnfeasible", ORE, TheLoop);
4907 return ElementCount::getScalable(0);
4908 }
4909
4910 if (Legal->isSafeForAnyVectorWidth())
4911 return MaxScalableVF;
4912
4913 // Limit MaxScalableVF by the maximum safe dependence distance.
4914 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4915 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4916 MaxVScale =
4917 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4918 MaxScalableVF = ElementCount::getScalable(
4919 MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
4920 if (!MaxScalableVF)
4921 reportVectorizationInfo(
4922 "Max legal vector width too small, scalable vectorization "
4923 "unfeasible.",
4924 "ScalableVFUnfeasible", ORE, TheLoop);
4925
4926 return MaxScalableVF;
4927}
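// For instance, with MaxSafeElements = 32 and a target whose maximum vscale is
// 16, the clamped result is vscale x (32 / 16) = vscale x 2; if the division
// yields 0, scalable vectorization is reported as unfeasible above.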
4928
4929FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4930 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4931 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4932 unsigned SmallestType, WidestType;
4933 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4934
4935 // Get the maximum safe dependence distance in bits computed by LAA.
4936 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4937 // the memory access that is most restrictive (involved in the smallest
4938 // dependence distance).
4939 unsigned MaxSafeElements =
4940 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
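// E.g. a maximum safe dependence width of 512 bits with WidestType = 32 bits
// gives MaxSafeElements = PowerOf2Floor(512 / 32) = 16.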
4941
4942 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4943 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4944
4945 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4946                   << ".\n");
4947 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4948                   << ".\n");
4949
4950 // First analyze the UserVF, fall back if the UserVF should be ignored.
4951 if (UserVF) {
4952 auto MaxSafeUserVF =
4953 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4954
4955 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4956 // If `VF=vscale x N` is safe, then so is `VF=N`
4957 if (UserVF.isScalable())
4958 return FixedScalableVFPair(
4959 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4960 else
4961 return UserVF;
4962 }
4963
4964 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4965
4966 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4967 // is better to ignore the hint and let the compiler choose a suitable VF.
4968 if (!UserVF.isScalable()) {
4969 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4970                   << " is unsafe, clamping to max safe VF="
4971                   << MaxSafeFixedVF << ".\n");
4972 ORE->emit([&]() {
4973 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4974 TheLoop->getStartLoc(),
4975 TheLoop->getHeader())
4976 << "User-specified vectorization factor "
4977 << ore::NV("UserVectorizationFactor", UserVF)
4978 << " is unsafe, clamping to maximum safe vectorization factor "
4979 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4980 });
4981 return MaxSafeFixedVF;
4982 }
4983
4984 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4985 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4986                   << " is ignored because scalable vectors are not "
4987                      "available.\n");
4988 ORE->emit([&]() {
4989 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4990 TheLoop->getStartLoc(),
4991 TheLoop->getHeader())
4992 << "User-specified vectorization factor "
4993 << ore::NV("UserVectorizationFactor", UserVF)
4994 << " is ignored because the target does not support scalable "
4995 "vectors. The compiler will pick a more suitable value.";
4996 });
4997 } else {
4998 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4999                   << " is unsafe. Ignoring scalable UserVF.\n");
5000 ORE->emit([&]() {
5001 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5002 TheLoop->getStartLoc(),
5003 TheLoop->getHeader())
5004 << "User-specified vectorization factor "
5005 << ore::NV("UserVectorizationFactor", UserVF)
5006 << " is unsafe. Ignoring the hint to let the compiler pick a "
5007 "more suitable value.";
5008 });
5009 }
5010 }
5011
5012 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5013                   << " / " << WidestType << " bits.\n");
5014
5015 FixedScalableVFPair Result(ElementCount::getFixed(1),
5016 ElementCount::getScalable(0));
5017 if (auto MaxVF =
5018 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5019 MaxSafeFixedVF, FoldTailByMasking))
5020 Result.FixedVF = MaxVF;
5021
5022 if (auto MaxVF =
5023 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5024 MaxSafeScalableVF, FoldTailByMasking))
5025 if (MaxVF.isScalable()) {
5026 Result.ScalableVF = MaxVF;
5027 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5028                   << "\n");
5030
5031 return Result;
5032}
5033
5034FixedScalableVFPair
5035LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5036 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5037 // TODO: It may be useful to do this, since it's still likely to be
5038 // dynamically uniform if the target can skip.
5039 reportVectorizationFailure(
5040 "Not inserting runtime ptr check for divergent target",
5041 "runtime pointer checks needed. Not enabled for divergent target",
5042 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5043 return FixedScalableVFPair::getNone();
5044 }
5045
5046 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5047 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5048 if (TC == 1) {
5049 reportVectorizationFailure("Single iteration (non) loop",
5050 "loop trip count is one, irrelevant for vectorization",
5051 "SingleIterationLoop", ORE, TheLoop);
5052 return FixedScalableVFPair::getNone();
5053 }
5054
5055 switch (ScalarEpilogueStatus) {
5056 case CM_ScalarEpilogueAllowed:
5057 return computeFeasibleMaxVF(TC, UserVF, false);
5058 case CM_ScalarEpilogueNotAllowedUsePredicate:
5059 [[fallthrough]];
5060 case CM_ScalarEpilogueNotNeededUsePredicate:
5061 LLVM_DEBUG(
5062     dbgs() << "LV: vector predicate hint/switch found.\n"
5063            << "LV: Not allowing scalar epilogue, creating predicated "
5064            << "vector loop.\n");
5065 break;
5066 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5067 // fallthrough as a special case of OptForSize
5068 case CM_ScalarEpilogueNotAllowedOptSize:
5069 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5070 LLVM_DEBUG(
5071     dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5072 else
5073 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5074                   << "count.\n");
5075
5076 // Bail if runtime checks are required, which are not good when optimising
5077 // for size.
5078 if (runtimeChecksRequired())
5079 return FixedScalableVFPair::getNone();
5080
5081 break;
5082 }
5083
5084 // The only loops we can vectorize without a scalar epilogue are loops with
5085 // a bottom-test and a single exiting block. We'd have to handle the fact
5086 // that not every instruction executes on the last iteration. This will
5087 // require a lane mask which varies through the vector loop body. (TODO)
5088 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5089 // If there was a tail-folding hint/switch, but we can't fold the tail by
5090 // masking, fallback to a vectorization with a scalar epilogue.
5091 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5092 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5093                      "scalar epilogue instead.\n");
5094 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5095 return computeFeasibleMaxVF(TC, UserVF, false);
5096 }
5097 return FixedScalableVFPair::getNone();
5098 }
5099
5100 // Now try tail folding.
5101
5102 // Invalidate interleave groups that require an epilogue if we can't mask
5103 // the interleave-group.
5104 if (!useMaskedInterleavedAccesses(TTI)) {
5105 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5106        "No decisions should have been taken at this point");
5107 // Note: There is no need to invalidate any cost modeling decisions here,
5108 // as none were taken so far.
5109 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5110 }
5111
5112 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5113 // Avoid tail folding if the trip count is known to be a multiple of any VF
5114 // we chose.
5115 // FIXME: The condition below pessimises the case for fixed-width vectors,
5116 // when scalable VFs are also candidates for vectorization.
5117 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5118 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5119 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5120        "MaxFixedVF must be a power of 2");
5121 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5122 : MaxFixedVF.getFixedValue();
5123 ScalarEvolution *SE = PSE.getSE();
5124 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5125 const SCEV *ExitCount = SE->getAddExpr(
5126 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5127 const SCEV *Rem = SE->getURemExpr(
5128 SE->applyLoopGuards(ExitCount, TheLoop),
5129 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5130 if (Rem->isZero()) {
5131 // Accept MaxFixedVF if we do not have a tail.
5132 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: No tail will remain for any chosen VF.\n"
; } } while (false)
;
5133 return MaxFactors;
5134 }
5135 }
5136
5137 // If we don't know the precise trip count, or if the trip count that we
5138 // found modulo the vectorization factor is not zero, try to fold the tail
5139 // by masking.
5140 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5141 if (Legal->prepareToFoldTailByMasking()) {
5142 FoldTailByMasking = true;
5143 return MaxFactors;
5144 }
5145
5146 // If there was a tail-folding hint/switch, but we can't fold the tail by
5147 // masking, fallback to a vectorization with a scalar epilogue.
5148 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5149 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
5150 "scalar epilogue instead.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
;
5151 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5152 return MaxFactors;
5153 }
5154
5155 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5156 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"
; } } while (false)
;
5157 return FixedScalableVFPair::getNone();
5158 }
5159
5160 if (TC == 0) {
5161 reportVectorizationFailure(
5162 "Unable to calculate the loop count due to complex control flow",
5163 "unable to calculate the loop count due to complex control flow",
5164 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5165 return FixedScalableVFPair::getNone();
5166 }
5167
5168 reportVectorizationFailure(
5169 "Cannot optimize for size and vectorize at the same time.",
5170 "cannot optimize for size and vectorize at the same time. "
5171 "Enable vectorization of this loop with '#pragma clang loop "
5172 "vectorize(enable)' when compiling with -Os/-Oz",
5173 "NoTailLoopWithOptForSize", ORE, TheLoop);
5174 return FixedScalableVFPair::getNone();
5175}
5176
5177ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5178 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5179 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5180 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5181 TypeSize WidestRegister = TTI.getRegisterBitWidth(
5182 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5183 : TargetTransformInfo::RGK_FixedWidthVector);
5184
5185 // Convenience function to return the minimum of two ElementCounts.
5186 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5187 assert((LHS.isScalable() == RHS.isScalable()) &&
5188 "Scalable flags must match");
5189 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5190 };
5191
5192 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5193 // Note that both WidestRegister and WidestType may not be powers of 2.
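// Illustrative example, assuming hypothetical values: a 128-bit widest
// register and a 32-bit widest type give PowerOf2Floor(128 / 32) = 4, i.e. a
// fixed VF of 4 (or a scalable VF of vscale x 4) before clamping to MaxSafeVF.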
5194 auto MaxVectorElementCount = ElementCount::get(
5195 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5196 ComputeScalableMaxVF);
5197 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5198 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
<< (MaxVectorElementCount * WidestType) << " bits.\n"
; } } while (false)
5199 << (MaxVectorElementCount * WidestType) << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
<< (MaxVectorElementCount * WidestType) << " bits.\n"
; } } while (false)
;
5200
5201 if (!MaxVectorElementCount) {
5202 LLVM_DEBUG(dbgs() << "LV: The target has no "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
5203 << (ComputeScalableMaxVF ? "scalable" : "fixed")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
5204 << " vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
;
5205 return ElementCount::getFixed(1);
5206 }
5207
5208 const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5209 if (ConstTripCount &&
5210 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5211 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5212 // If the loop trip count (TC) is known at compile time, there is no point
5213 // in choosing a VF greater than TC (as done in the loop below). Select the
5214 // maximum power of two which doesn't exceed TC.
5215 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5216 // when the TC is less than or equal to the known number of lanes.
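// Illustrative example, assuming a hypothetical trip count: if this branch is
// taken with ConstTripCount = 20 (and the tail need not be folded), the VF is
// clamped to PowerOf2Floor(20) = 16.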
5217 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5218 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
5219 "exceeding the constant trip count: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
5220 << ClampedConstTripCount << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
;
5221 return ElementCount::getFixed(ClampedConstTripCount);
5222 }
5223
5224 TargetTransformInfo::RegisterKind RegKind =
5225 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5226 : TargetTransformInfo::RGK_FixedWidthVector;
5227 ElementCount MaxVF = MaxVectorElementCount;
5228 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5229 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5230 auto MaxVectorElementCountMaxBW = ElementCount::get(
5231 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5232 ComputeScalableMaxVF);
5233 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5234
5235 // Collect all viable vectorization factors larger than the default MaxVF
5236 // (i.e. MaxVectorElementCount).
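// Illustrative example, assuming hypothetical values: with
// MaxVectorElementCount = 4 and MaxVectorElementCountMaxBW = 16, the candidate
// set built below is {8, 16}.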
5237 SmallVector<ElementCount, 8> VFs;
5238 for (ElementCount VS = MaxVectorElementCount * 2;
5239 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5240 VFs.push_back(VS);
5241
5242 // For each VF calculate its register usage.
5243 auto RUs = calculateRegisterUsage(VFs);
5244
5245 // Select the largest VF which doesn't require more registers than existing
5246 // ones.
5247 for (int i = RUs.size() - 1; i >= 0; --i) {
5248 bool Selected = true;
5249 for (auto &pair : RUs[i].MaxLocalUsers) {
5250 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5251 if (pair.second > TargetNumRegisters)
5252 Selected = false;
5253 }
5254 if (Selected) {
5255 MaxVF = VFs[i];
5256 break;
5257 }
5258 }
5259 if (ElementCount MinVF =
5260 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5261 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5262 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
<< MaxVF << ") with target's minimum: " <<
MinVF << '\n'; } } while (false)
5263 << ") with target's minimum: " << MinVF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
<< MaxVF << ") with target's minimum: " <<
MinVF << '\n'; } } while (false)
;
5264 MaxVF = MinVF;
5265 }
5266 }
5267
5268 // Invalidate any widening decisions we might have made, in case the loop
5269 // requires predication (decided later), but we have already made some
5270 // load/store widening decisions.
5271 invalidateCostModelingDecisions();
5272 }
5273 return MaxVF;
5274}
5275
5276Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5277 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5278 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5279 auto Min = Attr.getVScaleRangeMin();
5280 auto Max = Attr.getVScaleRangeMax();
5281 if (Max && Min == Max)
5282 return Max;
5283 }
5284
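// Illustrative example, assuming a hypothetical attribute: a function carrying
// vscale_range(2,2) returns 2 from the branch above; without a fixed range,
// the target hook below supplies the tuning value.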
5285 return TTI.getVScaleForTuning();
5286}
5287
5288bool LoopVectorizationCostModel::isMoreProfitable(
5289 const VectorizationFactor &A, const VectorizationFactor &B) const {
5290 InstructionCost CostA = A.Cost;
5291 InstructionCost CostB = B.Cost;
5292
5293 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5294
5295 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5296 MaxTripCount) {
5297 // If we are folding the tail and the trip count is a known (possibly small)
5298 // constant, the trip count will be rounded up to an integer number of
5299 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5300 // which we compare directly. When not folding the tail, the total cost will
5301 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5302 // approximated with the per-lane cost below instead of using the tripcount
5303 // as here.
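// Illustrative example, assuming hypothetical factors: with MaxTripCount = 10,
// A = {Width 4, Cost 8} and B = {Width 8, Cost 14}, RTCostA = 8 * ceil(10/4) =
// 24 and RTCostB = 14 * ceil(10/8) = 28, so A is deemed more profitable even
// though B has the lower per-lane cost.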
5304 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5305 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5306 return RTCostA < RTCostB;
5307 }
5308
5309 // Improve estimate for the vector width if it is scalable.
5310 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5311 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5312 if (Optional<unsigned> VScale = getVScaleForTuning()) {
5313 if (A.Width.isScalable())
5314 EstimatedWidthA *= VScale.value();
5315 if (B.Width.isScalable())
5316 EstimatedWidthB *= VScale.value();
5317 }
5318
5319 // Assume vscale may be larger than 1 (or the value being tuned for), so
5320 // that scalable vectorization is slightly favorable over fixed-width
5321 // vectorization (hence the <= below: a per-lane cost tie favors scalable).
5322 if (A.Width.isScalable() && !B.Width.isScalable())
5323 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5324
5325 // To avoid the need for FP division:
5326 // (CostA / A.Width) < (CostB / B.Width)
5327 // <=> (CostA * B.Width) < (CostB * A.Width)
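// Illustrative example, assuming hypothetical costs: CostA = 4 with estimated
// width 4 versus CostB = 10 with estimated width 8 compares 4 * 8 = 32 against
// 10 * 4 = 40, so A is more profitable (per-lane 1.0 vs 1.25).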
5328 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5329}
5330
5331VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5332 const ElementCountSet &VFCandidates) {
5333 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5334 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: "
<< ExpectedCost << ".\n"; } } while (false)
;
5335 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5336 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5337 "Expected Scalar VF to be a candidate");
5338
5339 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5340 ExpectedCost);
5341 VectorizationFactor ChosenFactor = ScalarCost;
5342
5343 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5344 if (ForceVectorization && VFCandidates.size() > 1) {
5345 // Ignore scalar width, because the user explicitly wants vectorization.
5346 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5347 // evaluation.
5348 ChosenFactor.Cost = InstructionCost::getMax();
5349 }
5350
5351 SmallVector<InstructionVFPair> InvalidCosts;
5352 for (const auto &i : VFCandidates) {
5353 // The cost for scalar VF=1 is already calculated, so ignore it.
5354 if (i.isScalar())
5355 continue;
5356
5357 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5358 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5359
5360#ifndef NDEBUG
5361 unsigned AssumedMinimumVscale = 1;
5362 if (Optional<unsigned> VScale = getVScaleForTuning())
5363 AssumedMinimumVscale = *VScale;
5364 unsigned Width =
5365 Candidate.Width.isScalable()
5366 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5367 : Candidate.Width.getFixedValue();
5368 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
<< i << " costs: " << (Candidate.Cost / Width
); } } while (false)
5369 << " costs: " << (Candidate.Cost / Width))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
<< i << " costs: " << (Candidate.Cost / Width
); } } while (false)
;
5370 if (i.isScalable())
5371 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")"; } } while (false
)
5372 << AssumedMinimumVscale << ")")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")"; } } while (false
)
;
5373 LLVM_DEBUG(dbgs() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << ".\n"; } } while (false
)
;
5374#endif
5375
5376 if (!C.second && !ForceVectorization) {
5377 LLVM_DEBUG(
5378 dbgs() << "LV: Not considering vector loop of width " << i
5379 << " because it will not generate any vector instructions.\n");
5380 continue;
5381 }
5382
5383 // If profitable, add it to the ProfitableVFs list.
5384 if (isMoreProfitable(Candidate, ScalarCost))
5385 ProfitableVFs.push_back(Candidate);
5386
5387 if (isMoreProfitable(Candidate, ChosenFactor))
5388 ChosenFactor = Candidate;
5389 }
5390
5391 // Emit a report of VFs with invalid costs in the loop.
5392 if (!InvalidCosts.empty()) {
5393 // Group the remarks per instruction, keeping the instruction order from
5394 // InvalidCosts.
5395 std::map<Instruction *, unsigned> Numbering;
5396 unsigned I = 0;
5397 for (auto &Pair : InvalidCosts)
5398 if (!Numbering.count(Pair.first))
5399 Numbering[Pair.first] = I++;
5400
5401 // Sort the list, first on instruction(number) then on VF.
5402 llvm::sort(InvalidCosts,
5403 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5404 if (Numbering[A.first] != Numbering[B.first])
5405 return Numbering[A.first] < Numbering[B.first];
5406 ElementCountComparator ECC;
5407 return ECC(A.second, B.second);
5408 });
5409
5410 // For a list of ordered instruction-vf pairs:
5411 // [(load, vf1), (load, vf2), (store, vf1)]
5412 // Group the instructions together to emit separate remarks for:
5413 // load (vf1, vf2)
5414 // store (vf1)
5415 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5416 auto Subset = ArrayRef<InstructionVFPair>();
5417 do {
5418 if (Subset.empty())
5419 Subset = Tail.take_front(1);
5420
5421 Instruction *I = Subset.front().first;