Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 2367, column 9
Value stored to 'InitVecValSTy' during its initialization is never read
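
The statement flagged at line 2367 is not included in the excerpt below. For context, the analyzer's deadcode checker reports this pattern when a variable's initializer computes and stores a value that is never read before the variable is reassigned or goes unused. A minimal, hypothetical sketch of the pattern (names invented, not the actual code at line 2367):

    // Dead store: the value stored to 'Width' during its initialization is
    // never read, because 'Width' is overwritten before any use.
    unsigned demoWidth(unsigned A, unsigned B) {
      unsigned Width = A + B; // flagged: initializer result is never read
      Width = B;              // first value that is actually read
      return Width;
    }

The usual fix is to drop the unused initializer or to initialize the variable directly with the value that is actually used.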

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/llvm/lib/Transforms/Vectorize -I include -I /build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-25-232935-20746-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220125101009+ceec4383681c/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
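// Illustration (not part of LoopVectorize.cpp): the "wide iteration" described
// in the header comment above, sketched in plain C++ for an assumed vector
// width (VF) of 4.
//
//   // Original scalar loop: one element per iteration.
//   for (int i = 0; i < n; ++i)
//     a[i] = a[i] + b[i];
//
//   // After vectorization with VF = 4: the index is incremented by the SIMD
//   // width and each iteration handles four elements; a scalar epilogue
//   // covers the remainder when n is not a multiple of 4.
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     for (int j = 0; j < 4; ++j)      // conceptually a single SIMD operation
//       a[i + j] = a[i + j] + b[i + j];
//   for (; i < n; ++i)                 // scalar epilogue
//     a[i] = a[i] + b[i];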
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/LLVMContext.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/PatternMatch.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/InitializePasses.h"
127#include "llvm/Pass.h"
128#include "llvm/Support/Casting.h"
129#include "llvm/Support/CommandLine.h"
130#include "llvm/Support/Compiler.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/InjectTLIMappings.h"
138#include "llvm/Transforms/Utils/LoopSimplify.h"
139#include "llvm/Transforms/Utils/LoopUtils.h"
140#include "llvm/Transforms/Utils/LoopVersioning.h"
141#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142#include "llvm/Transforms/Utils/SizeOpts.h"
143#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144#include <algorithm>
145#include <cassert>
146#include <cstdint>
147#include <cstdlib>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
178static cl::opt<bool> EnableEpilogueVectorization(
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
182static cl::opt<unsigned> EpilogueVectorizationForceVF(
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
188static cl::opt<unsigned> EpilogueVectorizationMinVF(
189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
195static cl::opt<unsigned> TinyTripCountVectorThreshold(
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
201static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks with a "
204 "vectorize(enable) pragma."));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and this lists all options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
211namespace PreferPredicateTy {
212 enum Option {
213 ScalarEpilogue = 0,
214 PredicateElseScalarEpilogue,
215 PredicateOrDontVectorize
216 };
217} // namespace PreferPredicateTy
218
219static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220 "prefer-predicate-over-epilogue",
221 cl::init(PreferPredicateTy::ScalarEpilogue),
222 cl::Hidden,
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
237static cl::opt<bool> MaximizeBandwidth(
238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239 cl::desc("Maximize bandwidth when selecting vectorization factor which "
240 "will be determined by the smallest type in loop."));
241
242static cl::opt<bool> EnableInterleavedMemAccesses(
243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245
246/// An interleave-group may need masking if it resides in a block that needs
247/// predication, or in order to mask away gaps.
248static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251
252static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254 cl::desc("We don't interleave loops with a estimated constant trip count "
255 "below this number"));
256
257static cl::opt<unsigned> ForceTargetNumScalarRegs(
258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259 cl::desc("A flag that overrides the target's number of scalar registers."));
260
261static cl::opt<unsigned> ForceTargetNumVectorRegs(
262 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263 cl::desc("A flag that overrides the target's number of vector registers."));
264
265static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's max interleave factor for "
268 "scalar loops."));
269
270static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272 cl::desc("A flag that overrides the target's max interleave factor for "
273 "vectorized loops."));
274
275static cl::opt<unsigned> ForceTargetInstructionCost(
276 "force-target-instruction-cost", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's expected cost for "
278 "an instruction to a single constant value. Mostly "
279 "useful for getting consistent testing."));
280
281static cl::opt<bool> ForceTargetSupportsScalableVectors(
282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283 cl::desc(
284 "Pretend that scalable vectors are supported, even if the target does "
285 "not support them. This flag should only be used for testing."));
286
287static cl::opt<unsigned> SmallLoopCost(
288 "small-loop-cost", cl::init(20), cl::Hidden,
289 cl::desc(
290 "The cost of a loop that is considered 'small' by the interleaver."));
291
292static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294 cl::desc("Enable the use of the block frequency analysis to access PGO "
295 "heuristics minimizing code growth in cold regions and being more "
296 "aggressive in hot regions."));
297
298// Runtime interleave loops for load/store throughput.
299static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301 cl::desc(
302 "Enable runtime interleaving until load/store ports are saturated"));
303
304/// Interleave small loops with scalar reductions.
305static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307 cl::desc("Enable interleaving for loops with small iteration counts that "
308 "contain scalar reductions to expose ILP."));
309
310/// The number of stores in a loop that are allowed to need predication.
311static cl::opt<unsigned> NumberOfStoresToPredicate(
312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313 cl::desc("Max number of stores to be predicated behind an if."));
314
315static cl::opt<bool> EnableIndVarRegisterHeur(
316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317 cl::desc("Count the induction variable only once when interleaving"));
318
319static cl::opt<bool> EnableCondStoresVectorization(
320 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321 cl::desc("Enable if predication of stores during vectorization."));
322
323static cl::opt<unsigned> MaxNestedScalarReductionIC(
324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325 cl::desc("The maximum interleave count to use when interleaving a scalar "
326 "reduction in a nested loop."));
327
328static cl::opt<bool>
329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330 cl::Hidden,
331 cl::desc("Prefer in-loop vector reductions, "
332 "overriding the targets preference."));
333
334static cl::opt<bool> ForceOrderedReductions(
335 "force-ordered-reductions", cl::init(false), cl::Hidden,
336 cl::desc("Enable the vectorisation of loops with in-order (strict) "
337 "FP reductions"));
338
339static cl::opt<bool> PreferPredicatedReductionSelect(
340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341 cl::desc(
342 "Prefer predicating a reduction operation over an after loop select."));
343
344cl::opt<bool> EnableVPlanNativePath(
345 "enable-vplan-native-path", cl::init(false), cl::Hidden,
346 cl::desc("Enable VPlan-native vectorization path with "
347 "support for outer loop vectorization."));
348
349// FIXME: Remove this switch once we have divergence analysis. Currently we
350// assume divergent non-backedge branches when this switch is true.
351cl::opt<bool> EnableVPlanPredication(
352 "enable-vplan-predication", cl::init(false), cl::Hidden,
353 cl::desc("Enable VPlan-native vectorization path predicator with "
354 "support for outer loop vectorization."));
355
356// This flag enables the stress testing of the VPlan H-CFG construction in the
357// VPlan-native vectorization path. It must be used in conjunction with
358// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359// verification of the H-CFGs built.
360static cl::opt<bool> VPlanBuildStressTest(
361 "vplan-build-stress-test", cl::init(false), cl::Hidden,
362 cl::desc(
363 "Build VPlan for every supported loop nest in the function and bail "
364 "out right after the build (stress test the VPlan H-CFG construction "
365 "in the VPlan-native vectorization path)."));
366
367cl::opt<bool> llvm::EnableLoopInterleaving(
368 "interleave-loops", cl::init(true), cl::Hidden,
369 cl::desc("Enable loop interleaving in Loop vectorization passes"));
370cl::opt<bool> llvm::EnableLoopVectorization(
371 "vectorize-loops", cl::init(true), cl::Hidden,
372 cl::desc("Run the Loop vectorization passes"));
373
374cl::opt<bool> PrintVPlansInDotFormat(
375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376 cl::desc("Use dot format instead of plain text when dumping VPlans"));
377
378/// A helper function that returns true if the given type is irregular. The
379/// type is irregular if its allocated size doesn't equal the store size of an
380/// element of the corresponding vector type.
381static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382 // Determine if an array of N elements of type Ty is "bitcast compatible"
383 // with a <N x Ty> vector.
384 // This is only true if there is no padding between the array elements.
385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386}
387
388/// A helper function that returns the reciprocal of the block probability of
389/// predicated blocks. If we return X, we are assuming the predicated block
390/// will execute once for every X iterations of the loop header.
391///
392/// TODO: We should use actual block probability here, if available. Currently,
393/// we always assume predicated blocks have a 50% chance of executing.
394static unsigned getReciprocalPredBlockProb() { return 2; }
395
396/// A helper function that returns an integer or floating-point constant with
397/// value C.
398static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400 : ConstantFP::get(Ty, C);
401}
402
403/// Returns "best known" trip count for the specified loop \p L as defined by
404/// the following procedure:
405/// 1) Returns exact trip count if it is known.
406/// 2) Returns expected trip count according to profile data if any.
407/// 3) Returns upper bound estimate if it is known.
408/// 4) Returns None if all of the above failed.
409static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410 // Check if exact trip count is known.
411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412 return ExpectedTC;
413
414 // Check if there is an expected trip count available from profile data.
415 if (LoopVectorizeWithBlockFrequency)
416 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417 return EstimatedTC;
418
419 // Check if upper bound estimate is known.
420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421 return ExpectedTC;
422
423 return None;
424}
425
426// Forward declare GeneratedRTChecks.
427class GeneratedRTChecks;
428
429namespace llvm {
430
431AnalysisKey ShouldRunExtraVectorPasses::Key;
432
433/// InnerLoopVectorizer vectorizes loops which contain only one basic
434/// block to a specified vectorization factor (VF).
435/// This class performs the widening of scalars into vectors, or multiple
436/// scalars. This class also implements the following features:
437/// * It inserts an epilogue loop for handling loops that don't have iteration
438/// counts that are known to be a multiple of the vectorization factor.
439/// * It handles the code generation for reduction variables.
440/// * Scalarization (implementation using scalars) of un-vectorizable
441/// instructions.
442/// InnerLoopVectorizer does not perform any vectorization-legality
443/// checks, and relies on the caller to check for the different legality
444/// aspects. The InnerLoopVectorizer relies on the
445/// LoopVectorizationLegality class to provide information about the induction
446/// and reduction variables that were found to a given vectorization factor.
447class InnerLoopVectorizer {
448public:
449 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450 LoopInfo *LI, DominatorTree *DT,
451 const TargetLibraryInfo *TLI,
452 const TargetTransformInfo *TTI, AssumptionCache *AC,
453 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460 PSI(PSI), RTChecks(RTChecks) {
461 // Query this against the original loop and save it here because the profile
462 // of the original loop header may change as the transformation happens.
463 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465 }
466
467 virtual ~InnerLoopVectorizer() = default;
468
469 /// Create a new empty loop that will contain vectorized instructions later
470 /// on, while the old loop will be used as the scalar remainder. Control flow
471 /// is generated around the vectorized (and scalar epilogue) loops consisting
472 /// of various checks and bypasses. Return the pre-header block of the new
473 /// loop and the start value for the canonical induction, if it is != 0. The
474 /// latter is the case when vectorizing the epilogue loop. In the case of
475 /// epilogue vectorization, this function is overridden to handle the more
476 /// complex control flow around the loops.
477 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
478
479 /// Widen a single call instruction within the innermost loop.
480 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
481 VPTransformState &State);
482
483 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
484 void fixVectorizedLoop(VPTransformState &State);
485
486 // Return true if any runtime check is added.
487 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488
489 /// A type for vectorized values in the new loop. Each value from the
490 /// original loop, when vectorized, is represented by UF vector values in the
491 /// new unrolled loop, where UF is the unroll factor.
492 using VectorParts = SmallVector<Value *, 2>;
493
494 /// Vectorize a single first-order recurrence or pointer induction PHINode in
495 /// a block. This method handles the induction variable canonicalization. It
496 /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
497 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
498 VPTransformState &State);
499
500 /// A helper function to scalarize a single Instruction in the innermost loop.
501 /// Generates a sequence of scalar instances for each lane between \p MinLane
502 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
503 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
504 /// Instr's operands.
505 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
506 const VPIteration &Instance, bool IfPredicateInstr,
507 VPTransformState &State);
508
509 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
510 /// is provided, the integer induction variable will first be truncated to
511 /// the corresponding type. \p CanonicalIV is the scalar value generated for
512 /// the canonical induction variable.
513 void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
514 VPTransformState &State, Value *CanonicalIV);
515
516 /// Construct the vector value of a scalarized value \p V one lane at a time.
517 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
518 VPTransformState &State);
519
520 /// Try to vectorize interleaved access group \p Group with the base address
521 /// given in \p Addr, optionally masking the vector operations if \p
522 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
523 /// values in the vectorized loop.
524 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
525 ArrayRef<VPValue *> VPDefs,
526 VPTransformState &State, VPValue *Addr,
527 ArrayRef<VPValue *> StoredValues,
528 VPValue *BlockInMask = nullptr);
529
530 /// Set the debug location in the builder \p Ptr using the debug location in
531 /// \p V. If \p Ptr is None then it uses the class member's Builder.
532 void setDebugLocFromInst(const Value *V,
533 Optional<IRBuilder<> *> CustomBuilder = None);
534
535 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
536 void fixNonInductionPHIs(VPTransformState &State);
537
538 /// Returns true if the reordering of FP operations is not allowed, but we are
539 /// able to vectorize with strict in-order reductions for the given RdxDesc.
540 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
541
542 /// Create a broadcast instruction. This method generates a broadcast
543 /// instruction (shuffle) for loop invariant values and for the induction
544 /// value. If this is the induction variable then we extend it to N, N+1, ...
545 /// this is needed because each iteration in the loop corresponds to a SIMD
546 /// element.
547 virtual Value *getBroadcastInstrs(Value *V);
548
549 /// Add metadata from one instruction to another.
550 ///
551 /// This includes both the original MDs from \p From and additional ones (\see
552 /// addNewMetadata). Use this for *newly created* instructions in the vector
553 /// loop.
554 void addMetadata(Instruction *To, Instruction *From);
555
556 /// Similar to the previous function but it adds the metadata to a
557 /// vector of instructions.
558 void addMetadata(ArrayRef<Value *> To, Instruction *From);
559
560 // Returns the resume value (bc.merge.rdx) for a reduction as
561 // generated by fixReduction.
562 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
563
564protected:
565 friend class LoopVectorizationPlanner;
566
567 /// A small list of PHINodes.
568 using PhiVector = SmallVector<PHINode *, 4>;
569
570 /// A type for scalarized values in the new loop. Each value from the
571 /// original loop, when scalarized, is represented by UF x VF scalar values
572 /// in the new unrolled loop, where UF is the unroll factor and VF is the
573 /// vectorization factor.
574 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575
576 /// Set up the values of the IVs correctly when exiting the vector loop.
577 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578 Value *CountRoundDown, Value *EndValue,
579 BasicBlock *MiddleBlock);
580
581 /// Introduce a conditional branch (on true, condition to be set later) at the
582 /// end of the header=latch connecting it to itself (across the backedge) and
583 /// to the exit block of \p L.
584 void createHeaderBranch(Loop *L);
585
586 /// Handle all cross-iteration phis in the header.
587 void fixCrossIterationPHIs(VPTransformState &State);
588
589 /// Create the exit value of first order recurrences in the middle block and
590 /// update their users.
591 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
592 VPTransformState &State);
593
594 /// Create code for the loop exit value of the reduction.
595 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
596
597 /// Clear NSW/NUW flags from reduction instructions if necessary.
598 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
599 VPTransformState &State);
600
601 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
602 /// means we need to add the appropriate incoming value from the middle
603 /// block as exiting edges from the scalar epilogue loop (if present) are
604 /// already in place, and we exit the vector loop exclusively to the middle
605 /// block.
606 void fixLCSSAPHIs(VPTransformState &State);
607
608 /// Iteratively sink the scalarized operands of a predicated instruction into
609 /// the block that was created for it.
610 void sinkScalarOperands(Instruction *PredInst);
611
612 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
613 /// represented as.
614 void truncateToMinimalBitwidths(VPTransformState &State);
615
616 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
617 /// variable on which to base the steps, \p Step is the size of the step, and
618 /// \p EntryVal is the value from the original loop that maps to the steps.
619 /// Note that \p EntryVal doesn't have to be an induction variable - it
620 /// can also be a truncate instruction.
621 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
622 const InductionDescriptor &ID, VPValue *Def,
623 VPTransformState &State);
624
625 /// Create a vector induction phi node based on an existing scalar one. \p
626 /// EntryVal is the value from the original loop that maps to the vector phi
627 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
628 /// truncate instruction, instead of widening the original IV, we widen a
629 /// version of the IV truncated to \p EntryVal's type.
630 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
631 Value *Step, Value *Start,
632 Instruction *EntryVal, VPValue *Def,
633 VPTransformState &State);
634
635 /// Returns true if an instruction \p I should be scalarized instead of
636 /// vectorized for the chosen vectorization factor.
637 bool shouldScalarizeInstruction(Instruction *I) const;
638
639 /// Returns true if we should generate a scalar version of \p IV.
640 bool needsScalarInduction(Instruction *IV) const;
641
642 /// Returns (and creates if needed) the original loop trip count.
643 Value *getOrCreateTripCount(Loop *NewLoop);
644
645 /// Returns (and creates if needed) the trip count of the widened loop.
646 Value *getOrCreateVectorTripCount(Loop *NewLoop);
647
648 /// Returns a bitcasted value to the requested vector type.
649 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
650 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
651 const DataLayout &DL);
652
653 /// Emit a bypass check to see if the vector trip count is zero, including if
654 /// it overflows.
655 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
656
657 /// Emit a bypass check to see if all of the SCEV assumptions we've
658 /// had to make are correct. Returns the block containing the checks or
659 /// nullptr if no checks have been added.
660 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
661
662 /// Emit bypass checks to check any memory assumptions we may have made.
663 /// Returns the block containing the checks or nullptr if no checks have been
664 /// added.
665 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
666
667 /// Compute the transformed value of Index at offset StartValue using step
668 /// StepValue.
669 /// For integer induction, returns StartValue + Index * StepValue.
670 /// For pointer induction, returns StartValue[Index * StepValue].
671 /// FIXME: The newly created binary instructions should contain nsw/nuw
672 /// flags, which can be found from the original scalar operations.
673 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
674 const DataLayout &DL,
675 const InductionDescriptor &ID,
676 BasicBlock *VectorHeader) const;
677
678 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
679 /// vector loop preheader, middle block and scalar preheader. Also
680 /// allocate a loop object for the new vector loop and return it.
681 Loop *createVectorLoopSkeleton(StringRef Prefix);
682
683 /// Create new phi nodes for the induction variables to resume iteration count
684 /// in the scalar epilogue, from where the vectorized loop left off.
685 /// In cases where the loop skeleton is more complicated (eg. epilogue
686 /// vectorization) and the resume values can come from an additional bypass
687 /// block, the \p AdditionalBypass pair provides information about the bypass
688 /// block and the end value on the edge from bypass to this loop.
689 void createInductionResumeValues(
690 Loop *L,
691 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
692
693 /// Complete the loop skeleton by adding debug MDs, creating appropriate
694 /// conditional branches in the middle block, preparing the builder and
695 /// running the verifier. Take in the vector loop \p L as argument, and return
696 /// the preheader of the completed vector loop.
697 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
698
699 /// Add additional metadata to \p To that was not present on \p Orig.
700 ///
701 /// Currently this is used to add the noalias annotations based on the
702 /// inserted memchecks. Use this for instructions that are *cloned* into the
703 /// vector loop.
704 void addNewMetadata(Instruction *To, const Instruction *Orig);
705
706 /// Collect poison-generating recipes that may generate a poison value that is
707 /// used after vectorization, even when their operands are not poison. Those
708 /// recipes meet the following conditions:
709 /// * Contribute to the address computation of a recipe generating a widen
710 /// memory load/store (VPWidenMemoryInstructionRecipe or
711 /// VPInterleaveRecipe).
712 /// * Such a widen memory load/store has at least one underlying Instruction
713 /// that is in a basic block that needs predication and after vectorization
714 /// the generated instruction won't be predicated.
715 void collectPoisonGeneratingRecipes(VPTransformState &State);
716
717 /// Allow subclasses to override and print debug traces before/after vplan
718 /// execution, when trace information is requested.
719 virtual void printDebugTracesAtStart(){};
720 virtual void printDebugTracesAtEnd(){};
721
722 /// The original loop.
723 Loop *OrigLoop;
724
725 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
726 /// dynamic knowledge to simplify SCEV expressions and converts them to a
727 /// more usable form.
728 PredicatedScalarEvolution &PSE;
729
730 /// Loop Info.
731 LoopInfo *LI;
732
733 /// Dominator Tree.
734 DominatorTree *DT;
735
736 /// Alias Analysis.
737 AAResults *AA;
738
739 /// Target Library Info.
740 const TargetLibraryInfo *TLI;
741
742 /// Target Transform Info.
743 const TargetTransformInfo *TTI;
744
745 /// Assumption Cache.
746 AssumptionCache *AC;
747
748 /// Interface to emit optimization remarks.
749 OptimizationRemarkEmitter *ORE;
750
751 /// LoopVersioning. It's only set up (non-null) if memchecks were
752 /// used.
753 ///
754 /// This is currently only used to add no-alias metadata based on the
755 /// memchecks. The actual versioning is performed manually.
756 std::unique_ptr<LoopVersioning> LVer;
757
758 /// The vectorization SIMD factor to use. Each vector will have this many
759 /// vector elements.
760 ElementCount VF;
761
762 /// The vectorization unroll factor to use. Each scalar is vectorized to this
763 /// many different vector instructions.
764 unsigned UF;
765
766 /// The builder that we use
767 IRBuilder<> Builder;
768
769 // --- Vectorization state ---
770
771 /// The vector-loop preheader.
772 BasicBlock *LoopVectorPreHeader;
773
774 /// The scalar-loop preheader.
775 BasicBlock *LoopScalarPreHeader;
776
777 /// Middle Block between the vector and the scalar.
778 BasicBlock *LoopMiddleBlock;
779
780 /// The unique ExitBlock of the scalar loop if one exists. Note that
781 /// there can be multiple exiting edges reaching this block.
782 BasicBlock *LoopExitBlock;
783
784 /// The vector loop body.
785 BasicBlock *LoopVectorBody;
786
787 /// The scalar loop body.
788 BasicBlock *LoopScalarBody;
789
790 /// A list of all bypass blocks. The first block is the entry of the loop.
791 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
792
793 /// Store instructions that were predicated.
794 SmallVector<Instruction *, 4> PredicatedInstructions;
795
796 /// Trip count of the original loop.
797 Value *TripCount = nullptr;
798
799 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
800 Value *VectorTripCount = nullptr;
801
802 /// The legality analysis.
803 LoopVectorizationLegality *Legal;
804
805 /// The profitability analysis.
806 LoopVectorizationCostModel *Cost;
807
808 // Record whether runtime checks are added.
809 bool AddedSafetyChecks = false;
810
811 // Holds the end values for each induction variable. We save the end values
812 // so we can later fix-up the external users of the induction variables.
813 DenseMap<PHINode *, Value *> IVEndValues;
814
815 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
816 // fixed up at the end of vector code generation.
817 SmallVector<PHINode *, 8> OrigPHIsToFix;
818
819 /// BFI and PSI are used to check for profile guided size optimizations.
820 BlockFrequencyInfo *BFI;
821 ProfileSummaryInfo *PSI;
822
823 // Whether this loop should be optimized for size based on profile guided size
824 // optimizations.
825 bool OptForSizeBasedOnProfile;
826
827 /// Structure to hold information about generated runtime checks, responsible
828 /// for cleaning the checks, if vectorization turns out unprofitable.
829 GeneratedRTChecks &RTChecks;
830
831 // Holds the resume values for reductions in the loops, used to set the
832 // correct start value of reduction PHIs when vectorizing the epilogue.
833 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
834 ReductionResumeValues;
835};
836
837class InnerLoopUnroller : public InnerLoopVectorizer {
838public:
839 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
840 LoopInfo *LI, DominatorTree *DT,
841 const TargetLibraryInfo *TLI,
842 const TargetTransformInfo *TTI, AssumptionCache *AC,
843 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
844 LoopVectorizationLegality *LVL,
845 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
846 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
847 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
848 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
849 BFI, PSI, Check) {}
850
851private:
852 Value *getBroadcastInstrs(Value *V) override;
853};
854
855/// Encapsulate information regarding vectorization of a loop and its epilogue.
856/// This information is meant to be updated and used across two stages of
857/// epilogue vectorization.
858struct EpilogueLoopVectorizationInfo {
859 ElementCount MainLoopVF = ElementCount::getFixed(0);
860 unsigned MainLoopUF = 0;
861 ElementCount EpilogueVF = ElementCount::getFixed(0);
862 unsigned EpilogueUF = 0;
863 BasicBlock *MainLoopIterationCountCheck = nullptr;
864 BasicBlock *EpilogueIterationCountCheck = nullptr;
865 BasicBlock *SCEVSafetyCheck = nullptr;
866 BasicBlock *MemSafetyCheck = nullptr;
867 Value *TripCount = nullptr;
868 Value *VectorTripCount = nullptr;
869
870 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
871 ElementCount EVF, unsigned EUF)
872 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
873 assert(EUF == 1 &&
874 "A high UF for the epilogue loop is likely not beneficial.");
876};
877
878/// An extension of the inner loop vectorizer that creates a skeleton for a
879/// vectorized loop that has its epilogue (residual) also vectorized.
880/// The idea is to run the vplan on a given loop twice, firstly to setup the
881/// skeleton and vectorize the main loop, and secondly to complete the skeleton
882/// from the first step and vectorize the epilogue. This is achieved by
883/// deriving two concrete strategy classes from this base class and invoking
884/// them in succession from the loop vectorizer planner.
885class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
886public:
887 InnerLoopAndEpilogueVectorizer(
888 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
889 DominatorTree *DT, const TargetLibraryInfo *TLI,
890 const TargetTransformInfo *TTI, AssumptionCache *AC,
891 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
892 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
893 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
894 GeneratedRTChecks &Checks)
895 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
896 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
897 Checks),
898 EPI(EPI) {}
899
900 // Override this function to handle the more complex control flow around the
901 // three loops.
902 std::pair<BasicBlock *, Value *>
903 createVectorizedLoopSkeleton() final override {
904 return createEpilogueVectorizedLoopSkeleton();
905 }
906
907 /// The interface for creating a vectorized skeleton using one of two
908 /// different strategies, each corresponding to one execution of the vplan
909 /// as described above.
910 virtual std::pair<BasicBlock *, Value *>
911 createEpilogueVectorizedLoopSkeleton() = 0;
912
913 /// Holds and updates state information required to vectorize the main loop
914 /// and its epilogue in two separate passes. This setup helps us avoid
915 /// regenerating and recomputing runtime safety checks. It also helps us to
916 /// shorten the iteration-count-check path length for the cases where the
917 /// iteration count of the loop is so small that the main vector loop is
918 /// completely skipped.
919 EpilogueLoopVectorizationInfo &EPI;
920};
921
922/// A specialized derived class of inner loop vectorizer that performs
923/// vectorization of *main* loops in the process of vectorizing loops and their
924/// epilogues.
925class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
926public:
927 EpilogueVectorizerMainLoop(
928 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
929 DominatorTree *DT, const TargetLibraryInfo *TLI,
930 const TargetTransformInfo *TTI, AssumptionCache *AC,
931 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
932 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
933 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
934 GeneratedRTChecks &Check)
935 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
936 EPI, LVL, CM, BFI, PSI, Check) {}
937 /// Implements the interface for creating a vectorized skeleton using the
938 /// *main loop* strategy (ie the first pass of vplan execution).
939 std::pair<BasicBlock *, Value *>
940 createEpilogueVectorizedLoopSkeleton() final override;
941
942protected:
943 /// Emits an iteration count bypass check once for the main loop (when \p
944 /// ForEpilogue is false) and once for the epilogue loop (when \p
945 /// ForEpilogue is true).
946 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
947 bool ForEpilogue);
948 void printDebugTracesAtStart() override;
949 void printDebugTracesAtEnd() override;
950};
951
952// A specialized derived class of inner loop vectorizer that performs
953// vectorization of *epilogue* loops in the process of vectorizing loops and
954// their epilogues.
955class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
956public:
957 EpilogueVectorizerEpilogueLoop(
958 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
959 DominatorTree *DT, const TargetLibraryInfo *TLI,
960 const TargetTransformInfo *TTI, AssumptionCache *AC,
961 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
962 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
963 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
964 GeneratedRTChecks &Checks)
965 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
966 EPI, LVL, CM, BFI, PSI, Checks) {}
967 /// Implements the interface for creating a vectorized skeleton using the
968 /// *epilogue loop* strategy (ie the second pass of vplan execution).
969 std::pair<BasicBlock *, Value *>
970 createEpilogueVectorizedLoopSkeleton() final override;
971
972protected:
973 /// Emits an iteration count bypass check after the main vector loop has
974 /// finished to see if there are any iterations left to execute by either
975 /// the vector epilogue or the scalar epilogue.
976 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
977 BasicBlock *Bypass,
978 BasicBlock *Insert);
979 void printDebugTracesAtStart() override;
980 void printDebugTracesAtEnd() override;
981};
982} // end namespace llvm
983
984/// Look for a meaningful debug location on the instruction or its
985/// operands.
986static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
987 if (!I)
988 return I;
989
990 DebugLoc Empty;
991 if (I->getDebugLoc() != Empty)
992 return I;
993
994 for (Use &Op : I->operands()) {
995 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
996 if (OpInst->getDebugLoc() != Empty)
997 return OpInst;
998 }
999
1000 return I;
1001}
1002
1003void InnerLoopVectorizer::setDebugLocFromInst(
1004 const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1005 IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1006 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1007 const DILocation *DIL = Inst->getDebugLoc();
1008
1009 // When a FSDiscriminator is enabled, we don't need to add the multiply
1010 // factors to the discriminators.
1011 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1012 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1013 // FIXME: For scalable vectors, assume vscale=1.
1014 auto NewDIL =
1015 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1016 if (NewDIL)
1017 B->SetCurrentDebugLocation(NewDIL.getValue());
1018 else
1019 LLVM_DEBUG(dbgs()
1020 << "Failed to create new discriminator: "
1021 << DIL->getFilename() << " Line: " << DIL->getLine());
1022 } else
1023 B->SetCurrentDebugLocation(DIL);
1024 } else
1025 B->SetCurrentDebugLocation(DebugLoc());
1026}
1027
1028/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1029/// is passed, the message relates to that particular instruction.
1030#ifndef NDEBUG
1031static void debugVectorizationMessage(const StringRef Prefix,
1032 const StringRef DebugMsg,
1033 Instruction *I) {
1034 dbgs() << "LV: " << Prefix << DebugMsg;
1035 if (I != nullptr)
1036 dbgs() << " " << *I;
1037 else
1038 dbgs() << '.';
1039 dbgs() << '\n';
1040}
1041#endif
1042
1043/// Create an analysis remark that explains why vectorization failed
1044///
1045/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1046/// RemarkName is the identifier for the remark. If \p I is passed it is an
1047/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1048/// the location of the remark. \return the remark object that can be
1049/// streamed to.
1050static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1051 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1052 Value *CodeRegion = TheLoop->getHeader();
1053 DebugLoc DL = TheLoop->getStartLoc();
1054
1055 if (I) {
1056 CodeRegion = I->getParent();
1057 // If there is no debug location attached to the instruction, revert back to
1058 // using the loop's.
1059 if (I->getDebugLoc())
1060 DL = I->getDebugLoc();
1061 }
1062
1063 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1064}
1065
1066namespace llvm {
1067
1068/// Return a value for Step multiplied by VF.
1069Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1070 int64_t Step) {
1071 assert(Ty->isIntegerTy() && "Expected an integer step");
1072 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1073 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1074}
1075
1076/// Return the runtime value for VF.
1077Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1078 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1079 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1080}
1081
1082static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1083 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1084 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1085 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1086 return B.CreateUIToFP(RuntimeVF, FTy);
1087}
1088
1089void reportVectorizationFailure(const StringRef DebugMsg,
1090 const StringRef OREMsg, const StringRef ORETag,
1091 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1092 Instruction *I) {
1093 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1094 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1095 ORE->emit(
1096 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1097 << "loop not vectorized: " << OREMsg);
1098}
1099
1100void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1101 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1102 Instruction *I) {
1103 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1104 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1105 ORE->emit(
1106 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1107 << Msg);
1108}
1109
1110} // end namespace llvm
1111
1112#ifndef NDEBUG
1113/// \return string containing a file name and a line # for the given loop.
1114static std::string getDebugLocString(const Loop *L) {
1115 std::string Result;
1116 if (L) {
1117 raw_string_ostream OS(Result);
1118 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1119 LoopDbgLoc.print(OS);
1120 else
1121 // Just print the module name.
1122 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1123 OS.flush();
1124 }
1125 return Result;
1126}
1127#endif
1128
1129void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1130 const Instruction *Orig) {
1131 // If the loop was versioned with memchecks, add the corresponding no-alias
1132 // metadata.
1133 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1134 LVer->annotateInstWithNoAlias(To, Orig);
1135}
1136
1137void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1138 VPTransformState &State) {
1139
1140 // Collect recipes in the backward slice of `Root` that may generate a poison
1141 // value that is used after vectorization.
1142 SmallPtrSet<VPRecipeBase *, 16> Visited;
1143 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1144 SmallVector<VPRecipeBase *, 16> Worklist;
1145 Worklist.push_back(Root);
1146
1147 // Traverse the backward slice of Root through its use-def chain.
1148 while (!Worklist.empty()) {
1149 VPRecipeBase *CurRec = Worklist.back();
1150 Worklist.pop_back();
1151
1152 if (!Visited.insert(CurRec).second)
1153 continue;
1154
1155 // Prune search if we find another recipe generating a widen memory
1156 // instruction. Widen memory instructions involved in address computation
1157 // will lead to gather/scatter instructions, which don't need to be
1158 // handled.
1159 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1160 isa<VPInterleaveRecipe>(CurRec) ||
1161 isa<VPCanonicalIVPHIRecipe>(CurRec))
1162 continue;
1163
1164 // This recipe contributes to the address computation of a widen
1165 // load/store. Collect recipe if its underlying instruction has
1166 // poison-generating flags.
1167 Instruction *Instr = CurRec->getUnderlyingInstr();
1168 if (Instr && Instr->hasPoisonGeneratingFlags())
1169 State.MayGeneratePoisonRecipes.insert(CurRec);
1170
1171 // Add new definitions to the worklist.
1172 for (VPValue *operand : CurRec->operands())
1173 if (VPDef *OpDef = operand->getDef())
1174 Worklist.push_back(cast<VPRecipeBase>(OpDef));
1175 }
1176 });
1177
1178 // Traverse all the recipes in the VPlan and collect the poison-generating
1179 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1180 // VPInterleaveRecipe.
1181 auto Iter = depth_first(
1182 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1183 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1184 for (VPRecipeBase &Recipe : *VPBB) {
1185 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1186 Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1187 VPDef *AddrDef = WidenRec->getAddr()->getDef();
1188 if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1189 Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1190 collectPoisonGeneratingInstrsInBackwardSlice(
1191 cast<VPRecipeBase>(AddrDef));
1192 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1193 VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1194 if (AddrDef) {
1195 // Check if any member of the interleave group needs predication.
1196 const InterleaveGroup<Instruction> *InterGroup =
1197 InterleaveRec->getInterleaveGroup();
1198 bool NeedPredication = false;
1199 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1200 I < NumMembers; ++I) {
1201 Instruction *Member = InterGroup->getMember(I);
1202 if (Member)
1203 NeedPredication |=
1204 Legal->blockNeedsPredication(Member->getParent());
1205 }
1206
1207 if (NeedPredication)
1208 collectPoisonGeneratingInstrsInBackwardSlice(
1209 cast<VPRecipeBase>(AddrDef));
1210 }
1211 }
1212 }
1213 }
1214}
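// As a hedged example of what the collection above is for: a consecutive
// masked load whose address comes from an inbounds GEP may see out-of-bounds
// (and therefore poison) addresses in masked-off lanes, so the recipe carrying
// that poison-generating flag is recorded in State.MayGeneratePoisonRecipes so
// it can be treated specially later.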
1215
1216void InnerLoopVectorizer::addMetadata(Instruction *To,
1217 Instruction *From) {
1218 propagateMetadata(To, From);
1219 addNewMetadata(To, From);
1220}
1221
1222void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1223 Instruction *From) {
1224 for (Value *V : To) {
1225 if (Instruction *I = dyn_cast<Instruction>(V))
1226 addMetadata(I, From);
1227 }
1228}
1229
1230PHINode *InnerLoopVectorizer::getReductionResumeValue(
1231 const RecurrenceDescriptor &RdxDesc) {
1232 auto It = ReductionResumeValues.find(&RdxDesc);
1233 assert(It != ReductionResumeValues.end() &&
1234 "Expected to find a resume value for the reduction.");
1235 return It->second;
1236}
1237
1238namespace llvm {
1239
1240// Loop vectorization cost-model hints how the scalar epilogue loop should be
1241// lowered.
1242enum ScalarEpilogueLowering {
1243
1244 // The default: allowing scalar epilogues.
1245 CM_ScalarEpilogueAllowed,
1246
1247 // Vectorization with OptForSize: don't allow epilogues.
1248 CM_ScalarEpilogueNotAllowedOptSize,
1249
1250 // A special case of vectorization with OptForSize: loops with a very small
1251 // trip count are considered for vectorization under OptForSize, thereby
1252 // making sure the cost of their loop body is dominant, free of runtime
1253 // guards and scalar iteration overheads.
1254 CM_ScalarEpilogueNotAllowedLowTripLoop,
1255
1256 // Loop hint predicate indicating an epilogue is undesired.
1257 CM_ScalarEpilogueNotNeededUsePredicate,
1258
1259 // Directive indicating we must either tail fold or not vectorize
1260 CM_ScalarEpilogueNotAllowedUsePredicate
1261};
1262
1263/// ElementCountComparator creates a total ordering for ElementCount
1264/// for the purposes of using it in a set structure.
1265struct ElementCountComparator {
1266 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1267 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1268 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1269 }
1270};
1271using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
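// The comparator above orders all fixed element counts before all scalable
// ones, and within each group by the known minimum value; for example, a set
// containing {vscale x 4, 8, 2, vscale x 2} iterates as 2, 8, vscale x 2,
// vscale x 4.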
1272
1273/// LoopVectorizationCostModel - estimates the expected speedups due to
1274/// vectorization.
1275/// In many cases vectorization is not profitable. This can happen because of
1276/// a number of reasons. In this class we mainly attempt to predict the
1277/// expected speedup/slowdowns due to the supported instruction set. We use the
1278/// TargetTransformInfo to query the different backends for the cost of
1279/// different operations.
1280class LoopVectorizationCostModel {
1281public:
1282 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1283 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1284 LoopVectorizationLegality *Legal,
1285 const TargetTransformInfo &TTI,
1286 const TargetLibraryInfo *TLI, DemandedBits *DB,
1287 AssumptionCache *AC,
1288 OptimizationRemarkEmitter *ORE, const Function *F,
1289 const LoopVectorizeHints *Hints,
1290 InterleavedAccessInfo &IAI)
1291 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1292 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1293 Hints(Hints), InterleaveInfo(IAI) {}
1294
1295 /// \return An upper bound for the vectorization factors (both fixed and
1296 /// scalable). If the factors are 0, vectorization and interleaving should be
1297 /// avoided up front.
1298 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1299
1300 /// \return True if runtime checks are required for vectorization, and false
1301 /// otherwise.
1302 bool runtimeChecksRequired();
1303
1304 /// \return The most profitable vectorization factor and the cost of that VF.
1305 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1306 /// then this vectorization factor will be selected if vectorization is
1307 /// possible.
1308 VectorizationFactor
1309 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1310
1311 VectorizationFactor
1312 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1313 const LoopVectorizationPlanner &LVP);
1314
1315 /// Setup cost-based decisions for user vectorization factor.
1316 /// \return true if the UserVF is a feasible VF to be chosen.
1317 bool selectUserVectorizationFactor(ElementCount UserVF) {
1318 collectUniformsAndScalars(UserVF);
1319 collectInstsToScalarize(UserVF);
1320 return expectedCost(UserVF).first.isValid();
1321 }
1322
1323 /// \return The size (in bits) of the smallest and widest types in the code
1324 /// that needs to be vectorized. We ignore values that remain scalar such as
1325 /// 64 bit loop indices.
1326 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1327
1328 /// \return The desired interleave count.
1329 /// If interleave count has been specified by metadata it will be returned.
1330 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1331 /// are the selected vectorization factor and the cost of the selected VF.
1332 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1333
1334 /// Memory access instruction may be vectorized in more than one way.
1335 /// Form of instruction after vectorization depends on cost.
1336 /// This function takes cost-based decisions for Load/Store instructions
1337 /// and collects them in a map. This decisions map is used for building
1338 /// the lists of loop-uniform and loop-scalar instructions.
1339 /// The calculated cost is saved with widening decision in order to
1340 /// avoid redundant calculations.
1341 void setCostBasedWideningDecision(ElementCount VF);
1342
1343 /// A struct that represents some properties of the register usage
1344 /// of a loop.
1345 struct RegisterUsage {
1346 /// Holds the number of loop invariant values that are used in the loop.
1347 /// The key is ClassID of target-provided register class.
1348 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1349 /// Holds the maximum number of concurrent live intervals in the loop.
1350 /// The key is ClassID of target-provided register class.
1351 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1352 };
1353
1354 /// \return Returns information about the register usages of the loop for the
1355 /// given vectorization factors.
1356 SmallVector<RegisterUsage, 8>
1357 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1358
1359 /// Collect values we want to ignore in the cost model.
1360 void collectValuesToIgnore();
1361
1362 /// Collect all element types in the loop for which widening is needed.
1363 void collectElementTypesForWidening();
1364
1365 /// Split reductions into those that happen in the loop, and those that happen
1366 /// outside. In-loop reductions are collected into InLoopReductionChains.
1367 void collectInLoopReductions();
1368
1369 /// Returns true if we should use strict in-order reductions for the given
1370 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1371 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1372 /// of FP operations.
1373 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1374 return !Hints->allowReordering() && RdxDesc.isOrdered();
1375 }
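// For example, an ordered floating-point accumulation such as `Sum += A[i]`
// compiled without permission to reorder FP operations keeps IsOrdered set on
// its RecurrenceDescriptor, so this returns true and the reduction must be
// emitted strictly in-order rather than as a reassociated tree.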
1376
1377 /// \returns The smallest bitwidth each instruction can be represented with.
1378 /// The vector equivalents of these instructions should be truncated to this
1379 /// type.
1380 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1381 return MinBWs;
1382 }
1383
1384 /// \returns True if it is more profitable to scalarize instruction \p I for
1385 /// vectorization factor \p VF.
1386 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1387 assert(VF.isVector() &&
1388 "Profitable to scalarize relevant only for VF > 1.");
1389
1390 // Cost model is not run in the VPlan-native path - return conservative
1391 // result until this changes.
1392 if (EnableVPlanNativePath)
1393 return false;
1394
1395 auto Scalars = InstsToScalarize.find(VF);
1396 assert(Scalars != InstsToScalarize.end() &&
1397 "VF not yet analyzed for scalarization profitability");
1398 return Scalars->second.find(I) != Scalars->second.end();
1399 }
1400
1401 /// Returns true if \p I is known to be uniform after vectorization.
1402 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1403 if (VF.isScalar())
1404 return true;
1405
1406 // Cost model is not run in the VPlan-native path - return conservative
1407 // result until this changes.
1408 if (EnableVPlanNativePath)
1409 return false;
1410
1411 auto UniformsPerVF = Uniforms.find(VF);
1412 assert(UniformsPerVF != Uniforms.end() &&
1413 "VF not yet analyzed for uniformity");
1414 return UniformsPerVF->second.count(I);
1415 }
1416
1417 /// Returns true if \p I is known to be scalar after vectorization.
1418 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1419 if (VF.isScalar())
1420 return true;
1421
1422 // Cost model is not run in the VPlan-native path - return conservative
1423 // result until this changes.
1424 if (EnableVPlanNativePath)
1425 return false;
1426
1427 auto ScalarsPerVF = Scalars.find(VF);
1428 assert(ScalarsPerVF != Scalars.end() &&
1429 "Scalar values are not calculated for VF");
1430 return ScalarsPerVF->second.count(I);
1431 }
1432
1433 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1434 /// for vectorization factor \p VF.
1435 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1436 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1437 !isProfitableToScalarize(I, VF) &&
1438 !isScalarAfterVectorization(I, VF);
1439 }
1440
1441 /// Decision that was taken during cost calculation for memory instruction.
1442 enum InstWidening {
1443 CM_Unknown,
1444 CM_Widen, // For consecutive accesses with stride +1.
1445 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1446 CM_Interleave,
1447 CM_GatherScatter,
1448 CM_Scalarize
1449 };
1450
1451 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1452 /// instruction \p I and vector width \p VF.
1453 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1454 InstructionCost Cost) {
1455 assert(VF.isVector() && "Expected VF >=2");
1456 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1457 }
1458
1459 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1460 /// interleaving group \p Grp and vector width \p VF.
1461 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1462 ElementCount VF, InstWidening W,
1463 InstructionCost Cost) {
1464 assert(VF.isVector() && "Expected VF >=2");
1465 /// Broadcast this decision to all instructions inside the group.
1466 /// But the cost will be assigned to one instruction only.
1467 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1468 if (auto *I = Grp->getMember(i)) {
1469 if (Grp->getInsertPos() == I)
1470 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1471 else
1472 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1473 }
1474 }
1475 }
1476
1477 /// Return the cost model decision for the given instruction \p I and vector
1478 /// width \p VF. Return CM_Unknown if this instruction did not pass
1479 /// through the cost modeling.
1480 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1481 assert(VF.isVector() && "Expected VF to be a vector VF");
1482 // Cost model is not run in the VPlan-native path - return conservative
1483 // result until this changes.
1484 if (EnableVPlanNativePath)
1485 return CM_GatherScatter;
1486
1487 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1488 auto Itr = WideningDecisions.find(InstOnVF);
1489 if (Itr == WideningDecisions.end())
1490 return CM_Unknown;
1491 return Itr->second.first;
1492 }
1493
1494 /// Return the vectorization cost for the given instruction \p I and vector
1495 /// width \p VF.
1496 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1497 assert(VF.isVector() && "Expected VF >=2");
1498 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1499 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1500 "The cost is not calculated");
1501 return WideningDecisions[InstOnVF].second;
1502 }
1503
1504 /// Return True if instruction \p I is an optimizable truncate whose operand
1505 /// is an induction variable. Such a truncate will be removed by adding a new
1506 /// induction variable with the destination type.
1507 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1508 // If the instruction is not a truncate, return false.
1509 auto *Trunc = dyn_cast<TruncInst>(I);
1510 if (!Trunc)
1511 return false;
1512
1513 // Get the source and destination types of the truncate.
1514 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1515 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1516
1517 // If the truncate is free for the given types, return false. Replacing a
1518 // free truncate with an induction variable would add an induction variable
1519 // update instruction to each iteration of the loop. We exclude from this
1520 // check the primary induction variable since it will need an update
1521 // instruction regardless.
1522 Value *Op = Trunc->getOperand(0);
1523 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1524 return false;
1525
1526 // If the truncated value is not an induction variable, return false.
1527 return Legal->isInductionPhi(Op);
1528 }
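// Illustrative case for the check above: given an i64 induction variable %i
// whose vectorized use is only `trunc i64 %i to i32`, and where that truncate
// is not free for the target, the truncate is optimizable and can be replaced
// by a new i32 induction variable with the destination type.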
1529
1530 /// Collects the instructions to scalarize for each predicated instruction in
1531 /// the loop.
1532 void collectInstsToScalarize(ElementCount VF);
1533
1534 /// Collect Uniform and Scalar values for the given \p VF.
1535 /// The sets depend on CM decision for Load/Store instructions
1536 /// that may be vectorized as interleave, gather-scatter or scalarized.
1537 void collectUniformsAndScalars(ElementCount VF) {
1538 // Do the analysis once.
1539 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1540 return;
1541 setCostBasedWideningDecision(VF);
1542 collectLoopUniforms(VF);
1543 collectLoopScalars(VF);
1544 }
1545
1546 /// Returns true if the target machine supports masked store operation
1547 /// for the given \p DataType and kind of access to \p Ptr.
1548 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1549 return Legal->isConsecutivePtr(DataType, Ptr) &&
1550 TTI.isLegalMaskedStore(DataType, Alignment);
1551 }
1552
1553 /// Returns true if the target machine supports masked load operation
1554 /// for the given \p DataType and kind of access to \p Ptr.
1555 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1556 return Legal->isConsecutivePtr(DataType, Ptr) &&
1557 TTI.isLegalMaskedLoad(DataType, Alignment);
1558 }
1559
1560 /// Returns true if the target machine can represent \p V as a masked gather
1561 /// or scatter operation.
1562 bool isLegalGatherOrScatter(Value *V,
1563 ElementCount VF = ElementCount::getFixed(1)) {
1564 bool LI = isa<LoadInst>(V);
1565 bool SI = isa<StoreInst>(V);
1566 if (!LI && !SI)
1567 return false;
1568 auto *Ty = getLoadStoreType(V);
1569 Align Align = getLoadStoreAlignment(V);
1570 if (VF.isVector())
1571 Ty = VectorType::get(Ty, VF);
1572 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1573 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1574 }
1575
1576 /// Returns true if the target machine supports all of the reduction
1577 /// variables found for the given VF.
1578 bool canVectorizeReductions(ElementCount VF) const {
1579 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1580 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1581 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1582 }));
1583 }
1584
1585 /// Returns true if \p I is an instruction that will be scalarized with
1586 /// predication when vectorizing \p I with vectorization factor \p VF. Such
1587 /// instructions include conditional stores and instructions that may divide
1588 /// by zero.
1589 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1590
1591 // Returns true if \p I is an instruction that will be predicated either
1592 // through scalar predication or masked load/store or masked gather/scatter.
1593 // \p VF is the vectorization factor that will be used to vectorize \p I.
1594 // Superset of instructions that return true for isScalarWithPredication.
1595 bool isPredicatedInst(Instruction *I, ElementCount VF,
1596 bool IsKnownUniform = false) {
1597 // When we know the load is uniform and the original scalar loop was not
1598 // predicated we don't need to mark it as a predicated instruction. Any
1599 // vectorised blocks created when tail-folding are something artificial we
1600 // have introduced and we know there is always at least one active lane.
1601 // That's why we call Legal->blockNeedsPredication here because it doesn't
1602 // query tail-folding.
1603 if (IsKnownUniform && isa<LoadInst>(I) &&
1604 !Legal->blockNeedsPredication(I->getParent()))
1605 return false;
1606 if (!blockNeedsPredicationForAnyReason(I->getParent()))
1607 return false;
1608 // Loads and stores that need some form of masked operation are predicated
1609 // instructions.
1610 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1611 return Legal->isMaskRequired(I);
1612 return isScalarWithPredication(I, VF);
1613 }
1614
1615 /// Returns true if \p I is a memory instruction with consecutive memory
1616 /// access that can be widened.
1617 bool
1618 memoryInstructionCanBeWidened(Instruction *I,
1619 ElementCount VF = ElementCount::getFixed(1));
1620
1621 /// Returns true if \p I is a memory instruction in an interleaved-group
1622 /// of memory accesses that can be vectorized with wide vector loads/stores
1623 /// and shuffles.
1624 bool
1625 interleavedAccessCanBeWidened(Instruction *I,
1626 ElementCount VF = ElementCount::getFixed(1));
1627
1628 /// Check if \p Instr belongs to any interleaved access group.
1629 bool isAccessInterleaved(Instruction *Instr) {
1630 return InterleaveInfo.isInterleaved(Instr);
1631 }
1632
1633 /// Get the interleaved access group that \p Instr belongs to.
1634 const InterleaveGroup<Instruction> *
1635 getInterleavedAccessGroup(Instruction *Instr) {
1636 return InterleaveInfo.getInterleaveGroup(Instr);
1637 }
1638
1639 /// Returns true if we're required to use a scalar epilogue for at least
1640 /// the final iteration of the original loop.
1641 bool requiresScalarEpilogue(ElementCount VF) const {
1642 if (!isScalarEpilogueAllowed())
1643 return false;
1644 // If we might exit from anywhere but the latch, must run the exiting
1645 // iteration in scalar form.
1646 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1647 return true;
1648 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1649 }
1650
1651 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1652 /// loop hint annotation.
1653 bool isScalarEpilogueAllowed() const {
1654 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1655 }
1656
1657 /// Returns true if all loop blocks should be masked to fold tail loop.
1658 bool foldTailByMasking() const { return FoldTailByMasking; }
1659
1660 /// Returns true if the instructions in this block require predication
1661 /// for any reason, e.g. because tail folding now requires a predicate
1662 /// or because the block in the original loop was predicated.
1663 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1664 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1665 }
1666
1667 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1668 /// nodes to the chain of instructions representing the reductions. Uses a
1669 /// MapVector to ensure deterministic iteration order.
1670 using ReductionChainMap =
1671 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1672
1673 /// Return the chain of instructions representing an inloop reduction.
1674 const ReductionChainMap &getInLoopReductionChains() const {
1675 return InLoopReductionChains;
1676 }
1677
1678 /// Returns true if the Phi is part of an inloop reduction.
1679 bool isInLoopReduction(PHINode *Phi) const {
1680 return InLoopReductionChains.count(Phi);
1681 }
1682
1683 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1684 /// with factor VF. Return the cost of the instruction, including
1685 /// scalarization overhead if it's needed.
1686 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1687
1688 /// Estimate cost of a call instruction CI if it were vectorized with factor
1689 /// VF. Return the cost of the instruction, including scalarization overhead
1690 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1691 /// scalarized -
1692 /// i.e. either a vector version isn't available, or it is too expensive.
1693 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1694 bool &NeedToScalarize) const;
1695
1696 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1697 /// that of B.
1698 bool isMoreProfitable(const VectorizationFactor &A,
1699 const VectorizationFactor &B) const;
1700
1701 /// Invalidates decisions already taken by the cost model.
1702 void invalidateCostModelingDecisions() {
1703 WideningDecisions.clear();
1704 Uniforms.clear();
1705 Scalars.clear();
1706 }
1707
1708private:
1709 unsigned NumPredStores = 0;
1710
1711 /// \return An upper bound for the vectorization factors for both
1712 /// fixed and scalable vectorization, where the minimum-known number of
1713 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1714 /// disabled or unsupported, then the scalable part will be equal to
1715 /// ElementCount::getScalable(0).
1716 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1717 ElementCount UserVF,
1718 bool FoldTailByMasking);
1719
1720 /// \return the maximized element count based on the targets vector
1721 /// registers and the loop trip-count, but limited to a maximum safe VF.
1722 /// This is a helper function of computeFeasibleMaxVF.
1723 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1724 /// issue that occurred on one of the buildbots which cannot be reproduced
1725 /// without having access to the proprietary compiler (see comments on
1726 /// D98509). The issue is currently under investigation and this workaround
1727 /// will be removed as soon as possible.
1728 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1729 unsigned SmallestType,
1730 unsigned WidestType,
1731 const ElementCount &MaxSafeVF,
1732 bool FoldTailByMasking);
1733
1734 /// \return the maximum legal scalable VF, based on the safe max number
1735 /// of elements.
1736 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1737
1738 /// The vectorization cost is a combination of the cost itself and a boolean
1739 /// indicating whether any of the contributing operations will actually
1740 /// operate on vector values after type legalization in the backend. If this
1741 /// latter value is false, then all operations will be scalarized (i.e. no
1742 /// vectorization has actually taken place).
1743 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1744
1745 /// Returns the expected execution cost. The unit of the cost does
1746 /// not matter because we use the 'cost' units to compare different
1747 /// vector widths. The cost that is returned is *not* normalized by
1748 /// the factor width. If \p Invalid is not nullptr, this function
1749 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1750 /// each instruction that has an Invalid cost for the given VF.
1751 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1752 VectorizationCostTy
1753 expectedCost(ElementCount VF,
1754 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1755
1756 /// Returns the execution time cost of an instruction for a given vector
1757 /// width. Vector width of one means scalar.
1758 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1759
1760 /// The cost-computation logic from getInstructionCost which provides
1761 /// the vector type as an output parameter.
1762 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1763 Type *&VectorTy);
1764
1765 /// Return the cost of instructions in an inloop reduction pattern, if I is
1766 /// part of that pattern.
1767 Optional<InstructionCost>
1768 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1769 TTI::TargetCostKind CostKind);
1770
1771 /// Calculate vectorization cost of memory instruction \p I.
1772 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1773
1774 /// The cost computation for scalarized memory instruction.
1775 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1776
1777 /// The cost computation for interleaving group of memory instructions.
1778 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1779
1780 /// The cost computation for Gather/Scatter instruction.
1781 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1782
1783 /// The cost computation for widening instruction \p I with consecutive
1784 /// memory access.
1785 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1786
1787 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1788 /// Load: scalar load + broadcast.
1789 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1790 /// element)
1791 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1792
1793 /// Estimate the overhead of scalarizing an instruction. This is a
1794 /// convenience wrapper for the type-based getScalarizationOverhead API.
1795 InstructionCost getScalarizationOverhead(Instruction *I,
1796 ElementCount VF) const;
1797
1798 /// Returns whether the instruction is a load or store and will be emitted
1799 /// as a vector operation.
1800 bool isConsecutiveLoadOrStore(Instruction *I);
1801
1802 /// Returns true if an artificially high cost for emulated masked memrefs
1803 /// should be used.
1804 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1805
1806 /// Map of scalar integer values to the smallest bitwidth they can be legally
1807 /// represented as. The vector equivalents of these values should be truncated
1808 /// to this type.
1809 MapVector<Instruction *, uint64_t> MinBWs;
1810
1811 /// A type representing the costs for instructions if they were to be
1812 /// scalarized rather than vectorized. The entries are Instruction-Cost
1813 /// pairs.
1814 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1815
1816 /// A set containing all BasicBlocks that are known to be present after
1817 /// vectorization as a predicated block.
1818 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1819
1820 /// Records whether it is allowed to have the original scalar loop execute at
1821 /// least once. This may be needed as a fallback loop in case runtime
1822 /// aliasing/dependence checks fail, or to handle the tail/remainder
1823 /// iterations when the trip count is unknown or doesn't divide by the VF,
1824 /// or as a peel-loop to handle gaps in interleave-groups.
1825 /// Under optsize and when the trip count is very small we don't allow any
1826 /// iterations to execute in the scalar loop.
1827 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1828
1829 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1830 bool FoldTailByMasking = false;
1831
1832 /// A map holding scalar costs for different vectorization factors. The
1833 /// presence of a cost for an instruction in the mapping indicates that the
1834 /// instruction will be scalarized when vectorizing with the associated
1835 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1836 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1837
1838 /// Holds the instructions known to be uniform after vectorization.
1839 /// The data is collected per VF.
1840 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1841
1842 /// Holds the instructions known to be scalar after vectorization.
1843 /// The data is collected per VF.
1844 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1845
1846 /// Holds the instructions (address computations) that are forced to be
1847 /// scalarized.
1848 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1849
1850 /// PHINodes of the reductions that should be expanded in-loop along with
1851 /// their associated chains of reduction operations, in program order from top
1852 /// (PHI) to bottom
1853 ReductionChainMap InLoopReductionChains;
1854
1855 /// A Map of inloop reduction operations and their immediate chain operand.
1856 /// FIXME: This can be removed once reductions can be costed correctly in
1857 /// vplan. This was added to allow quick lookup to the inloop operations,
1858 /// without having to loop through InLoopReductionChains.
1859 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1860
1861 /// Returns the expected difference in cost from scalarizing the expression
1862 /// feeding a predicated instruction \p PredInst. The instructions to
1863 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1864 /// non-negative return value implies the expression will be scalarized.
1865 /// Currently, only single-use chains are considered for scalarization.
1866 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1867 ElementCount VF);
1868
1869 /// Collect the instructions that are uniform after vectorization. An
1870 /// instruction is uniform if we represent it with a single scalar value in
1871 /// the vectorized loop corresponding to each vector iteration. Examples of
1872 /// uniform instructions include pointer operands of consecutive or
1873 /// interleaved memory accesses. Note that although uniformity implies an
1874 /// instruction will be scalar, the reverse is not true. In general, a
1875 /// scalarized instruction will be represented by VF scalar values in the
1876 /// vectorized loop, each corresponding to an iteration of the original
1877 /// scalar loop.
1878 void collectLoopUniforms(ElementCount VF);
1879
1880 /// Collect the instructions that are scalar after vectorization. An
1881 /// instruction is scalar if it is known to be uniform or will be scalarized
1882 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1883 /// to the list if they are used by a load/store instruction that is marked as
1884 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1885 /// VF values in the vectorized loop, each corresponding to an iteration of
1886 /// the original scalar loop.
1887 void collectLoopScalars(ElementCount VF);
1888
1889 /// Keeps cost model vectorization decision and cost for instructions.
1890 /// Right now it is used for memory instructions only.
1891 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1892 std::pair<InstWidening, InstructionCost>>;
1893
1894 DecisionList WideningDecisions;
1895
1896 /// Returns true if \p V is expected to be vectorized and it needs to be
1897 /// extracted.
1898 bool needsExtract(Value *V, ElementCount VF) const {
1899 Instruction *I = dyn_cast<Instruction>(V);
1900 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1901 TheLoop->isLoopInvariant(I))
1902 return false;
1903
1904 // Assume we can vectorize V (and hence we need extraction) if the
1905 // scalars are not computed yet. This can happen, because it is called
1906 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1907 // the scalars are collected. That should be a safe assumption in most
1908 // cases, because we check if the operands have vectorizable types
1909 // beforehand in LoopVectorizationLegality.
1910 return Scalars.find(VF) == Scalars.end() ||
1911 !isScalarAfterVectorization(I, VF);
1912 };
1913
1914 /// Returns a range containing only operands needing to be extracted.
1915 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1916 ElementCount VF) const {
1917 return SmallVector<Value *, 4>(make_filter_range(
1918 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1919 }
1920
1921 /// Determines if we have the infrastructure to vectorize loop \p L and its
1922 /// epilogue, assuming the main loop is vectorized by \p VF.
1923 bool isCandidateForEpilogueVectorization(const Loop &L,
1924 const ElementCount VF) const;
1925
1926 /// Returns true if epilogue vectorization is considered profitable, and
1927 /// false otherwise.
1928 /// \p VF is the vectorization factor chosen for the original loop.
1929 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1930
1931public:
1932 /// The loop that we evaluate.
1933 Loop *TheLoop;
1934
1935 /// Predicated scalar evolution analysis.
1936 PredicatedScalarEvolution &PSE;
1937
1938 /// Loop Info analysis.
1939 LoopInfo *LI;
1940
1941 /// Vectorization legality.
1942 LoopVectorizationLegality *Legal;
1943
1944 /// Vector target information.
1945 const TargetTransformInfo &TTI;
1946
1947 /// Target Library Info.
1948 const TargetLibraryInfo *TLI;
1949
1950 /// Demanded bits analysis.
1951 DemandedBits *DB;
1952
1953 /// Assumption cache.
1954 AssumptionCache *AC;
1955
1956 /// Interface to emit optimization remarks.
1957 OptimizationRemarkEmitter *ORE;
1958
1959 const Function *TheFunction;
1960
1961 /// Loop Vectorize Hint.
1962 const LoopVectorizeHints *Hints;
1963
1964 /// The interleave access information contains groups of interleaved accesses
1965 /// with the same stride and close to each other.
1966 InterleavedAccessInfo &InterleaveInfo;
1967
1968 /// Values to ignore in the cost model.
1969 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1970
1971 /// Values to ignore in the cost model when VF > 1.
1972 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1973
1974 /// All element types found in the loop.
1975 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1976
1977 /// Profitable vector factors.
1978 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1979};
1980} // end namespace llvm
1981
1982/// Helper struct to manage generating runtime checks for vectorization.
1983///
1984 /// The runtime checks are created up-front in temporary blocks to allow better
1985 /// cost estimation, and are un-linked from the existing IR. After deciding to
1986/// vectorize, the checks are moved back. If deciding not to vectorize, the
1987/// temporary blocks are completely removed.
1988class GeneratedRTChecks {
1989 /// Basic block which contains the generated SCEV checks, if any.
1990 BasicBlock *SCEVCheckBlock = nullptr;
1991
1992 /// The value representing the result of the generated SCEV checks. If it is
1993 /// nullptr, either no SCEV checks have been generated or they have been used.
1994 Value *SCEVCheckCond = nullptr;
1995
1996 /// Basic block which contains the generated memory runtime checks, if any.
1997 BasicBlock *MemCheckBlock = nullptr;
1998
1999 /// The value representing the result of the generated memory runtime checks.
2000 /// If it is nullptr, either no memory runtime checks have been generated or
2001 /// they have been used.
2002 Value *MemRuntimeCheckCond = nullptr;
2003
2004 DominatorTree *DT;
2005 LoopInfo *LI;
2006
2007 SCEVExpander SCEVExp;
2008 SCEVExpander MemCheckExp;
2009
2010public:
2011 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2012 const DataLayout &DL)
2013 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2014 MemCheckExp(SE, DL, "scev.check") {}
2015
2016 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2017 /// accurately estimate the cost of the runtime checks. The blocks are
2018 /// un-linked from the IR and are added back during vector code generation. If
2019 /// there is no vector code generation, the check blocks are removed
2020 /// completely.
2021 void Create(Loop *L, const LoopAccessInfo &LAI,
2022 const SCEVUnionPredicate &UnionPred) {
2023
2024 BasicBlock *LoopHeader = L->getHeader();
2025 BasicBlock *Preheader = L->getLoopPreheader();
2026
2027 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2028 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2029 // may be used by SCEVExpander. The blocks will be un-linked from their
2030 // predecessors and removed from LI & DT at the end of the function.
2031 if (!UnionPred.isAlwaysTrue()) {
2032 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2033 nullptr, "vector.scevcheck");
2034
2035 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2036 &UnionPred, SCEVCheckBlock->getTerminator());
2037 }
2038
2039 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2040 if (RtPtrChecking.Need) {
2041 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2042 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2043 "vector.memcheck");
2044
2045 MemRuntimeCheckCond =
2046 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2047 RtPtrChecking.getChecks(), MemCheckExp);
2048 assert(MemRuntimeCheckCond &&
2049 "no RT checks generated although RtPtrChecking "
2050 "claimed checks are required");
2051 }
2052
2053 if (!MemCheckBlock && !SCEVCheckBlock)
2054 return;
2055
2056 // Unhook the temporary block with the checks, update various places
2057 // accordingly.
2058 if (SCEVCheckBlock)
2059 SCEVCheckBlock->replaceAllUsesWith(Preheader);
2060 if (MemCheckBlock)
2061 MemCheckBlock->replaceAllUsesWith(Preheader);
2062
2063 if (SCEVCheckBlock) {
2064 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2065 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2066 Preheader->getTerminator()->eraseFromParent();
2067 }
2068 if (MemCheckBlock) {
2069 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2070 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2071 Preheader->getTerminator()->eraseFromParent();
2072 }
2073
2074 DT->changeImmediateDominator(LoopHeader, Preheader);
2075 if (MemCheckBlock) {
2076 DT->eraseNode(MemCheckBlock);
2077 LI->removeBlock(MemCheckBlock);
2078 }
2079 if (SCEVCheckBlock) {
2080 DT->eraseNode(SCEVCheckBlock);
2081 LI->removeBlock(SCEVCheckBlock);
2082 }
2083 }
2084
2085 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2086 /// unused.
2087 ~GeneratedRTChecks() {
2088 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2089 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2090 if (!SCEVCheckCond)
2091 SCEVCleaner.markResultUsed();
2092
2093 if (!MemRuntimeCheckCond)
2094 MemCheckCleaner.markResultUsed();
2095
2096 if (MemRuntimeCheckCond) {
2097 auto &SE = *MemCheckExp.getSE();
2098 // Memory runtime check generation creates compares that use expanded
2099 // values. Remove them before running the SCEVExpanderCleaners.
2100 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2101 if (MemCheckExp.isInsertedInstruction(&I))
2102 continue;
2103 SE.forgetValue(&I);
2104 I.eraseFromParent();
2105 }
2106 }
2107 MemCheckCleaner.cleanup();
2108 SCEVCleaner.cleanup();
2109
2110 if (SCEVCheckCond)
2111 SCEVCheckBlock->eraseFromParent();
2112 if (MemRuntimeCheckCond)
2113 MemCheckBlock->eraseFromParent();
2114 }
2115
2116 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2117 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2118 /// depending on the generated condition.
2119 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2120 BasicBlock *LoopVectorPreHeader,
2121 BasicBlock *LoopExitBlock) {
2122 if (!SCEVCheckCond)
2123 return nullptr;
2124 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2125 if (C->isZero())
2126 return nullptr;
2127
2128 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2129
2130 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2131 // Create new preheader for vector loop.
2132 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2133 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2134
2135 SCEVCheckBlock->getTerminator()->eraseFromParent();
2136 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2137 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2138 SCEVCheckBlock);
2139
2140 DT->addNewBlock(SCEVCheckBlock, Pred);
2141 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2142
2143 ReplaceInstWithInst(
2144 SCEVCheckBlock->getTerminator(),
2145 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2146 // Mark the check as used, to prevent it from being removed during cleanup.
2147 SCEVCheckCond = nullptr;
2148 return SCEVCheckBlock;
2149 }
2150
2151 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2152 /// the branches to branch to the vector preheader or \p Bypass, depending on
2153 /// the generated condition.
2154 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2155 BasicBlock *LoopVectorPreHeader) {
2156 // Check if we generated code that checks in runtime if arrays overlap.
2157 if (!MemRuntimeCheckCond)
2158 return nullptr;
2159
2160 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2161 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2162 MemCheckBlock);
2163
2164 DT->addNewBlock(MemCheckBlock, Pred);
2165 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2166 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2167
2168 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2169 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2170
2171 ReplaceInstWithInst(
2172 MemCheckBlock->getTerminator(),
2173 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2174 MemCheckBlock->getTerminator()->setDebugLoc(
2175 Pred->getTerminator()->getDebugLoc());
2176
2177 // Mark the check as used, to prevent it from being removed during cleanup.
2178 MemRuntimeCheckCond = nullptr;
2179 return MemCheckBlock;
2180 }
2181};
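// Rough usage sketch for the class above; all names other than the members
// shown are placeholders for values computed elsewhere:
//
//   GeneratedRTChecks Checks(*SE, DT, LI, F.getParent()->getDataLayout());
//   Checks.Create(L, *LAI, UnionPred);        // build checks up-front
//   // ... cost model decides vectorization is worthwhile ...
//   Checks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
//   Checks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
//   // If vectorization is abandoned instead, the destructor removes any
//   // unused check blocks.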
2182
2183// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2184// vectorization. The loop needs to be annotated with #pragma omp simd
2185// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2186// vector length information is not provided, vectorization is not considered
2187// explicit. Interleave hints are not allowed either. These limitations will be
2188// relaxed in the future.
2189 // Please note that we are currently forced to abuse the pragma 'clang
2190// vectorize' semantics. This pragma provides *auto-vectorization hints*
2191// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2192// provides *explicit vectorization hints* (LV can bypass legal checks and
2193// assume that vectorization is legal). However, both hints are implemented
2194// using the same metadata (llvm.loop.vectorize, processed by
2195// LoopVectorizeHints). This will be fixed in the future when the native IR
2196// representation for pragma 'omp simd' is introduced.
2197static bool isExplicitVecOuterLoop(Loop *OuterLp,
2198 OptimizationRemarkEmitter *ORE) {
2199 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2200 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2201
2202 // Only outer loops with an explicit vectorization hint are supported.
2203 // Unannotated outer loops are ignored.
2204 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2205 return false;
2206
2207 Function *Fn = OuterLp->getHeader()->getParent();
2208 if (!Hints.allowVectorization(Fn, OuterLp,
2209 true /*VectorizeOnlyWhenForced*/)) {
2210 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2211 return false;
2212 }
2213
2214 if (Hints.getInterleave() > 1) {
2215 // TODO: Interleave support is future work.
2216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2217 "outer loops.\n");
2218 Hints.emitRemarkWithHints();
2219 return false;
2220 }
2221
2222 return true;
2223}
2224
2225static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2226 OptimizationRemarkEmitter *ORE,
2227 SmallVectorImpl<Loop *> &V) {
2228 // Collect inner loops and outer loops without irreducible control flow. For
2229 // now, only collect outer loops that have explicit vectorization hints. If we
2230 // are stress testing the VPlan H-CFG construction, we collect the outermost
2231 // loop of every loop nest.
2232 if (L.isInnermost() || VPlanBuildStressTest ||
2233 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2234 LoopBlocksRPO RPOT(&L);
2235 RPOT.perform(LI);
2236 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2237 V.push_back(&L);
2238 // TODO: Collect inner loops inside marked outer loops in case
2239 // vectorization fails for the outer loop. Do not invoke
2240 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2241 // already known to be reducible. We can use an inherited attribute for
2242 // that.
2243 return;
2244 }
2245 }
2246 for (Loop *InnerL : L)
2247 collectSupportedLoops(*InnerL, LI, ORE, V);
2248}
2249
2250namespace {
2251
2252/// The LoopVectorize Pass.
2253struct LoopVectorize : public FunctionPass {
2254 /// Pass identification, replacement for typeid
2255 static char ID;
2256
2257 LoopVectorizePass Impl;
2258
2259 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2260 bool VectorizeOnlyWhenForced = false)
2261 : FunctionPass(ID),
2262 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2263 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2264 }
2265
2266 bool runOnFunction(Function &F) override {
2267 if (skipFunction(F))
2268 return false;
2269
2270 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2271 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2272 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2273 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2274 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2275 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2276 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2277 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2278 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2279 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2280 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2281 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2282 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2283
2284 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2285 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2286
2287 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2288 GetLAA, *ORE, PSI).MadeAnyChange;
2289 }
2290
2291 void getAnalysisUsage(AnalysisUsage &AU) const override {
2292 AU.addRequired<AssumptionCacheTracker>();
2293 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2294 AU.addRequired<DominatorTreeWrapperPass>();
2295 AU.addRequired<LoopInfoWrapperPass>();
2296 AU.addRequired<ScalarEvolutionWrapperPass>();
2297 AU.addRequired<TargetTransformInfoWrapperPass>();
2298 AU.addRequired<AAResultsWrapperPass>();
2299 AU.addRequired<LoopAccessLegacyAnalysis>();
2300 AU.addRequired<DemandedBitsWrapperPass>();
2301 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2302 AU.addRequired<InjectTLIMappingsLegacy>();
2303
2304 // We currently do not preserve loopinfo/dominator analyses with outer loop
2305 // vectorization. Until this is addressed, mark these analyses as preserved
2306 // only for non-VPlan-native path.
2307 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2308 if (!EnableVPlanNativePath) {
2309 AU.addPreserved<LoopInfoWrapperPass>();
2310 AU.addPreserved<DominatorTreeWrapperPass>();
2311 }
2312
2313 AU.addPreserved<BasicAAWrapperPass>();
2314 AU.addPreserved<GlobalsAAWrapperPass>();
2315 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2316 }
2317};
2318
2319} // end anonymous namespace
2320
2321//===----------------------------------------------------------------------===//
2322// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2323// LoopVectorizationCostModel and LoopVectorizationPlanner.
2324//===----------------------------------------------------------------------===//
2325
2326Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2327 // We need to place the broadcast of invariant variables outside the loop,
2328 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2329 // inside the vector loop body.
2330 Instruction *Instr = dyn_cast<Instruction>(V);
2331 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2332 (!Instr ||
2333 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2334 // Place the code for broadcasting invariant variables in the new preheader.
2335 IRBuilder<>::InsertPointGuard Guard(Builder);
2336 if (SafeToHoist)
2337 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2338
2339 // Broadcast the scalar into all locations in the vector.
2340 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2341
2342 return Shuf;
2343}
2344
2345/// This function adds
2346/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2347/// to each vector element of Val. The sequence starts at StartIdx.
2348/// \p BinOp is relevant for FP induction variables.
2349static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2350 Instruction::BinaryOps BinOp, ElementCount VF,
2351 IRBuilder<> &Builder) {
2352 assert(VF.isVector() && "only vector VFs are supported");
2353
2354 // Create and check the types.
2355 auto *ValVTy = cast<VectorType>(Val->getType());
2356 ElementCount VLen = ValVTy->getElementCount();
2357
2358 Type *STy = Val->getType()->getScalarType();
2359 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2360 "Induction Step must be an integer or FP");
2361 assert(Step->getType() == STy && "Step has wrong type");
2362
2363 SmallVector<Constant *, 8> Indices;
2364
2365 // Create a vector of consecutive numbers from zero to VF.
2366 VectorType *InitVecValVTy = ValVTy;
2367 Type *InitVecValSTy = STy;
Value stored to 'InitVecValSTy' during its initialization is never read
2368 if (STy->isFloatingPointTy()) {
2369 InitVecValSTy =
2370 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2371 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2372 }
2373 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2374
2375 // Splat the StartIdx
2376 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2377
2378 if (STy->isIntegerTy()) {
2379 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2380 Step = Builder.CreateVectorSplat(VLen, Step);
2381 assert(Step->getType() == Val->getType() && "Invalid step vec");
2382 // FIXME: The newly created binary instructions should contain nsw/nuw
2383 // flags, which can be found from the original scalar operations.
2384 Step = Builder.CreateMul(InitVec, Step);
2385 return Builder.CreateAdd(Val, Step, "induction");
2386 }
2387
2388 // Floating point induction.
2389 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2390 "Binary Opcode should be specified for FP induction");
2391 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2392 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2393
2394 Step = Builder.CreateVectorSplat(VLen, Step);
2395 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2396 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2397}
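
A minimal standalone sketch (plain C++, not from LoopVectorize.cpp) of the arithmetic the integer path above emits as IR, assuming a fixed VF of 4; stepVectorSketch and its scalar parameters are hypothetical stand-ins for the llvm::Value operands.

#include <array>

// Lane L of the "induction" result is Val[L] + (L + StartIdx) * Step, i.e. the
// CreateStepVector, StartIdx-splat add, Step-splat mul and final add above.
static std::array<int, 4> stepVectorSketch(const std::array<int, 4> &Val,
                                           int StartIdx, int Step) {
  std::array<int, 4> Result;
  for (int Lane = 0; Lane < 4; ++Lane)
    Result[Lane] = Val[Lane] + (Lane + StartIdx) * Step;
  return Result;
}

// For Val = <10,10,10,10>, StartIdx = 0 and Step = 2 this yields <10,12,14,16>.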
2398
2399void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2400 const InductionDescriptor &II, Value *Step, Value *Start,
2401 Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2402 IRBuilder<> &Builder = State.Builder;
2403 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2404 "Expected either an induction phi-node or a truncate of it!");
2405
2406 // Construct the initial value of the vector IV in the vector loop preheader
2407 auto CurrIP = Builder.saveIP();
2408 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2409 if (isa<TruncInst>(EntryVal)) {
2410 assert(Start->getType()->isIntegerTy() &&
2411 "Truncation requires an integer type");
2412 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2413 Step = Builder.CreateTrunc(Step, TruncType);
2414 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2415 }
2416
2417 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2418 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2419 Value *SteppedStart = getStepVector(
2420 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
2421
2422 // We create vector phi nodes for both integer and floating-point induction
2423 // variables. Here, we determine the kind of arithmetic we will perform.
2424 Instruction::BinaryOps AddOp;
2425 Instruction::BinaryOps MulOp;
2426 if (Step->getType()->isIntegerTy()) {
2427 AddOp = Instruction::Add;
2428 MulOp = Instruction::Mul;
2429 } else {
2430 AddOp = II.getInductionOpcode();
2431 MulOp = Instruction::FMul;
2432 }
2433
2434 // Multiply the vectorization factor by the step using integer or
2435 // floating-point arithmetic as appropriate.
2436 Type *StepType = Step->getType();
2437 Value *RuntimeVF;
2438 if (Step->getType()->isFloatingPointTy())
2439 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2440 else
2441 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2442 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2443
2444 // Create a vector splat to use in the induction update.
2445 //
2446 // FIXME: If the step is non-constant, we create the vector splat with
2447 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2448 // handle a constant vector splat.
2449 Value *SplatVF = isa<Constant>(Mul)
2450 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2451 : Builder.CreateVectorSplat(State.VF, Mul);
2452 Builder.restoreIP(CurrIP);
2453
2454 // We may need to add the step a number of times, depending on the unroll
2455 // factor. The last of those goes into the PHI.
2456 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2457 &*LoopVectorBody->getFirstInsertionPt());
2458 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2459 Instruction *LastInduction = VecInd;
2460 for (unsigned Part = 0; Part < UF; ++Part) {
2461 State.set(Def, LastInduction, Part);
2462
2463 if (isa<TruncInst>(EntryVal))
2464 addMetadata(LastInduction, EntryVal);
2465
2466 LastInduction = cast<Instruction>(
2467 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2468 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2469 }
2470
2471 // Move the last step to the end of the latch block. This ensures consistent
2472 // placement of all induction updates.
2473 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2474 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2475 LastInduction->moveBefore(Br);
2476 LastInduction->setName("vec.ind.next");
2477
2478 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2479 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2480}
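
A sketch (plain C++, not from LoopVectorize.cpp) of the unrolled update chain created above, assuming an integer induction with VF = 4 and UF = 2; vectorIVSketch and its arrays are hypothetical models of the "vec.ind" phi and the per-part values.

#include <array>

// Part 0 reuses the phi value; each later part adds VF * Step ("step.add");
// the value fed back into the phi from the latch adds UF * VF * Step in total.
static void vectorIVSketch(const std::array<int, 4> &VecInd, int Step,
                           std::array<std::array<int, 4>, 2> &Parts,
                           std::array<int, 4> &VecIndNext) {
  for (int Part = 0; Part < 2; ++Part)
    for (int Lane = 0; Lane < 4; ++Lane)
      Parts[Part][Lane] = VecInd[Lane] + Part * 4 * Step;
  for (int Lane = 0; Lane < 4; ++Lane)
    VecIndNext[Lane] = VecInd[Lane] + 2 * 4 * Step; // "vec.ind.next"
}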
2481
2482bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2483 return Cost->isScalarAfterVectorization(I, VF) ||
2484 Cost->isProfitableToScalarize(I, VF);
2485}
2486
2487bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2488 if (shouldScalarizeInstruction(IV))
2489 return true;
2490 auto isScalarInst = [&](User *U) -> bool {
2491 auto *I = cast<Instruction>(U);
2492 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2493 };
2494 return llvm::any_of(IV->users(), isScalarInst);
2495}
2496
2497void InnerLoopVectorizer::widenIntOrFpInduction(
2498 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
2499 Value *CanonicalIV) {
2500 Value *Start = Def->getStartValue()->getLiveInIRValue();
2501 const InductionDescriptor &ID = Def->getInductionDescriptor();
2502 TruncInst *Trunc = Def->getTruncInst();
2503 IRBuilder<> &Builder = State.Builder;
2504 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2505 assert(!State.VF.isZero() && "VF must be non-zero");
2506
2507 // The value from the original loop to which we are mapping the new induction
2508 // variable.
2509 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2510
2511 auto &DL = EntryVal->getModule()->getDataLayout();
2512
2513 // Generate code for the induction step. Note that induction steps are
2514 // required to be loop-invariant
2515 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2516 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2517 "Induction step should be loop invariant");
2518 if (PSE.getSE()->isSCEVable(IV->getType())) {
2519 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2520 return Exp.expandCodeFor(Step, Step->getType(),
2521 State.CFG.VectorPreHeader->getTerminator());
2522 }
2523 return cast<SCEVUnknown>(Step)->getValue();
2524 };
2525
2526 // The scalar value to broadcast. This is derived from the canonical
2527 // induction variable. If a truncation type is given, truncate the canonical
2528 // induction variable and step. Otherwise, derive these values from the
2529 // induction descriptor.
2530 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2531 Value *ScalarIV = CanonicalIV;
2532 Type *NeededType = IV->getType();
2533 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
2534 ScalarIV =
2535 NeededType->isIntegerTy()
2536 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
2537 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
2538 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2539 State.CFG.PrevBB);
2540 ScalarIV->setName("offset.idx");
2541 }
2542 if (Trunc) {
2543 auto *TruncType = cast<IntegerType>(Trunc->getType());
2544 assert(Step->getType()->isIntegerTy() &&
2545 "Truncation requires an integer step");
2546 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2547 Step = Builder.CreateTrunc(Step, TruncType);
2548 }
2549 return ScalarIV;
2550 };
2551
2552 // Create the vector values from the scalar IV, in the absence of creating a
2553 // vector IV.
2554 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2555 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2556 for (unsigned Part = 0; Part < UF; ++Part) {
2557 Value *StartIdx;
2558 if (Step->getType()->isFloatingPointTy())
2559 StartIdx =
2560 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
2561 else
2562 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
2563
2564 Value *EntryPart =
2565 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
2566 State.VF, State.Builder);
2567 State.set(Def, EntryPart, Part);
2568 if (Trunc)
2569 addMetadata(EntryPart, Trunc);
2570 }
2571 };
2572
2573 // Fast-math-flags propagate from the original induction instruction.
2574 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2575 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2576 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2577
2578 // Now do the actual transformations, and start with creating the step value.
2579 Value *Step = CreateStepValue(ID.getStep());
2580 if (State.VF.isScalar()) {
2581 Value *ScalarIV = CreateScalarIV(Step);
2582 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
2583 Step->getType()->getScalarSizeInBits());
2584
2585 Instruction::BinaryOps IncOp = ID.getInductionOpcode();
2586 if (IncOp == Instruction::BinaryOpsEnd)
2587 IncOp = Instruction::Add;
2588 for (unsigned Part = 0; Part < UF; ++Part) {
2589 Value *StartIdx = ConstantInt::get(ScalarTy, Part);
2590 Instruction::BinaryOps MulOp = Instruction::Mul;
2591 if (Step->getType()->isFloatingPointTy()) {
2592 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
2593 MulOp = Instruction::FMul;
2594 }
2595
2596 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2597 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
2598 State.set(Def, EntryPart, Part);
2599 if (Trunc) {
2600 assert(!Step->getType()->isFloatingPointTy() &&
2601 "fp inductions shouldn't be truncated");
2602 addMetadata(EntryPart, Trunc);
2603 }
2604 }
2605 return;
2606 }
2607
2608 // Determine if we want a scalar version of the induction variable. This is
2609 // true if the induction variable itself is not widened, or if it has at
2610 // least one user in the loop that is not widened.
2611 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2612 if (!NeedsScalarIV) {
2613 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2614 return;
2615 }
2616
2617 // Try to create a new independent vector induction variable. If we can't
2618 // create the phi node, we will splat the scalar induction variable in each
2619 // loop iteration.
2620 if (!shouldScalarizeInstruction(EntryVal)) {
2621 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2622 Value *ScalarIV = CreateScalarIV(Step);
2623 // Create scalar steps that can be used by instructions we will later
2624 // scalarize. Note that the addition of the scalar steps will not increase
2625 // the number of instructions in the loop in the common case prior to
2626 // InstCombine. We will be trading one vector extract for each scalar step.
2627 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2628 return;
2629 }
2630
2631 // All IV users are scalar instructions, so only emit a scalar IV, not a
2632 // vectorized IV, except when tail-folding: in that case the splat IV feeds
2633 // the predicate used by the masked loads/stores.
2634 Value *ScalarIV = CreateScalarIV(Step);
2635 if (!Cost->isScalarEpilogueAllowed())
2636 CreateSplatIV(ScalarIV, Step);
2637 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2638}
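
A condensed sketch (plain C++, not from LoopVectorize.cpp) of the dispatch performed above; the booleans stand in for State.VF.isScalar(), needsScalarInduction(EntryVal) and shouldScalarizeInstruction(EntryVal), and the enum names are hypothetical.

enum class WidenIVStrategy {
  ScalarOnly,               // scalar VF: per-part scalar inductions only
  VectorPHI,                // no scalar users: vector phi only
  VectorPHIPlusScalarSteps, // vector phi plus scalar steps for scalar users
  ScalarStepsAndMaybeSplat  // all users scalar: scalar steps, splat IV only
                            // when tail-folding needs the predicate
};

static WidenIVStrategy widenIVStrategySketch(bool ScalarVF, bool NeedsScalarIV,
                                             bool ScalarizeEntryVal) {
  if (ScalarVF)
    return WidenIVStrategy::ScalarOnly;
  if (!NeedsScalarIV)
    return WidenIVStrategy::VectorPHI;
  if (!ScalarizeEntryVal)
    return WidenIVStrategy::VectorPHIPlusScalarSteps;
  return WidenIVStrategy::ScalarStepsAndMaybeSplat;
}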
2639
2640void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2641 Instruction *EntryVal,
2642 const InductionDescriptor &ID,
2643 VPValue *Def,
2644 VPTransformState &State) {
2645 IRBuilder<> &Builder = State.Builder;
2646 // We shouldn't have to build scalar steps if we aren't vectorizing.
2647 assert(State.VF.isVector() && "VF should be greater than one");
2648 // Get the value type and ensure it and the step have the same integer type.
2649 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2650 assert(ScalarIVTy == Step->getType() &&
2651 "Val and Step should have the same type");
2652
2653 // We build scalar steps for both integer and floating-point induction
2654 // variables. Here, we determine the kind of arithmetic we will perform.
2655 Instruction::BinaryOps AddOp;
2656 Instruction::BinaryOps MulOp;
2657 if (ScalarIVTy->isIntegerTy()) {
2658 AddOp = Instruction::Add;
2659 MulOp = Instruction::Mul;
2660 } else {
2661 AddOp = ID.getInductionOpcode();
2662 MulOp = Instruction::FMul;
2663 }
2664
2665 // Determine the number of scalars we need to generate for each unroll
2666 // iteration. If EntryVal is uniform, we only need to generate the first
2667 // lane. Otherwise, we generate all VF values.
2668 bool IsUniform =
2669 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
2670 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
2671 // Compute the scalar steps and save the results in State.
2672 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2673 ScalarIVTy->getScalarSizeInBits());
2674 Type *VecIVTy = nullptr;
2675 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2676 if (!IsUniform && State.VF.isScalable()) {
2677 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2678 UnitStepVec =
2679 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2680 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2681 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2682 }
2683
2684 for (unsigned Part = 0; Part < State.UF; ++Part) {
2685 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2686
2687 if (!IsUniform && State.VF.isScalable()) {
2688 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2689 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2690 if (ScalarIVTy->isFloatingPointTy())
2691 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2692 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2693 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2694 State.set(Def, Add, Part);
2695 // It's useful to record the lane values too for the known minimum number
2696 // of elements so we do those below. This improves the code quality when
2697 // trying to extract the first element, for example.
2698 }
2699
2700 if (ScalarIVTy->isFloatingPointTy())
2701 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2702
2703 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2704 Value *StartIdx = Builder.CreateBinOp(
2705 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2706 // The step returned by `createStepForVF` is a runtime-evaluated value
2707 // when VF is scalable. Otherwise, it should be folded into a Constant.
2708 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2709 "Expected StartIdx to be folded to a constant when VF is not "
2710 "scalable");
2711 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2712 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2713 State.set(Def, Add, VPIteration(Part, Lane));
2714 }
2715 }
2716}
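
A sketch (plain C++, not from LoopVectorize.cpp) of the scalar step values built above for an integer IV, assuming a fixed VF = 4, UF = 2 and a non-uniform EntryVal so every lane is generated; scalarStepsSketch is a hypothetical helper.

#include <cstdio>

// Lane L of part P receives ScalarIV + (P * VF + L) * Step, matching
// StartIdx0 = Part * VF plus the per-lane offset added in the inner loop above.
static void scalarStepsSketch(int ScalarIV, int Step) {
  for (int Part = 0; Part < 2; ++Part)
    for (int Lane = 0; Lane < 4; ++Lane)
      std::printf("part %d, lane %d: %d\n", Part, Lane,
                  ScalarIV + (Part * 4 + Lane) * Step);
}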
2717
2718void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2719 const VPIteration &Instance,
2720 VPTransformState &State) {
2721 Value *ScalarInst = State.get(Def, Instance);
2722 Value *VectorValue = State.get(Def, Instance.Part);
2723 VectorValue = Builder.CreateInsertElement(
2724 VectorValue, ScalarInst,
2725 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2726 State.set(Def, VectorValue, Instance.Part);
2727}
2728
2729// Return whether we allow using masked interleave-groups (for dealing with
2730// strided loads/stores that reside in predicated blocks, or for dealing
2731// with gaps).
2732static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2733 // If an override option has been passed in for interleaved accesses, use it.
2734 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2735 return EnableMaskedInterleavedMemAccesses;
2736
2737 return TTI.enableMaskedInterleavedAccessVectorization();
2738}
2739
2740// Try to vectorize the interleave group that \p Instr belongs to.
2741//
2742// E.g. Translate following interleaved load group (factor = 3):
2743// for (i = 0; i < N; i+=3) {
2744// R = Pic[i]; // Member of index 0
2745// G = Pic[i+1]; // Member of index 1
2746// B = Pic[i+2]; // Member of index 2
2747// ... // do something to R, G, B
2748// }
2749// To:
2750// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2751// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2752// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2753// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2754//
2755// Or translate following interleaved store group (factor = 3):
2756// for (i = 0; i < N; i+=3) {
2757// ... do something to R, G, B
2758// Pic[i] = R; // Member of index 0
2759// Pic[i+1] = G; // Member of index 1
2760// Pic[i+2] = B; // Member of index 2
2761// }
2762// To:
2763// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2764// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2765// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2766// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2767// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2768void InnerLoopVectorizer::vectorizeInterleaveGroup(
2769 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2770 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2771 VPValue *BlockInMask) {
2772 Instruction *Instr = Group->getInsertPos();
2773 const DataLayout &DL = Instr->getModule()->getDataLayout();
2774
2775 // Prepare for the vector type of the interleaved load/store.
2776 Type *ScalarTy = getLoadStoreType(Instr);
2777 unsigned InterleaveFactor = Group->getFactor();
2778 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2779 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2780
2781 // Prepare for the new pointers.
2782 SmallVector<Value *, 2> AddrParts;
2783 unsigned Index = Group->getIndex(Instr);
2784
2785 // TODO: extend the masked interleaved-group support to reversed access.
2786 assert((!BlockInMask || !Group->isReverse()) &&
2787 "Reversed masked interleave-group not supported.");
2788
2789 // If the group is reverse, adjust the index to refer to the last vector lane
2790 // instead of the first. We adjust the index from the first vector lane,
2791 // rather than directly getting the pointer for lane VF - 1, because the
2792 // pointer operand of the interleaved access is supposed to be uniform. For
2793 // uniform instructions, we're only required to generate a value for the
2794 // first vector lane in each unroll iteration.
2795 if (Group->isReverse())
2796 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2797
2798 for (unsigned Part = 0; Part < UF; Part++) {
2799 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2800 setDebugLocFromInst(AddrPart);
2801
2802 // Note that the current instruction could be at any member index; the
2803 // address must be adjusted to the member of index 0.
2804 //
2805 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2806 // b = A[i]; // Member of index 0
2807 // Current pointer is pointed to A[i+1], adjust it to A[i].
2808 //
2809 // E.g. A[i+1] = a; // Member of index 1
2810 // A[i] = b; // Member of index 0
2811 // A[i+2] = c; // Member of index 2 (Current instruction)
2812 // Current pointer is pointed to A[i+2], adjust it to A[i].
2813
2814 bool InBounds = false;
2815 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2816 InBounds = gep->isInBounds();
2817 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2818 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2819
2820 // Cast to the vector pointer type.
2821 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2822 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2823 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2824 }
2825
2826 setDebugLocFromInst(Instr);
2827 Value *PoisonVec = PoisonValue::get(VecTy);
2828
2829 Value *MaskForGaps = nullptr;
2830 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2831 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2832 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2833 }
2834
2835 // Vectorize the interleaved load group.
2836 if (isa<LoadInst>(Instr)) {
2837 // For each unroll part, create a wide load for the group.
2838 SmallVector<Value *, 2> NewLoads;
2839 for (unsigned Part = 0; Part < UF; Part++) {
2840 Instruction *NewLoad;
2841 if (BlockInMask || MaskForGaps) {
2842 assert(useMaskedInterleavedAccesses(*TTI) &&
2843 "masked interleaved groups are not allowed.");
2844 Value *GroupMask = MaskForGaps;
2845 if (BlockInMask) {
2846 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2847 Value *ShuffledMask = Builder.CreateShuffleVector(
2848 BlockInMaskPart,
2849 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2850 "interleaved.mask");
2851 GroupMask = MaskForGaps
2852 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2853 MaskForGaps)
2854 : ShuffledMask;
2855 }
2856 NewLoad =
2857 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2858 GroupMask, PoisonVec, "wide.masked.vec");
2859 }
2860 else
2861 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2862 Group->getAlign(), "wide.vec");
2863 Group->addMetadata(NewLoad);
2864 NewLoads.push_back(NewLoad);
2865 }
2866
2867 // For each member in the group, shuffle out the appropriate data from the
2868 // wide loads.
2869 unsigned J = 0;
2870 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2871 Instruction *Member = Group->getMember(I);
2872
2873 // Skip the gaps in the group.
2874 if (!Member)
2875 continue;
2876
2877 auto StrideMask =
2878 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2879 for (unsigned Part = 0; Part < UF; Part++) {
2880 Value *StridedVec = Builder.CreateShuffleVector(
2881 NewLoads[Part], StrideMask, "strided.vec");
2882
2883 // If this member has a different type, cast the result type.
2884 if (Member->getType() != ScalarTy) {
2885 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2886 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2887 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2888 }
2889
2890 if (Group->isReverse())
2891 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2892
2893 State.set(VPDefs[J], StridedVec, Part);
2894 }
2895 ++J;
2896 }
2897 return;
2898 }
2899
2900 // The sub vector type for current instruction.
2901 auto *SubVT = VectorType::get(ScalarTy, VF);
2902
2903 // Vectorize the interleaved store group.
2904 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2905 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2906 "masked interleaved groups are not allowed.");
2907 assert((!MaskForGaps || !VF.isScalable()) &&
2908 "masking gaps for scalable vectors is not yet supported.");
2909 for (unsigned Part = 0; Part < UF; Part++) {
2910 // Collect the stored vector from each member.
2911 SmallVector<Value *, 4> StoredVecs;
2912 for (unsigned i = 0; i < InterleaveFactor; i++) {
2913 assert((Group->getMember(i) || MaskForGaps) &&
2914 "Fail to get a member from an interleaved store group");
2915 Instruction *Member = Group->getMember(i);
2916
2917 // Skip the gaps in the group.
2918 if (!Member) {
2919 Value *Undef = PoisonValue::get(SubVT);
2920 StoredVecs.push_back(Undef);
2921 continue;
2922 }
2923
2924 Value *StoredVec = State.get(StoredValues[i], Part);
2925
2926 if (Group->isReverse())
2927 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2928
2929 // If this member has a different type, cast it to a unified type.
2930
2931 if (StoredVec->getType() != SubVT)
2932 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2933
2934 StoredVecs.push_back(StoredVec);
2935 }
2936
2937 // Concatenate all vectors into a wide vector.
2938 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2939
2940 // Interleave the elements in the wide vector.
2941 Value *IVec = Builder.CreateShuffleVector(
2942 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2943 "interleaved.vec");
2944
2945 Instruction *NewStoreInstr;
2946 if (BlockInMask || MaskForGaps) {
2947 Value *GroupMask = MaskForGaps;
2948 if (BlockInMask) {
2949 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2950 Value *ShuffledMask = Builder.CreateShuffleVector(
2951 BlockInMaskPart,
2952 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2953 "interleaved.mask");
2954 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2955 ShuffledMask, MaskForGaps)
2956 : ShuffledMask;
2957 }
2958 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2959 Group->getAlign(), GroupMask);
2960 } else
2961 NewStoreInstr =
2962 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2963
2964 Group->addMetadata(NewStoreInstr);
2965 }
2966}
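
A sketch (plain C++, not from LoopVectorize.cpp) of the shuffle index patterns the code above relies on, using the factor-3 R/G/B example from the comment before this function and a fixed VF of 4; the two helpers only model the index math, not the LLVM mask utilities themselves.

#include <vector>

// De-interleaving mask for member `Start` of a load group: Start = 1,
// Factor = 3, VF = 4 gives <1, 4, 7, 10>, i.e. the G elements.
static std::vector<int> strideMaskSketch(int Start, int Factor, int VF) {
  std::vector<int> Mask;
  for (int I = 0; I < VF; ++I)
    Mask.push_back(Start + I * Factor);
  return Mask;
}

// Replicated mask used to widen a per-iteration predicate to the interleaved
// width: Factor = 3, VF = 4 gives <0,0,0, 1,1,1, 2,2,2, 3,3,3>.
static std::vector<int> replicatedMaskSketch(int Factor, int VF) {
  std::vector<int> Mask;
  for (int I = 0; I < VF; ++I)
    for (int J = 0; J < Factor; ++J)
      Mask.push_back(I);
  return Mask;
}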
2967
2968void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2969 VPReplicateRecipe *RepRecipe,
2970 const VPIteration &Instance,
2971 bool IfPredicateInstr,
2972 VPTransformState &State) {
2973 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2974
2975 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2976 // the first lane and part.
2977 if (isa<NoAliasScopeDeclInst>(Instr))
2978 if (!Instance.isFirstIteration())
2979 return;
2980
2981 setDebugLocFromInst(Instr);
2982
2983 // Does this instruction return a value ?
2984 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2985
2986 Instruction *Cloned = Instr->clone();
2987 if (!IsVoidRetTy)
2988 Cloned->setName(Instr->getName() + ".cloned");
2989
2990 // If the scalarized instruction contributes to the address computation of a
2991 // widen masked load/store which was in a basic block that needed predication
2992 // and is not predicated after vectorization, we can't propagate
2993 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2994 // instruction could feed a poison value to the base address of the widen
2995 // load/store.
2996 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2997 Cloned->dropPoisonGeneratingFlags();
2998
2999 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3000 Builder.GetInsertPoint());
3001 // Replace the operands of the cloned instructions with their scalar
3002 // equivalents in the new loop.
3003 for (auto &I : enumerate(RepRecipe->operands())) {
3004 auto InputInstance = Instance;
3005 VPValue *Operand = I.value();
3006 if (State.Plan->isUniformAfterVectorization(Operand))
3007 InputInstance.Lane = VPLane::getFirstLane();
3008 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
3009 }
3010 addNewMetadata(Cloned, Instr);
3011
3012 // Place the cloned scalar in the new loop.
3013 Builder.Insert(Cloned);
3014
3015 State.set(RepRecipe, Cloned, Instance);
3016
3017 // If we just cloned a new assumption, add it to the assumption cache.
3018 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3019 AC->registerAssumption(II);
3020
3021 // End if-block.
3022 if (IfPredicateInstr)
3023 PredicatedInstructions.push_back(Cloned);
3024}
3025
3026void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
3027 BasicBlock *Header = L->getHeader();
3028 assert(!L->getLoopLatch() && "loop should not have a latch at this point");
3029
3030 IRBuilder<> B(Header->getTerminator());
3031 Instruction *OldInst =
3032 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
3033 setDebugLocFromInst(OldInst, &B);
3034
3035 // Connect the header to the exit and header blocks and replace the old
3036 // terminator.
3037 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
3038
3039 // Now we have two terminators. Remove the old one from the block.
3040 Header->getTerminator()->eraseFromParent();
3041}
3042
3043Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3044 if (TripCount)
3045 return TripCount;
3046
3047 assert(L && "Create Trip Count for null loop.");
3048 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3049 // Find the loop boundaries.
3050 ScalarEvolution *SE = PSE.getSE();
3051 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3052 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3053 "Invalid loop count");
3054
3055 Type *IdxTy = Legal->getWidestInductionType();
3056 assert(IdxTy && "No type for induction");
3057
3058 // The exit count might have the type of i64 while the phi is i32. This can
3059 // happen if we have an induction variable that is sign extended before the
3060 // compare. The only way that we get a backedge taken count is that the
3061 // induction variable was signed and as such will not overflow. In such a case
3062 // truncation is legal.
3063 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3064 IdxTy->getPrimitiveSizeInBits())
3065 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3066 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3067
3068 // Get the total trip count from the count by adding 1.
3069 const SCEV *ExitCount = SE->getAddExpr(
3070 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3071
3072 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3073
3074 // Expand the trip count and place the new instructions in the preheader.
3075 // Notice that the pre-header does not change, only the loop body.
3076 SCEVExpander Exp(*SE, DL, "induction");
3077
3078 // Count holds the overall loop count (N).
3079 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3080 L->getLoopPreheader()->getTerminator());
3081
3082 if (TripCount->getType()->isPointerTy())
3083 TripCount =
3084 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3085 L->getLoopPreheader()->getTerminator());
3086
3087 return TripCount;
3088}
3089
3090Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3091 if (VectorTripCount)
3092 return VectorTripCount;
3093
3094 Value *TC = getOrCreateTripCount(L);
3095 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3096
3097 Type *Ty = TC->getType();
3098 // This is where we can make the step a runtime constant.
3099 Value *Step = createStepForVF(Builder, Ty, VF, UF);
3100
3101 // If the tail is to be folded by masking, round the number of iterations N
3102 // up to a multiple of Step instead of rounding down. This is done by first
3103 // adding Step-1 and then rounding down. Note that it's ok if this addition
3104 // overflows: the vector induction variable will eventually wrap to zero given
3105 // that it starts at zero and its Step is a power of two; the loop will then
3106 // exit, with the last early-exit vector comparison also producing all-true.
3107 if (Cost->foldTailByMasking()) {
3108 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3109 "VF*UF must be a power of 2 when folding tail by masking");
3110 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
3111 TC = Builder.CreateAdd(
3112 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
3113 }
3114
3115 // Now we need to generate the expression for the part of the loop that the
3116 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3117 // iterations are not required for correctness, or N - Step, otherwise. Step
3118 // is equal to the vectorization factor (number of SIMD elements) times the
3119 // unroll factor (number of SIMD instructions).
3120 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3121
3122 // There are cases where we *must* run at least one iteration in the remainder
3123 // loop. See the cost model for when this can happen. If the step evenly
3124 // divides the trip count, we set the remainder to be equal to the step. If
3125 // the step does not evenly divide the trip count, no adjustment is necessary
3126 // since there will already be scalar iterations. Note that the minimum
3127 // iterations check ensures that N >= Step.
3128 if (Cost->requiresScalarEpilogue(VF)) {
3129 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3130 R = Builder.CreateSelect(IsZero, Step, R);
3131 }
3132
3133 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3134
3135 return VectorTripCount;
3136}
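
A sketch (plain C++, not from LoopVectorize.cpp) of the trip-count arithmetic above in plain unsigned integers, with Step standing for VF * UF; tail folding and a required scalar epilogue do not occur together in practice, but both knobs are shown.

// Returns the number of original-loop iterations covered by the vector body
// ("n.vec"); the remaining iterations, if any, run in the scalar remainder.
static unsigned long long vectorTripCountSketch(unsigned long long N,
                                                unsigned long long Step,
                                                bool FoldTail,
                                                bool RequiresScalarEpilogue) {
  if (FoldTail)
    N += Step - 1;                 // round N up to a multiple of Step
  unsigned long long R = N % Step; // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                      // force at least one remainder iteration
  return N - R;                    // "n.vec"
}

// e.g. N = 17, Step = 8: the vector body covers 16 iterations and the scalar
// remainder runs 1; with tail folding the rounded-up count is 24, executed as
// masked vector iterations.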
3137
3138Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3139 const DataLayout &DL) {
3140 // Verify that V is a vector type with same number of elements as DstVTy.
3141 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3142 unsigned VF = DstFVTy->getNumElements();
3143 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3144 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3145 Type *SrcElemTy = SrcVecTy->getElementType();
3146 Type *DstElemTy = DstFVTy->getElementType();
3147 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3148 "Vector elements must have same size");
3149
3150 // Do a direct cast if element types are castable.
3151 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3152 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3153 }
3154 // V cannot be directly casted to desired vector type.
3155 // May happen when V is a floating point vector but DstVTy is a vector of
3156 // pointers or vice-versa. Handle this using a two-step bitcast using an
3157 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3158 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3159 "Only one type should be a pointer type");
3160 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3161 "Only one type should be a floating point type");
3162 Type *IntTy =
3163 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3164 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3165 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3166 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3167}
3168
3169void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3170 BasicBlock *Bypass) {
3171 Value *Count = getOrCreateTripCount(L);
3172 // Reuse existing vector loop preheader for TC checks.
3173 // Note that new preheader block is generated for vector loop.
3174 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3175 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3176
3177 // Generate code to check if the loop's trip count is less than VF * UF, or
3178 // equal to it in case a scalar epilogue is required; this implies that the
3179 // vector trip count is zero. This check also covers the case where adding one
3180 // to the backedge-taken count overflowed leading to an incorrect trip count
3181 // of zero. In this case we will also jump to the scalar loop.
3182 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3183 : ICmpInst::ICMP_ULT;
3184
3185 // If tail is to be folded, vector loop takes care of all iterations.
3186 Value *CheckMinIters = Builder.getFalse();
3187 if (!Cost->foldTailByMasking()) {
3188 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3189 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3190 }
3191 // Create new preheader for vector loop.
3192 LoopVectorPreHeader =
3193 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3194 "vector.ph");
3195
3196 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3197 DT->getNode(Bypass)->getIDom()) &&
3198 "TC check is expected to dominate Bypass");
3199
3200 // Update dominator for Bypass & LoopExit (if needed).
3201 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3202 if (!Cost->requiresScalarEpilogue(VF))
3203 // If there is an epilogue which must run, there's no edge from the
3204 // middle block to exit blocks and thus no need to update the immediate
3205 // dominator of the exit blocks.
3206 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3207
3208 ReplaceInstWithInst(
3209 TCCheckBlock->getTerminator(),
3210 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3211 LoopBypassBlocks.push_back(TCCheckBlock);
3212}
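// [Editor's note: illustrative sketch, not part of LoopVectorize.cpp.]
// Assuming VF = 4, UF = 2, an i64 trip count, and no tail folding or required
// scalar epilogue, the check emitted above looks roughly like:
//   %min.iters.check = icmp ult i64 %count, 8        ; 8 = VF * UF
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// With a required scalar epilogue the predicate becomes ule instead of ult,
// so at least one scalar iteration always remains.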
3213
3214BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3215
3216 BasicBlock *const SCEVCheckBlock =
3217 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3218 if (!SCEVCheckBlock)
3219 return nullptr;
3220
3221 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3222 (OptForSizeBasedOnProfile &&
3223 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3224 "Cannot SCEV check stride or overflow when optimizing for size");
3225
3226
3227 // Update dominator only if this is first RT check.
3228 if (LoopBypassBlocks.empty()) {
3229 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3230 if (!Cost->requiresScalarEpilogue(VF))
3231 // If there is an epilogue which must run, there's no edge from the
3232 // middle block to exit blocks and thus no need to update the immediate
3233 // dominator of the exit blocks.
3234 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3235 }
3236
3237 LoopBypassBlocks.push_back(SCEVCheckBlock);
3238 AddedSafetyChecks = true;
3239 return SCEVCheckBlock;
3240}
3241
3242BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3243 BasicBlock *Bypass) {
3244 // VPlan-native path does not do any analysis for runtime checks currently.
3245 if (EnableVPlanNativePath)
3246 return nullptr;
3247
3248 BasicBlock *const MemCheckBlock =
3249 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3250
3251 // Check if we generated code that checks at runtime whether arrays overlap. We
3252 // put the checks into a separate block to make the more common case of few
3253 // elements faster.
3254 if (!MemCheckBlock)
3255 return nullptr;
3256
3257 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3258 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3259 "Cannot emit memory checks when optimizing for size, unless forced "
3260 "to vectorize.");
3261 ORE->emit([&]() {
3262 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3263 L->getStartLoc(), L->getHeader())
3264 << "Code-size may be reduced by not forcing "
3265 "vectorization, or by source-code modifications "
3266 "eliminating the need for runtime checks "
3267 "(e.g., adding 'restrict').";
3268 });
3269 }
3270
3271 LoopBypassBlocks.push_back(MemCheckBlock);
3272
3273 AddedSafetyChecks = true;
3274
3275 // We currently don't use LoopVersioning for the actual loop cloning but we
3276 // still use it to add the noalias metadata.
3277 LVer = std::make_unique<LoopVersioning>(
3278 *Legal->getLAI(),
3279 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3280 DT, PSE.getSE());
3281 LVer->prepareNoAliasMetadata();
3282 return MemCheckBlock;
3283}
3284
3285Value *InnerLoopVectorizer::emitTransformedIndex(
3286 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3287 const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3288
3289 SCEVExpander Exp(*SE, DL, "induction");
3290 auto Step = ID.getStep();
3291 auto StartValue = ID.getStartValue();
3292 assert(Index->getType()->getScalarType() == Step->getType() &&
3293 "Index scalar type does not match StepValue type");
3294
3295 // Note: the IR at this point is broken. We cannot use SE to create any new
3296 // SCEV and then expand it, hoping that SCEV's simplification will give us
3297 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3298 // lead to various SCEV crashes. So all we can do is use the builder and rely
3299 // on InstCombine for future simplifications. Here we handle only some trivial
3300 // cases.
3301 auto CreateAdd = [&B](Value *X, Value *Y) {
3302 assert(X->getType() == Y->getType() && "Types don't match!");
3303 if (auto *CX = dyn_cast<ConstantInt>(X))
3304 if (CX->isZero())
3305 return Y;
3306 if (auto *CY = dyn_cast<ConstantInt>(Y))
3307 if (CY->isZero())
3308 return X;
3309 return B.CreateAdd(X, Y);
3310 };
3311
3312 // We allow X to be a vector type, in which case Y will potentially be
3313 // splatted into a vector with the same element count.
3314 auto CreateMul = [&B](Value *X, Value *Y) {
3315 assert(X->getType()->getScalarType() == Y->getType() &&
3316 "Types don't match!");
3317 if (auto *CX = dyn_cast<ConstantInt>(X))
3318 if (CX->isOne())
3319 return Y;
3320 if (auto *CY = dyn_cast<ConstantInt>(Y))
3321 if (CY->isOne())
3322 return X;
3323 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3324 if (XVTy && !isa<VectorType>(Y->getType()))
3325 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3326 return B.CreateMul(X, Y);
3327 };
3328
3329 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3330 // loop, choose the end of the vector loop header (=VectorHeader), because
3331 // the DomTree is not kept up-to-date for additional blocks generated in the
3332 // vector loop. By using the header as insertion point, we guarantee that the
3333 // expanded instructions dominate all their uses.
3334 auto GetInsertPoint = [this, &B, VectorHeader]() {
3335 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3336 if (InsertBB != LoopVectorBody &&
3337 LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3338 return VectorHeader->getTerminator();
3339 return &*B.GetInsertPoint();
3340 };
3341
3342 switch (ID.getKind()) {
3343 case InductionDescriptor::IK_IntInduction: {
3344 assert(!isa<VectorType>(Index->getType()) &&
3345 "Vector indices not supported for integer inductions yet");
3346 assert(Index->getType() == StartValue->getType() &&
3347 "Index type does not match StartValue type");
3348 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3349 return B.CreateSub(StartValue, Index);
3350 auto *Offset = CreateMul(
3351 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3352 return CreateAdd(StartValue, Offset);
3353 }
3354 case InductionDescriptor::IK_PtrInduction: {
3355 assert(isa<SCEVConstant>(Step) &&
3356 "Expected constant step for pointer induction");
3357 return B.CreateGEP(
3358 ID.getElementType(), StartValue,
3359 CreateMul(Index,
3360 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3361 GetInsertPoint())));
3362 }
3363 case InductionDescriptor::IK_FpInduction: {
3364 assert(!isa<VectorType>(Index->getType()) &&
3365 "Vector indices not supported for FP inductions yet");
3366 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3367 auto InductionBinOp = ID.getInductionBinOp();
3368 assert(InductionBinOp &&
3369 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3370 InductionBinOp->getOpcode() == Instruction::FSub) &&
3371 "Original bin op should be defined for FP induction");
3372
3373 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3374 Value *MulExp = B.CreateFMul(StepValue, Index);
3375 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3376 "induction");
3377 }
3378 case InductionDescriptor::IK_NoInduction:
3379 return nullptr;
3380 }
3381 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3381)
;
3382}
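// [Editor's note: illustrative summary, not part of LoopVectorize.cpp.] The
// transformed index computed above reduces to, per induction kind:
//   IK_IntInduction: StartValue + Index * Step
//                    (or StartValue - Index when the constant step is -1)
//   IK_PtrInduction: gep ElementType, StartValue, Index * Step
//   IK_FpInduction:  StartValue fadd/fsub (Step * Index)
// where Step is expanded by SCEVExpander at the point chosen by
// GetInsertPoint() so the expansion dominates all its uses.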
3383
3384Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3385 LoopScalarBody = OrigLoop->getHeader();
3386 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3387 assert(LoopVectorPreHeader && "Invalid loop structure");
3388 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3389 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3390 "multiple exit loop without required epilogue?");
3391
3392 LoopMiddleBlock =
3393 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3394 LI, nullptr, Twine(Prefix) + "middle.block");
3395 LoopScalarPreHeader =
3396 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3397 nullptr, Twine(Prefix) + "scalar.ph");
3398
3399 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3400
3401 // Set up the middle block terminator. Two cases:
3402 // 1) If we know that we must execute the scalar epilogue, emit an
3403 // unconditional branch.
3404 // 2) Otherwise, we must have a single unique exit block (due to how we
3405 // implement the multiple exit case). In this case, set up a conditional
3406 // branch from the middle block to the loop scalar preheader, and the
3407 // exit block. completeLoopSkeleton will update the condition to use an
3408 // iteration check, if required to decide whether to execute the remainder.
3409 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3410 BranchInst::Create(LoopScalarPreHeader) :
3411 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3412 Builder.getTrue());
3413 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3414 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3415
3416 // We intentionally don't let SplitBlock update LoopInfo, since
3417 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3418 // LoopVectorBody is explicitly added to the correct place a few lines later.
3419 LoopVectorBody =
3420 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3421 nullptr, nullptr, Twine(Prefix) + "vector.body");
3422
3423 // Update dominator for loop exit.
3424 if (!Cost->requiresScalarEpilogue(VF))
3425 // If there is an epilogue which must run, there's no edge from the
3426 // middle block to exit blocks and thus no need to update the immediate
3427 // dominator of the exit blocks.
3428 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3429
3430 // Create and register the new vector loop.
3431 Loop *Lp = LI->AllocateLoop();
3432 Loop *ParentLoop = OrigLoop->getParentLoop();
3433
3434 // Insert the new loop into the loop nest and register the new basic blocks
3435 // before calling any utilities such as SCEV that require valid LoopInfo.
3436 if (ParentLoop) {
3437 ParentLoop->addChildLoop(Lp);
3438 } else {
3439 LI->addTopLevelLoop(Lp);
3440 }
3441 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3442 return Lp;
3443}
3444
3445void InnerLoopVectorizer::createInductionResumeValues(
3446 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3447 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3448 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3449 "Inconsistent information about additional bypass.");
3450
3451 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3452 assert(VectorTripCount && L && "Expected valid arguments");
3453 // We are going to resume the execution of the scalar loop.
3454 // Go over all of the induction variables that we found and fix the
3455 // PHIs that are left in the scalar version of the loop.
3456 // The starting values of PHI nodes depend on the counter of the last
3457 // iteration in the vectorized loop.
3458 // If we come from a bypass edge then we need to start from the original
3459 // start value.
3460 Instruction *OldInduction = Legal->getPrimaryInduction();
3461 for (auto &InductionEntry : Legal->getInductionVars()) {
3462 PHINode *OrigPhi = InductionEntry.first;
3463 InductionDescriptor II = InductionEntry.second;
3464
3465 // Create phi nodes to merge from the backedge-taken check block.
3466 PHINode *BCResumeVal =
3467 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3468 LoopScalarPreHeader->getTerminator());
3469 // Copy original phi DL over to the new one.
3470 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3471 Value *&EndValue = IVEndValues[OrigPhi];
3472 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3473 if (OrigPhi == OldInduction) {
3474 // We know what the end value is.
3475 EndValue = VectorTripCount;
3476 } else {
3477 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3478
3479 // Fast-math-flags propagate from the original induction instruction.
3480 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3481 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3482
3483 Type *StepType = II.getStep()->getType();
3484 Instruction::CastOps CastOp =
3485 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3486 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3487 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3488 EndValue =
3489 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3490 EndValue->setName("ind.end");
3491
3492 // Compute the end value for the additional bypass (if applicable).
3493 if (AdditionalBypass.first) {
3494 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3495 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3496 StepType, true);
3497 CRD =
3498 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3499 EndValueFromAdditionalBypass =
3500 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3501 EndValueFromAdditionalBypass->setName("ind.end");
3502 }
3503 }
3504 // The new PHI merges the original incoming value, in case of a bypass,
3505 // or the value at the end of the vectorized loop.
3506 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3507
3508 // Fix the scalar body counter (PHI node).
3509 // The old induction's phi node in the scalar body needs the truncated
3510 // value.
3511 for (BasicBlock *BB : LoopBypassBlocks)
3512 BCResumeVal->addIncoming(II.getStartValue(), BB);
3513
3514 if (AdditionalBypass.first)
3515 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3516 EndValueFromAdditionalBypass);
3517
3518 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3519 }
3520}
3521
3522BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3523 MDNode *OrigLoopID) {
3524 assert(L && "Expected valid loop.")(static_cast <bool> (L && "Expected valid loop."
) ? void (0) : __assert_fail ("L && \"Expected valid loop.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3524, __extension__
__PRETTY_FUNCTION__))
;
3525
3526 // The trip counts should be cached by now.
3527 Value *Count = getOrCreateTripCount(L);
3528 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3529
3530 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3531
3532 // Add a check in the middle block to see if we have completed
3533 // all of the iterations in the first vector loop. Three cases:
3534 // 1) If we require a scalar epilogue, there is no conditional branch as
3535 // we unconditionally branch to the scalar preheader. Do nothing.
3536 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3537 // Thus if tail is to be folded, we know we don't need to run the
3538 // remainder and we can use the previous value for the condition (true).
3539 // 3) Otherwise, construct a runtime check.
3540 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3541 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3542 Count, VectorTripCount, "cmp.n",
3543 LoopMiddleBlock->getTerminator());
3544
3545 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3546 // of the corresponding compare because they may have ended up with
3547 // different line numbers and we want to avoid awkward line stepping while
3548 // debugging. Eg. if the compare has got a line number inside the loop.
3549 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3550 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3551 }
3552
3553 // Get ready to start creating new instructions into the vectorized body.
3554 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3555 "Inconsistent vector loop preheader");
3556 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3557
3558#ifdef EXPENSIVE_CHECKS
3559 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3560 LI->verify(*DT);
3561#endif
3562
3563 return LoopVectorPreHeader;
3564}
3565
3566std::pair<BasicBlock *, Value *>
3567InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3568 /*
3569 In this function we generate a new loop. The new loop will contain
3570 the vectorized instructions while the old loop will continue to run the
3571 scalar remainder.
3572
3573 [ ] <-- loop iteration number check.
3574 / |
3575 / v
3576 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3577 | / |
3578 | / v
3579 || [ ] <-- vector pre header.
3580 |/ |
3581 | v
3582 | [ ] \
3583 | [ ]_| <-- vector loop.
3584 | |
3585 | v
3586 \ -[ ] <--- middle-block.
3587 \/ |
3588 /\ v
3589 | ->[ ] <--- new preheader.
3590 | |
3591 (opt) v <-- edge from middle to exit iff epilogue is not required.
3592 | [ ] \
3593 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3594 \ |
3595 \ v
3596 >[ ] <-- exit block(s).
3597 ...
3598 */
3599
3600 // Get the metadata of the original loop before it gets modified.
3601 MDNode *OrigLoopID = OrigLoop->getLoopID();
3602
3603 // Workaround! Compute the trip count of the original loop and cache it
3604 // before we start modifying the CFG. This code has a systemic problem
3605 // wherein it tries to run analysis over partially constructed IR; this is
3606 // wrong, and not simply for SCEV. The trip count of the original loop
3607 // simply happens to be prone to hitting this in practice. In theory, we
3608 // can hit the same issue for any SCEV, or ValueTracking query done during
3609 // mutation. See PR49900.
3610 getOrCreateTripCount(OrigLoop);
3611
3612 // Create an empty vector loop, and prepare basic blocks for the runtime
3613 // checks.
3614 Loop *Lp = createVectorLoopSkeleton("");
3615
3616 // Now, compare the new count to zero. If it is zero skip the vector loop and
3617 // jump to the scalar loop. This check also covers the case where the
3618 // backedge-taken count is uint##_max: adding one to it will overflow leading
3619 // to an incorrect trip count of zero. In this (rare) case we will also jump
3620 // to the scalar loop.
3621 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3622
3623 // Generate the code to check any assumptions that we've made for SCEV
3624 // expressions.
3625 emitSCEVChecks(Lp, LoopScalarPreHeader);
3626
3627 // Generate the code that checks at runtime whether arrays overlap. We put the
3628 // checks into a separate block to make the more common case of few elements
3629 // faster.
3630 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3631
3632 createHeaderBranch(Lp);
3633
3634 // Emit phis for the new starting index of the scalar loop.
3635 createInductionResumeValues(Lp);
3636
3637 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3638}
3639
3640// Fix up external users of the induction variable. At this point, we are
3641// in LCSSA form, with all external PHIs that use the IV having one input value,
3642// coming from the remainder loop. We need those PHIs to also have a correct
3643// value for the IV when arriving directly from the middle block.
3644void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3645 const InductionDescriptor &II,
3646 Value *CountRoundDown, Value *EndValue,
3647 BasicBlock *MiddleBlock) {
3648 // There are two kinds of external IV usages - those that use the value
3649 // computed in the last iteration (the PHI) and those that use the penultimate
3650 // value (the value that feeds into the phi from the loop latch).
3651 // We allow both, but they, obviously, have different values.
3652
3653 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3654
3655 DenseMap<Value *, Value *> MissingVals;
3656
3657 // An external user of the last iteration's value should see the value that
3658 // the remainder loop uses to initialize its own IV.
3659 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3660 for (User *U : PostInc->users()) {
3661 Instruction *UI = cast<Instruction>(U);
3662 if (!OrigLoop->contains(UI)) {
3663 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3664 MissingVals[UI] = EndValue;
3665 }
3666 }
3667
3668 // An external user of the penultimate value needs to see EndValue - Step.
3669 // The simplest way to get this is to recompute it from the constituent SCEVs,
3670 // that is Start + (Step * (CRD - 1)).
3671 for (User *U : OrigPhi->users()) {
3672 auto *UI = cast<Instruction>(U);
3673 if (!OrigLoop->contains(UI)) {
3674 const DataLayout &DL =
3675 OrigLoop->getHeader()->getModule()->getDataLayout();
3676 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3677
3678 IRBuilder<> B(MiddleBlock->getTerminator());
3679
3680 // Fast-math-flags propagate from the original induction instruction.
3681 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3682 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3683
3684 Value *CountMinusOne = B.CreateSub(
3685 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3686 Value *CMO =
3687 !II.getStep()->getType()->isIntegerTy()
3688 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3689 II.getStep()->getType())
3690 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3691 CMO->setName("cast.cmo");
3692 Value *Escape =
3693 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3694 Escape->setName("ind.escape");
3695 MissingVals[UI] = Escape;
3696 }
3697 }
3698
3699 for (auto &I : MissingVals) {
3700 PHINode *PHI = cast<PHINode>(I.first);
3701 // One corner case we have to handle is two IVs "chasing" each-other,
3702 // that is %IV2 = phi [...], [ %IV1, %latch ]
3703 // In this case, if IV1 has an external use, we need to avoid adding both
3704 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3705 // don't already have an incoming value for the middle block.
3706 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3707 PHI->addIncoming(I.second, MiddleBlock);
3708 }
3709}
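// [Editor's note: illustrative sketch, not part of LoopVectorize.cpp;
// register names are assumed.] For an IV  i = phi [5, ph], [i + 2, latch]
// and a rounded-down trip count %crd, an external user of the post-increment
// value receives EndValue, while an external user of the phi itself receives
// the recomputed penultimate value Start + Step * (CRD - 1), roughly:
//   %cmo        = sub i64 %crd, 1
//   %mul        = mul i64 %cmo, 2
//   %ind.escape = add i64 %mul, 5
// Both are wired into the exit-block LCSSA phis for the middle-block edge.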
3710
3711namespace {
3712
3713struct CSEDenseMapInfo {
3714 static bool canHandle(const Instruction *I) {
3715 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3716 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3717 }
3718
3719 static inline Instruction *getEmptyKey() {
3720 return DenseMapInfo<Instruction *>::getEmptyKey();
3721 }
3722
3723 static inline Instruction *getTombstoneKey() {
3724 return DenseMapInfo<Instruction *>::getTombstoneKey();
3725 }
3726
3727 static unsigned getHashValue(const Instruction *I) {
3728 assert(canHandle(I) && "Unknown instruction!");
3729 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3730 I->value_op_end()));
3731 }
3732
3733 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3734 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3735 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3736 return LHS == RHS;
3737 return LHS->isIdenticalTo(RHS);
3738 }
3739};
3740
3741} // end anonymous namespace
3742
3743 /// Perform CSE of induction variable instructions.
3744static void cse(BasicBlock *BB) {
3745 // Perform simple cse.
3746 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3747 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3748 if (!CSEDenseMapInfo::canHandle(&In))
3749 continue;
3750
3751 // Check if we can replace this instruction with any of the
3752 // visited instructions.
3753 if (Instruction *V = CSEMap.lookup(&In)) {
3754 In.replaceAllUsesWith(V);
3755 In.eraseFromParent();
3756 continue;
3757 }
3758
3759 CSEMap[&In] = &In;
3760 }
3761}
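// [Editor's note: illustrative sketch, not part of LoopVectorize.cpp.] The
// simple CSE above would, for example, collapse two identical extracts that
// widening may have produced in the vector body:
//   %e1 = extractelement <4 x i64> %vec.ind, i32 0
//   %e2 = extractelement <4 x i64> %vec.ind, i32 0   ; replaced by %e1, erased
// Only insert/extract/shuffle/GEP instructions are considered, per
// CSEDenseMapInfo::canHandle.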
3762
3763InstructionCost
3764LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3765 bool &NeedToScalarize) const {
3766 Function *F = CI->getCalledFunction();
3767 Type *ScalarRetTy = CI->getType();
3768 SmallVector<Type *, 4> Tys, ScalarTys;
3769 for (auto &ArgOp : CI->args())
3770 ScalarTys.push_back(ArgOp->getType());
3771
3772 // Estimate cost of scalarized vector call. The source operands are assumed
3773 // to be vectors, so we need to extract individual elements from there,
3774 // execute VF scalar calls, and then gather the result into the vector return
3775 // value.
3776 InstructionCost ScalarCallCost =
3777 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3778 if (VF.isScalar())
3779 return ScalarCallCost;
3780
3781 // Compute corresponding vector type for return value and arguments.
3782 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3783 for (Type *ScalarTy : ScalarTys)
3784 Tys.push_back(ToVectorTy(ScalarTy, VF));
3785
3786 // Compute costs of unpacking argument values for the scalar calls and
3787 // packing the return values to a vector.
3788 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3789
3790 InstructionCost Cost =
3791 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3792
3793 // If we can't emit a vector call for this function, then the currently found
3794 // cost is the cost we need to return.
3795 NeedToScalarize = true;
3796 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3797 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3798
3799 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3800 return Cost;
3801
3802 // If the corresponding vector cost is cheaper, return its cost.
3803 InstructionCost VectorCallCost =
3804 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3805 if (VectorCallCost < Cost) {
3806 NeedToScalarize = false;
3807 Cost = VectorCallCost;
3808 }
3809 return Cost;
3810}
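// [Editor's note: illustrative arithmetic, not part of LoopVectorize.cpp;
// the cost numbers are assumed.] With VF = 4, a scalar call cost of 10, a
// scalarization overhead of 6, and an available vector variant costing 18:
//   scalarized cost  = 10 * 4 + 6 = 46
//   vector call cost = 18 < 46, so NeedToScalarize is set to false and 18 is
//   returned; without a vector variant (or with CI->isNoBuiltin()), 46 wins.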
3811
3812static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3813 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3814 return Elt;
3815 return VectorType::get(Elt, VF);
3816}
3817
3818InstructionCost
3819LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3820 ElementCount VF) const {
3821 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3822 assert(ID && "Expected intrinsic call!")(static_cast <bool> (ID && "Expected intrinsic call!"
) ? void (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3822, __extension__
__PRETTY_FUNCTION__))
;
3823 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3824 FastMathFlags FMF;
3825 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3826 FMF = FPMO->getFastMathFlags();
3827
3828 SmallVector<const Value *> Arguments(CI->args());
3829 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3830 SmallVector<Type *> ParamTys;
3831 std::transform(FTy->param_begin(), FTy->param_end(),
3832 std::back_inserter(ParamTys),
3833 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3834
3835 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3836 dyn_cast<IntrinsicInst>(CI));
3837 return TTI.getIntrinsicInstrCost(CostAttrs,
3838 TargetTransformInfo::TCK_RecipThroughput);
3839}
3840
3841static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3842 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3843 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3844 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3845}
3846
3847static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3848 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3849 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3850 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3851}
3852
3853void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3854 // For every instruction `I` in MinBWs, truncate the operands, create a
3855 // truncated version of `I` and reextend its result. InstCombine runs
3856 // later and will remove any ext/trunc pairs.
3857 SmallPtrSet<Value *, 4> Erased;
3858 for (const auto &KV : Cost->getMinimalBitwidths()) {
3859 // If the value wasn't vectorized, we must maintain the original scalar
3860 // type. The absence of the value from State indicates that it
3861 // wasn't vectorized.
3862 // FIXME: Should not rely on getVPValue at this point.
3863 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3864 if (!State.hasAnyVectorValue(Def))
3865 continue;
3866 for (unsigned Part = 0; Part < UF; ++Part) {
3867 Value *I = State.get(Def, Part);
3868 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3869 continue;
3870 Type *OriginalTy = I->getType();
3871 Type *ScalarTruncatedTy =
3872 IntegerType::get(OriginalTy->getContext(), KV.second);
3873 auto *TruncatedTy = VectorType::get(
3874 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3875 if (TruncatedTy == OriginalTy)
3876 continue;
3877
3878 IRBuilder<> B(cast<Instruction>(I));
3879 auto ShrinkOperand = [&](Value *V) -> Value * {
3880 if (auto *ZI = dyn_cast<ZExtInst>(V))
3881 if (ZI->getSrcTy() == TruncatedTy)
3882 return ZI->getOperand(0);
3883 return B.CreateZExtOrTrunc(V, TruncatedTy);
3884 };
3885
3886 // The actual instruction modification depends on the instruction type,
3887 // unfortunately.
3888 Value *NewI = nullptr;
3889 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3890 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3891 ShrinkOperand(BO->getOperand(1)));
3892
3893 // Any wrapping introduced by shrinking this operation shouldn't be
3894 // considered undefined behavior. So, we can't unconditionally copy
3895 // arithmetic wrapping flags to NewI.
3896 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3897 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3898 NewI =
3899 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3900 ShrinkOperand(CI->getOperand(1)));
3901 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3902 NewI = B.CreateSelect(SI->getCondition(),
3903 ShrinkOperand(SI->getTrueValue()),
3904 ShrinkOperand(SI->getFalseValue()));
3905 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3906 switch (CI->getOpcode()) {
3907 default:
3908 llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3908)
;
3909 case Instruction::Trunc:
3910 NewI = ShrinkOperand(CI->getOperand(0));
3911 break;
3912 case Instruction::SExt:
3913 NewI = B.CreateSExtOrTrunc(
3914 CI->getOperand(0),
3915 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3916 break;
3917 case Instruction::ZExt:
3918 NewI = B.CreateZExtOrTrunc(
3919 CI->getOperand(0),
3920 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3921 break;
3922 }
3923 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3924 auto Elements0 =
3925 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3926 auto *O0 = B.CreateZExtOrTrunc(
3927 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3928 auto Elements1 =
3929 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3930 auto *O1 = B.CreateZExtOrTrunc(
3931 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3932
3933 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3934 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3935 // Don't do anything with the operands, just extend the result.
3936 continue;
3937 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3938 auto Elements =
3939 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3940 auto *O0 = B.CreateZExtOrTrunc(
3941 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3942 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3943 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3944 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3945 auto Elements =
3946 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3947 auto *O0 = B.CreateZExtOrTrunc(
3948 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3949 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3950 } else {
3951 // If we don't know what to do, be conservative and don't do anything.
3952 continue;
3953 }
3954
3955 // Lastly, extend the result.
3956 NewI->takeName(cast<Instruction>(I));
3957 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3958 I->replaceAllUsesWith(Res);
3959 cast<Instruction>(I)->eraseFromParent();
3960 Erased.insert(I);
3961 State.reset(Def, Res, Part);
3962 }
3963 }
3964
3965 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3966 for (const auto &KV : Cost->getMinimalBitwidths()) {
3967 // If the value wasn't vectorized, we must maintain the original scalar
3968 // type. The absence of the value from State indicates that it
3969 // wasn't vectorized.
3970 // FIXME: Should not rely on getVPValue at this point.
3971 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3972 if (!State.hasAnyVectorValue(Def))
3973 continue;
3974 for (unsigned Part = 0; Part < UF; ++Part) {
3975 Value *I = State.get(Def, Part);
3976 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3977 if (Inst && Inst->use_empty()) {
3978 Value *NewI = Inst->getOperand(0);
3979 Inst->eraseFromParent();
3980 State.reset(Def, NewI, Part);
3981 }
3982 }
3983 }
3984}
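// [Editor's note: illustrative sketch, not part of LoopVectorize.cpp.]
// Assuming MinBWs records that an i32 add only needs 8 bits at VF = 4, the
// rewrite above turns
//   %a = add <4 x i32> %x, %y
// into
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
// leaving InstCombine to remove any redundant ext/trunc pairs later.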
3985
3986void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3987 // Insert truncates and extends for any truncated instructions as hints to
3988 // InstCombine.
3989 if (VF.isVector())
3990 truncateToMinimalBitwidths(State);
3991
3992 // Fix widened non-induction PHIs by setting up the PHI operands.
3993 if (OrigPHIsToFix.size()) {
3994 assert(EnableVPlanNativePath &&
3995 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3996 fixNonInductionPHIs(State);
3997 }
3998
3999 // At this point every instruction in the original loop is widened to a
4000 // vector form. Now we need to fix the recurrences in the loop. These PHI
4001 // nodes are currently empty because we did not want to introduce cycles.
4002 // This is the second stage of vectorizing recurrences.
4003 fixCrossIterationPHIs(State);
4004
4005 // Forget the original basic block.
4006 PSE.getSE()->forgetLoop(OrigLoop);
4007
4008 // If we inserted an edge from the middle block to the unique exit block,
4009 // update uses outside the loop (phis) to account for the newly inserted
4010 // edge.
4011 if (!Cost->requiresScalarEpilogue(VF)) {
4012 // Fix-up external users of the induction variables.
4013 for (auto &Entry : Legal->getInductionVars())
4014 fixupIVUsers(Entry.first, Entry.second,
4015 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4016 IVEndValues[Entry.first], LoopMiddleBlock);
4017
4018 fixLCSSAPHIs(State);
4019 }
4020
4021 for (Instruction *PI : PredicatedInstructions)
4022 sinkScalarOperands(&*PI);
4023
4024 // Remove redundant induction instructions.
4025 cse(LoopVectorBody);
4026
4027 // Set/update profile weights for the vector and remainder loops as original
4028 // loop iterations are now distributed among them. Note that original loop
4029 // represented by LoopScalarBody becomes remainder loop after vectorization.
4030 //
4031 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4032 // end up getting a slightly roughened result, but that should be OK since
4033 // profile is not inherently precise anyway. Note also possible bypass of
4034 // vector code caused by legality checks is ignored, assigning all the weight
4035 // to the vector loop, optimistically.
4036 //
4037 // For scalable vectorization we can't know at compile time how many iterations
4038 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4039 // vscale of '1'.
4040 setProfileInfoAfterUnrolling(
4041 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4042 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4043}
4044
4045void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4046 // In order to support recurrences we need to be able to vectorize Phi nodes.
4047 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4048 // stage #2: We now need to fix the recurrences by adding incoming edges to
4049 // the currently empty PHI nodes. At this point every instruction in the
4050 // original loop is widened to a vector form so we can use them to construct
4051 // the incoming edges.
4052 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4053 for (VPRecipeBase &R : Header->phis()) {
4054 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4055 fixReduction(ReductionPhi, State);
4056 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4057 fixFirstOrderRecurrence(FOR, State);
4058 }
4059}
4060
4061void InnerLoopVectorizer::fixFirstOrderRecurrence(
4062 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
4063 // This is the second phase of vectorizing first-order recurrences. An
4064 // overview of the transformation is described below. Suppose we have the
4065 // following loop.
4066 //
4067 // for (int i = 0; i < n; ++i)
4068 // b[i] = a[i] - a[i - 1];
4069 //
4070 // There is a first-order recurrence on "a". For this loop, the shorthand
4071 // scalar IR looks like:
4072 //
4073 // scalar.ph:
4074 // s_init = a[-1]
4075 // br scalar.body
4076 //
4077 // scalar.body:
4078 // i = phi [0, scalar.ph], [i+1, scalar.body]
4079 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4080 // s2 = a[i]
4081 // b[i] = s2 - s1
4082 // br cond, scalar.body, ...
4083 //
4084 // In this example, s1 is a recurrence because its value depends on the
4085 // previous iteration. In the first phase of vectorization, we created a
4086 // vector phi v1 for s1. We now complete the vectorization and produce the
4087 // shorthand vector IR shown below (for VF = 4, UF = 1).
4088 //
4089 // vector.ph:
4090 // v_init = vector(..., ..., ..., a[-1])
4091 // br vector.body
4092 //
4093 // vector.body
4094 // i = phi [0, vector.ph], [i+4, vector.body]
4095 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4096 // v2 = a[i, i+1, i+2, i+3];
4097 // v3 = vector(v1(3), v2(0, 1, 2))
4098 // b[i, i+1, i+2, i+3] = v2 - v3
4099 // br cond, vector.body, middle.block
4100 //
4101 // middle.block:
4102 // x = v2(3)
4103 // br scalar.ph
4104 //
4105 // scalar.ph:
4106 // s_init = phi [x, middle.block], [a[-1], otherwise]
4107 // br scalar.body
4108 //
4109 // After execution completes the vector loop, we extract the next value of
4110 // the recurrence (x) to use as the initial value in the scalar loop.
4111
4112 // Extract the last vector element in the middle block. This will be the
4113 // initial value for the recurrence when jumping to the scalar loop.
4114 VPValue *PreviousDef = PhiR->getBackedgeValue();
4115 Value *Incoming = State.get(PreviousDef, UF - 1);
4116 auto *ExtractForScalar = Incoming;
4117 auto *IdxTy = Builder.getInt32Ty();
4118 if (VF.isVector()) {
4119 auto *One = ConstantInt::get(IdxTy, 1);
4120 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4121 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4122 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4123 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4124 "vector.recur.extract");
4125 }
4126 // Extract the second last element in the middle block if the
4127 // Phi is used outside the loop. We need to extract the phi itself
4128 // and not the last element (the phi update in the current iteration). This
4129 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4130 // when the scalar loop is not run at all.
4131 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4132 if (VF.isVector()) {
4133 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4134 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4135 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4136 Incoming, Idx, "vector.recur.extract.for.phi");
4137 } else if (UF > 1)
4138     // When the loop is unrolled without vectorizing, initialize
4139     // ExtractForPhiUsedOutsideLoop with the value just prior to the last
4140     // unrolled value of `Incoming`. This is analogous to the vectorized case
4141     // above: extracting the second last element when VF > 1.
4142 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4143
4144 // Fix the initial value of the original recurrence in the scalar loop.
4145 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4146 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4147 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4148 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4149 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4150 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4151 Start->addIncoming(Incoming, BB);
4152 }
4153
4154 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4155 Phi->setName("scalar.recur");
4156
4157 // Finally, fix users of the recurrence outside the loop. The users will need
4158 // either the last value of the scalar recurrence or the last value of the
4159 // vector recurrence we extracted in the middle block. Since the loop is in
4160 // LCSSA form, we just need to find all the phi nodes for the original scalar
4161 // recurrence in the exit block, and then add an edge for the middle block.
4162 // Note that LCSSA does not imply single entry when the original scalar loop
4163 // had multiple exiting edges (as we always run the last iteration in the
4164 // scalar epilogue); in that case, there is no edge from middle to exit and
4165 // thus no phis which need to be updated.
4166 if (!Cost->requiresScalarEpilogue(VF))
4167 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4168 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4169 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4170}
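// Illustrative note (not part of the original source): for a concrete fixed
// VF = 4 with UF = 2, the shorthand IR produced above in the middle block
// would look roughly like (names hypothetical):
//
//   %vector.recur.extract         = extractelement <4 x i32> %v2.part1, i32 3
//   %vector.recur.extract.for.phi = extractelement <4 x i32> %v2.part1, i32 2
//
// i.e. both extracts come from the last unrolled part (UF - 1); the first
// feeds the scalar.recur.init phi in the scalar preheader, the second feeds
// any LCSSA phi of the recurrence in the exit block.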
4171
4172void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4173 VPTransformState &State) {
4174 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4175 // Get its reduction variable descriptor.
4176   assert(Legal->isReductionVariable(OrigPhi) &&
4177          "Unable to find the reduction variable");
4178 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4179
4180 RecurKind RK = RdxDesc.getRecurrenceKind();
4181 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4182 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4183 setDebugLocFromInst(ReductionStartValue);
4184
4185 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4186 // This is the vector-clone of the value that leaves the loop.
4187 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4188
4189 // Wrap flags are in general invalid after vectorization, clear them.
4190 clearReductionWrapFlags(RdxDesc, State);
4191
4192 // Before each round, move the insertion point right between
4193 // the PHIs and the values we are going to write.
4194 // This allows us to write both PHINodes and the extractelement
4195 // instructions.
4196 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4197
4198 setDebugLocFromInst(LoopExitInst);
4199
4200 Type *PhiTy = OrigPhi->getType();
4201 // If tail is folded by masking, the vector value to leave the loop should be
4202 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4203 // instead of the former. For an inloop reduction the reduction will already
4204 // be predicated, and does not need to be handled here.
4205 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4206 for (unsigned Part = 0; Part < UF; ++Part) {
4207 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4208 Value *Sel = nullptr;
4209 for (User *U : VecLoopExitInst->users()) {
4210 if (isa<SelectInst>(U)) {
4211           assert(!Sel && "Reduction exit feeding two selects");
4212 Sel = U;
4213 } else
4214           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4215 }
4216       assert(Sel && "Reduction exit feeds no select");
4217 State.reset(LoopExitInstDef, Sel, Part);
4218
4219 // If the target can create a predicated operator for the reduction at no
4220 // extra cost in the loop (for example a predicated vadd), it can be
4221 // cheaper for the select to remain in the loop than be sunk out of it,
4222 // and so use the select value for the phi instead of the old
4223 // LoopExitValue.
4224 if (PreferPredicatedReductionSelect ||
4225 TTI->preferPredicatedReductionSelect(
4226 RdxDesc.getOpcode(), PhiTy,
4227 TargetTransformInfo::ReductionFlags())) {
4228 auto *VecRdxPhi =
4229 cast<PHINode>(State.get(PhiR, Part));
4230 VecRdxPhi->setIncomingValueForBlock(
4231 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4232 }
4233 }
4234 }
4235
4236 // If the vector reduction can be performed in a smaller type, we truncate
4237 // then extend the loop exit value to enable InstCombine to evaluate the
4238 // entire expression in the smaller type.
4239 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4240     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4241 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4242 Builder.SetInsertPoint(
4243 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4244 VectorParts RdxParts(UF);
4245 for (unsigned Part = 0; Part < UF; ++Part) {
4246 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4247 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4248 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4249 : Builder.CreateZExt(Trunc, VecTy);
4250 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4251 if (U != Trunc) {
4252 U->replaceUsesOfWith(RdxParts[Part], Extnd);
4253 RdxParts[Part] = Extnd;
4254 }
4255 }
4256 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4257 for (unsigned Part = 0; Part < UF; ++Part) {
4258 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4259 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4260 }
4261 }
4262
4263 // Reduce all of the unrolled parts into a single vector.
4264 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4265 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4266
4267 // The middle block terminator has already been assigned a DebugLoc here (the
4268 // OrigLoop's single latch terminator). We want the whole middle block to
4269 // appear to execute on this line because: (a) it is all compiler generated,
4270 // (b) these instructions are always executed after evaluating the latch
4271 // conditional branch, and (c) other passes may add new predecessors which
4272 // terminate on this line. This is the easiest way to ensure we don't
4273 // accidentally cause an extra step back into the loop while debugging.
4274 setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4275 if (PhiR->isOrdered())
4276 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4277 else {
4278 // Floating-point operations should have some FMF to enable the reduction.
4279 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4280 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4281 for (unsigned Part = 1; Part < UF; ++Part) {
4282 Value *RdxPart = State.get(LoopExitInstDef, Part);
4283 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4284 ReducedPartRdx = Builder.CreateBinOp(
4285 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4286 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4287 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4288 ReducedPartRdx, RdxPart);
4289 else
4290 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4291 }
4292 }
4293
4294 // Create the reduction after the loop. Note that inloop reductions create the
4295 // target reduction in the loop using a Reduction recipe.
4296 if (VF.isVector() && !PhiR->isInLoop()) {
4297 ReducedPartRdx =
4298 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4299 // If the reduction can be performed in a smaller type, we need to extend
4300 // the reduction to the wider type before we branch to the original loop.
4301 if (PhiTy != RdxDesc.getRecurrenceType())
4302 ReducedPartRdx = RdxDesc.isSigned()
4303 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4304 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4305 }
4306
4307 PHINode *ResumePhi =
4308 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4309
4310 // Create a phi node that merges control-flow from the backedge-taken check
4311 // block and the middle block.
4312 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4313 LoopScalarPreHeader->getTerminator());
4314
4315 // If we are fixing reductions in the epilogue loop then we should already
4316 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4317 // we carry over the incoming values correctly.
4318 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4319 if (Incoming == LoopMiddleBlock)
4320 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4321 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4322 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4323 Incoming);
4324 else
4325 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4326 }
4327
4328 // Set the resume value for this reduction
4329 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4330
4331 // Now, we need to fix the users of the reduction variable
4332 // inside and outside of the scalar remainder loop.
4333
4334 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4335 // in the exit blocks. See comment on analogous loop in
4336 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4337 if (!Cost->requiresScalarEpilogue(VF))
4338 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4339 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4340 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4341
4342 // Fix the scalar loop reduction variable with the incoming reduction sum
4343 // from the vector body and from the backedge value.
4344 int IncomingEdgeBlockIdx =
4345 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4346   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4347 // Pick the other block.
4348 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4349 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4350 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4351}
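// Illustrative note (not part of the original source): for an unrolled
// integer add reduction with VF = 4 and UF = 2, the shorthand IR emitted in
// the middle block by the code above would roughly be (names hypothetical):
//
//   %bin.rdx      = add <4 x i32> %rdx.part1, %rdx.part0
//   %rdx          = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//   %bc.merge.rdx = phi i32 [ %rdx, %middle.block ], [ %start, %preheader ]
//
// i.e. the UF parts are first combined pairwise, one target reduction
// collapses the vector, and bc.merge.rdx carries the result into the scalar
// remainder loop.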
4352
4353void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4354 VPTransformState &State) {
4355 RecurKind RK = RdxDesc.getRecurrenceKind();
4356 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4357 return;
4358
4359 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4360   assert(LoopExitInstr && "null loop exit instruction");
4361 SmallVector<Instruction *, 8> Worklist;
4362 SmallPtrSet<Instruction *, 8> Visited;
4363 Worklist.push_back(LoopExitInstr);
4364 Visited.insert(LoopExitInstr);
4365
4366 while (!Worklist.empty()) {
4367 Instruction *Cur = Worklist.pop_back_val();
4368 if (isa<OverflowingBinaryOperator>(Cur))
4369 for (unsigned Part = 0; Part < UF; ++Part) {
4370 // FIXME: Should not rely on getVPValue at this point.
4371 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4372 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4373 }
4374
4375 for (User *U : Cur->users()) {
4376 Instruction *UI = cast<Instruction>(U);
4377 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4378 Visited.insert(UI).second)
4379 Worklist.push_back(UI);
4380 }
4381 }
4382}
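// Illustrative note (not part of the original source): the flags must be
// dropped because vector partial sums can wrap even when the scalar running
// sum never does. E.g. summing i8 values that alternate +100 and -100 with
// "add nsw" keeps the scalar sum within [0, 100], but with VF = 4 one lane
// accumulates only the +100 elements and overflows, so a retained nsw flag
// would turn the widened add into poison.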
4383
4384void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4385 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4386 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4387 // Some phis were already hand updated by the reduction and recurrence
4388 // code above, leave them alone.
4389 continue;
4390
4391 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4392 // Non-instruction incoming values will have only one value.
4393
4394 VPLane Lane = VPLane::getFirstLane();
4395 if (isa<Instruction>(IncomingValue) &&
4396 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4397 VF))
4398 Lane = VPLane::getLastLaneForVF(VF);
4399
4400 // Can be a loop invariant incoming value or the last scalar value to be
4401 // extracted from the vectorized loop.
4402 // FIXME: Should not rely on getVPValue at this point.
4403 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4404 Value *lastIncomingValue =
4405 OrigLoop->isLoopInvariant(IncomingValue)
4406 ? IncomingValue
4407 : State.get(State.Plan->getVPValue(IncomingValue, true),
4408 VPIteration(UF - 1, Lane));
4409 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4410 }
4411}
4412
4413void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4414 // The basic block and loop containing the predicated instruction.
4415 auto *PredBB = PredInst->getParent();
4416 auto *VectorLoop = LI->getLoopFor(PredBB);
4417
4418 // Initialize a worklist with the operands of the predicated instruction.
4419 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4420
4421 // Holds instructions that we need to analyze again. An instruction may be
4422 // reanalyzed if we don't yet know if we can sink it or not.
4423 SmallVector<Instruction *, 8> InstsToReanalyze;
4424
4425 // Returns true if a given use occurs in the predicated block. Phi nodes use
4426 // their operands in their corresponding predecessor blocks.
4427 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4428 auto *I = cast<Instruction>(U.getUser());
4429 BasicBlock *BB = I->getParent();
4430 if (auto *Phi = dyn_cast<PHINode>(I))
4431 BB = Phi->getIncomingBlock(
4432 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4433 return BB == PredBB;
4434 };
4435
4436 // Iteratively sink the scalarized operands of the predicated instruction
4437 // into the block we created for it. When an instruction is sunk, its
4438 // operands are then added to the worklist. The algorithm ends after one pass
4439 // through the worklist doesn't sink a single instruction.
4440 bool Changed;
4441 do {
4442 // Add the instructions that need to be reanalyzed to the worklist, and
4443 // reset the changed indicator.
4444 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4445 InstsToReanalyze.clear();
4446 Changed = false;
4447
4448 while (!Worklist.empty()) {
4449 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4450
4451 // We can't sink an instruction if it is a phi node, is not in the loop,
4452 // or may have side effects.
4453 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4454 I->mayHaveSideEffects())
4455 continue;
4456
4457 // If the instruction is already in PredBB, check if we can sink its
4458 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4459 // sinking the scalar instruction I, hence it appears in PredBB; but it
4460 // may have failed to sink I's operands (recursively), which we try
4461 // (again) here.
4462 if (I->getParent() == PredBB) {
4463 Worklist.insert(I->op_begin(), I->op_end());
4464 continue;
4465 }
4466
4467 // It's legal to sink the instruction if all its uses occur in the
4468 // predicated block. Otherwise, there's nothing to do yet, and we may
4469 // need to reanalyze the instruction.
4470 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4471 InstsToReanalyze.push_back(I);
4472 continue;
4473 }
4474
4475 // Move the instruction to the beginning of the predicated block, and add
4476 // its operands to the worklist.
4477 I->moveBefore(&*PredBB->getFirstInsertionPt());
4478 Worklist.insert(I->op_begin(), I->op_end());
4479
4480 // The sinking may have enabled other instructions to be sunk, so we will
4481 // need to iterate.
4482 Changed = true;
4483 }
4484 } while (Changed);
4485}
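// Illustrative note (not part of the original source): given a scalarized,
// predicated region such as (names hypothetical)
//
//   %gep = getelementptr i32, i32* %base, i64 %idx   ; still in the loop body
//   pred.store.if:
//     store i32 %val, i32* %gep
//
// the worklist pass above moves %gep into pred.store.if once all of its uses
// are inside that block, and then re-queues %gep's own operands in case they
// can be sunk as well.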
4486
4487void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4488 for (PHINode *OrigPhi : OrigPHIsToFix) {
4489 VPWidenPHIRecipe *VPPhi =
4490 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4491 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4492 // Make sure the builder has a valid insert point.
4493 Builder.SetInsertPoint(NewPhi);
4494 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4495 VPValue *Inc = VPPhi->getIncomingValue(i);
4496 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4497 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4498 }
4499 }
4500}
4501
4502bool InnerLoopVectorizer::useOrderedReductions(
4503 const RecurrenceDescriptor &RdxDesc) {
4504 return Cost->useOrderedReductions(RdxDesc);
4505}
4506
4507void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4508 VPWidenPHIRecipe *PhiR,
4509 VPTransformState &State) {
4510 PHINode *P = cast<PHINode>(PN);
4511 if (EnableVPlanNativePath) {
4512 // Currently we enter here in the VPlan-native path for non-induction
4513 // PHIs where all control flow is uniform. We simply widen these PHIs.
4514 // Create a vector phi with no operands - the vector phi operands will be
4515 // set at the end of vector code generation.
4516 Type *VecTy = (State.VF.isScalar())
4517 ? PN->getType()
4518 : VectorType::get(PN->getType(), State.VF);
4519 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4520 State.set(PhiR, VecPhi, 0);
4521 OrigPHIsToFix.push_back(P);
4522
4523 return;
4524 }
4525
4526   assert(PN->getParent() == OrigLoop->getHeader() &&
4527          "Non-header phis should have been handled elsewhere");
4528
4529 // In order to support recurrences we need to be able to vectorize Phi nodes.
4530 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4531 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4532 // this value when we vectorize all of the instructions that use the PHI.
4533
4534   assert(!Legal->isReductionVariable(P) &&
4535          "reductions should be handled elsewhere");
4536
4537 setDebugLocFromInst(P);
4538
4539 // This PHINode must be an induction variable.
4540 // Make sure that we know about it.
4541   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4542
4543 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4544 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4545
4546 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4547 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4548
4549 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4550 // which can be found from the original scalar operations.
4551 switch (II.getKind()) {
4552 case InductionDescriptor::IK_NoInduction:
4553 llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4553)
;
4554 case InductionDescriptor::IK_IntInduction:
4555 case InductionDescriptor::IK_FpInduction:
4556 llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4556)
;
4557 case InductionDescriptor::IK_PtrInduction: {
4558 // Handle the pointer induction variable case.
4559     assert(P->getType()->isPointerTy() && "Unexpected type.");
4560
4561 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4562 // This is the normalized GEP that starts counting at zero.
4563 Value *PtrInd =
4564 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4565 // Determine the number of scalars we need to generate for each unroll
4566 // iteration. If the instruction is uniform, we only need to generate the
4567 // first lane. Otherwise, we generate all VF values.
4568 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4569       assert((IsUniform || !State.VF.isScalable()) &&
4570              "Cannot scalarize a scalable VF");
4571 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4572
4573 for (unsigned Part = 0; Part < UF; ++Part) {
4574 Value *PartStart =
4575 createStepForVF(Builder, PtrInd->getType(), VF, Part);
4576
4577 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4578 Value *Idx = Builder.CreateAdd(
4579 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4580 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4581 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4582 DL, II, State.CFG.PrevBB);
4583 SclrGep->setName("next.gep");
4584 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4585 }
4586 }
4587 return;
4588 }
4589     assert(isa<SCEVConstant>(II.getStep()) &&
4590            "Induction step not a SCEV constant!");
4591 Type *PhiType = II.getStep()->getType();
4592
4593 // Build a pointer phi
4594 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4595 Type *ScStValueType = ScalarStartValue->getType();
4596 PHINode *NewPointerPhi =
4597 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4598 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4599
4600 // A pointer induction, performed by using a gep
4601 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4602 Instruction *InductionLoc = LoopLatch->getTerminator();
4603 const SCEV *ScalarStep = II.getStep();
4604 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4605 Value *ScalarStepValue =
4606 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4607 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4608 Value *NumUnrolledElems =
4609 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4610 Value *InductionGEP = GetElementPtrInst::Create(
4611 II.getElementType(), NewPointerPhi,
4612 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4613 InductionLoc);
4614 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4615
4616 // Create UF many actual address geps that use the pointer
4617 // phi as base and a vectorized version of the step value
4618 // (<step*0, ..., step*N>) as offset.
4619 for (unsigned Part = 0; Part < State.UF; ++Part) {
4620 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4621 Value *StartOffsetScalar =
4622 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4623 Value *StartOffset =
4624 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4625 // Create a vector of consecutive numbers from zero to VF.
4626 StartOffset =
4627 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4628
4629 Value *GEP = Builder.CreateGEP(
4630 II.getElementType(), NewPointerPhi,
4631 Builder.CreateMul(
4632 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4633 "vector.gep"));
4634 State.set(PhiR, GEP, Part);
4635 }
4636 }
4637 }
4638}
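// Illustrative note (not part of the original source): for a non-scalarized
// pointer induction with element type i32, VF = 4, UF = 1 and step 1, the
// code above emits roughly (names hypothetical):
//
//   %pointer.phi = phi i32* [ %start, %vector.ph ], [ %ptr.ind, %latch ]
//   %vector.gep  = getelementptr i32, i32* %pointer.phi,
//                    <4 x i64> <i64 0, i64 1, i64 2, i64 3>
//   %ptr.ind     = getelementptr i32, i32* %pointer.phi, i64 4
//
// i.e. a scalar pointer phi advanced by VF * UF * step per iteration, plus a
// per-part vector GEP of consecutive offsets used as the widened value.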
4639
4640/// A helper function for checking whether an integer division-related
4641/// instruction may divide by zero (in which case it must be predicated if
4642/// executed conditionally in the scalar code).
4643/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4644/// Non-zero divisors that are non compile-time constants will not be
4645/// converted into multiplication, so we will still end up scalarizing
4646/// the division, but can do so w/o predication.
4647static bool mayDivideByZero(Instruction &I) {
4648   assert((I.getOpcode() == Instruction::UDiv ||
4649           I.getOpcode() == Instruction::SDiv ||
4650           I.getOpcode() == Instruction::URem ||
4651           I.getOpcode() == Instruction::SRem) &&
4652          "Unexpected instruction");
4653 Value *Divisor = I.getOperand(1);
4654 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4655 return !CInt || CInt->isZero();
4656}
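// Illustrative note (not part of the original source): for a loop body like
//   if (c[i] != 0) b[i] = a[i] / c[i];
// the divisor is not a compile-time constant, so mayDivideByZero() returns
// true and the scalarized udiv/sdiv must stay predicated, whereas a constant
// non-zero divisor such as "a[i] / 4" returns false and the scalarized
// division can execute unconditionally.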
4657
4658void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4659 VPUser &ArgOperands,
4660 VPTransformState &State) {
4661   assert(!isa<DbgInfoIntrinsic>(I) &&
4662          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4663 setDebugLocFromInst(&I);
4664
4665 Module *M = I.getParent()->getParent()->getParent();
4666 auto *CI = cast<CallInst>(&I);
4667
4668 SmallVector<Type *, 4> Tys;
4669 for (Value *ArgOperand : CI->args())
4670 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4671
4672 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4673
4674 // The flag shows whether we use an Intrinsic or a usual Call for the
4675 // vectorized version of the instruction.
4676 // Is it beneficial to perform an intrinsic call compared to a lib call?
4677 bool NeedToScalarize = false;
4678 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4679 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4680 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4681   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4682          "Instruction should be scalarized elsewhere.");
4683   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4684          "Either the intrinsic cost or vector call cost must be valid");
4685
4686 for (unsigned Part = 0; Part < UF; ++Part) {
4687 SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4688 SmallVector<Value *, 4> Args;
4689 for (auto &I : enumerate(ArgOperands.operands())) {
4690 // Some intrinsics have a scalar argument - don't replace it with a
4691 // vector.
4692 Value *Arg;
4693 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4694 Arg = State.get(I.value(), Part);
4695 else {
4696 Arg = State.get(I.value(), VPIteration(0, 0));
4697 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4698 TysForDecl.push_back(Arg->getType());
4699 }
4700 Args.push_back(Arg);
4701 }
4702
4703 Function *VectorF;
4704 if (UseVectorIntrinsic) {
4705 // Use vector version of the intrinsic.
4706 if (VF.isVector())
4707 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4708 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4709       assert(VectorF && "Can't retrieve vector intrinsic.");
4710 } else {
4711 // Use vector version of the function call.
4712 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4713#ifndef NDEBUG
4714       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4715              "Can't create vector function.");
4716#endif
4717 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4718 }
4719 SmallVector<OperandBundleDef, 1> OpBundles;
4720 CI->getOperandBundlesAsDefs(OpBundles);
4721 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4722
4723 if (isa<FPMathOperator>(V))
4724 V->copyFastMathFlags(CI);
4725
4726 State.set(Def, V, Part);
4727 addMetadata(V, &I);
4728 }
4729}
4730
4731void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4732 // We should not collect Scalars more than once per VF. Right now, this
4733 // function is called from collectUniformsAndScalars(), which already does
4734 // this check. Collecting Scalars for VF=1 does not make any sense.
4735   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4736          "This function should not be visited twice for the same VF");
4737
4738 SmallSetVector<Instruction *, 8> Worklist;
4739
4740 // These sets are used to seed the analysis with pointers used by memory
4741 // accesses that will remain scalar.
4742 SmallSetVector<Instruction *, 8> ScalarPtrs;
4743 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4744 auto *Latch = TheLoop->getLoopLatch();
4745
4746 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4747 // The pointer operands of loads and stores will be scalar as long as the
4748 // memory access is not a gather or scatter operation. The value operand of a
4749 // store will remain scalar if the store is scalarized.
4750 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4751 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4752     assert(WideningDecision != CM_Unknown &&
4753            "Widening decision should be ready at this moment");
4754 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4755 if (Ptr == Store->getValueOperand())
4756 return WideningDecision == CM_Scalarize;
4757     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4758            "Ptr is neither a value or pointer operand");
4759 return WideningDecision != CM_GatherScatter;
4760 };
4761
4762 // A helper that returns true if the given value is a bitcast or
4763 // getelementptr instruction contained in the loop.
4764 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4765 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4766 isa<GetElementPtrInst>(V)) &&
4767 !TheLoop->isLoopInvariant(V);
4768 };
4769
4770 // A helper that evaluates a memory access's use of a pointer. If the use will
4771 // be a scalar use and the pointer is only used by memory accesses, we place
4772 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4773 // PossibleNonScalarPtrs.
4774 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4775 // We only care about bitcast and getelementptr instructions contained in
4776 // the loop.
4777 if (!isLoopVaryingBitCastOrGEP(Ptr))
4778 return;
4779
4780 // If the pointer has already been identified as scalar (e.g., if it was
4781 // also identified as uniform), there's nothing to do.
4782 auto *I = cast<Instruction>(Ptr);
4783 if (Worklist.count(I))
4784 return;
4785
4786 // If the use of the pointer will be a scalar use, and all users of the
4787 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4788 // place the pointer in PossibleNonScalarPtrs.
4789 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4790 return isa<LoadInst>(U) || isa<StoreInst>(U);
4791 }))
4792 ScalarPtrs.insert(I);
4793 else
4794 PossibleNonScalarPtrs.insert(I);
4795 };
4796
4797 // We seed the scalars analysis with three classes of instructions: (1)
4798 // instructions marked uniform-after-vectorization and (2) bitcast,
4799 // getelementptr and (pointer) phi instructions used by memory accesses
4800 // requiring a scalar use.
4801 //
4802 // (1) Add to the worklist all instructions that have been identified as
4803 // uniform-after-vectorization.
4804 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4805
4806 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4807 // memory accesses requiring a scalar use. The pointer operands of loads and
4808 // stores will be scalar as long as the memory access is not a gather or
4809 // scatter operation. The value operand of a store will remain scalar if the
4810 // store is scalarized.
4811 for (auto *BB : TheLoop->blocks())
4812 for (auto &I : *BB) {
4813 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4814 evaluatePtrUse(Load, Load->getPointerOperand());
4815 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4816 evaluatePtrUse(Store, Store->getPointerOperand());
4817 evaluatePtrUse(Store, Store->getValueOperand());
4818 }
4819 }
4820 for (auto *I : ScalarPtrs)
4821 if (!PossibleNonScalarPtrs.count(I)) {
4822       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4823 Worklist.insert(I);
4824 }
4825
4826 // Insert the forced scalars.
4827 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4828 // induction variable when the PHI user is scalarized.
4829 auto ForcedScalar = ForcedScalars.find(VF);
4830 if (ForcedScalar != ForcedScalars.end())
4831 for (auto *I : ForcedScalar->second)
4832 Worklist.insert(I);
4833
4834 // Expand the worklist by looking through any bitcasts and getelementptr
4835 // instructions we've already identified as scalar. This is similar to the
4836 // expansion step in collectLoopUniforms(); however, here we're only
4837 // expanding to include additional bitcasts and getelementptr instructions.
4838 unsigned Idx = 0;
4839 while (Idx != Worklist.size()) {
4840 Instruction *Dst = Worklist[Idx++];
4841 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4842 continue;
4843 auto *Src = cast<Instruction>(Dst->getOperand(0));
4844 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4845 auto *J = cast<Instruction>(U);
4846 return !TheLoop->contains(J) || Worklist.count(J) ||
4847 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4848 isScalarUse(J, Src));
4849 })) {
4850 Worklist.insert(Src);
4851       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4852 }
4853 }
4854
4855 // An induction variable will remain scalar if all users of the induction
4856 // variable and induction variable update remain scalar.
4857 for (auto &Induction : Legal->getInductionVars()) {
4858 auto *Ind = Induction.first;
4859 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4860
4861 // If tail-folding is applied, the primary induction variable will be used
4862 // to feed a vector compare.
4863 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4864 continue;
4865
4866 // Returns true if \p Indvar is a pointer induction that is used directly by
4867 // load/store instruction \p I.
4868 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4869 Instruction *I) {
4870 return Induction.second.getKind() ==
4871 InductionDescriptor::IK_PtrInduction &&
4872 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4873 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4874 };
4875
4876 // Determine if all users of the induction variable are scalar after
4877 // vectorization.
4878 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4879 auto *I = cast<Instruction>(U);
4880 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4881 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4882 });
4883 if (!ScalarInd)
4884 continue;
4885
4886 // Determine if all users of the induction variable update instruction are
4887 // scalar after vectorization.
4888 auto ScalarIndUpdate =
4889 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4890 auto *I = cast<Instruction>(U);
4891 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4892 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4893 });
4894 if (!ScalarIndUpdate)
4895 continue;
4896
4897 // The induction variable and its update instruction will remain scalar.
4898 Worklist.insert(Ind);
4899 Worklist.insert(IndUpdate);
4900     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4901     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4902                       << "\n");
4903 }
4904
4905 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4906}
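// Illustrative note (not part of the original source): a typical entry in
// Scalars[VF] is an address computation such as (names hypothetical)
//
//   %gep = getelementptr i32, i32* %a, i64 %iv
//   %v   = load i32, i32* %gep
//
// where the load is not a gather/scatter, so its pointer operand only needs
// scalar (per-lane or lane-0) values; %gep is seeded via ScalarPtrs above,
// and %iv itself stays scalar if every user of the induction and its update
// instruction is scalar as well.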
4907
4908bool LoopVectorizationCostModel::isScalarWithPredication(
4909 Instruction *I, ElementCount VF) const {
4910 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4911 return false;
4912 switch(I->getOpcode()) {
4913 default:
4914 break;
4915 case Instruction::Load:
4916 case Instruction::Store: {
4917 if (!Legal->isMaskRequired(I))
4918 return false;
4919 auto *Ptr = getLoadStorePointerOperand(I);
4920 auto *Ty = getLoadStoreType(I);
4921 Type *VTy = Ty;
4922 if (VF.isVector())
4923 VTy = VectorType::get(Ty, VF);
4924 const Align Alignment = getLoadStoreAlignment(I);
4925 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4926 TTI.isLegalMaskedGather(VTy, Alignment))
4927 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4928 TTI.isLegalMaskedScatter(VTy, Alignment));
4929 }
4930 case Instruction::UDiv:
4931 case Instruction::SDiv:
4932 case Instruction::SRem:
4933 case Instruction::URem:
4934 return mayDivideByZero(*I);
4935 }
4936 return false;
4937}
4938
4939bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4940 Instruction *I, ElementCount VF) {
4941   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4942   assert(getWideningDecision(I, VF) == CM_Unknown &&
4943          "Decision should not be set yet.");
4944 auto *Group = getInterleavedAccessGroup(I);
4945   assert(Group && "Must have a group.");
4946
4947   // If the instruction's allocated size doesn't equal its type size, it
4948 // requires padding and will be scalarized.
4949 auto &DL = I->getModule()->getDataLayout();
4950 auto *ScalarTy = getLoadStoreType(I);
4951 if (hasIrregularType(ScalarTy, DL))
4952 return false;
4953
4954 // Check if masking is required.
4955 // A Group may need masking for one of two reasons: it resides in a block that
4956 // needs predication, or it was decided to use masking to deal with gaps
4957 // (either a gap at the end of a load-access that may result in a speculative
4958 // load, or any gaps in a store-access).
4959 bool PredicatedAccessRequiresMasking =
4960 blockNeedsPredicationForAnyReason(I->getParent()) &&
4961 Legal->isMaskRequired(I);
4962 bool LoadAccessWithGapsRequiresEpilogMasking =
4963 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4964 !isScalarEpilogueAllowed();
4965 bool StoreAccessWithGapsRequiresMasking =
4966 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4967 if (!PredicatedAccessRequiresMasking &&
4968 !LoadAccessWithGapsRequiresEpilogMasking &&
4969 !StoreAccessWithGapsRequiresMasking)
4970 return true;
4971
4972 // If masked interleaving is required, we expect that the user/target had
4973 // enabled it, because otherwise it either wouldn't have been created or
4974 // it should have been invalidated by the CostModel.
4975   assert(useMaskedInterleavedAccesses(TTI) &&
4976          "Masked interleave-groups for predicated accesses are not enabled.");
4977
4978 if (Group->isReverse())
4979 return false;
4980
4981 auto *Ty = getLoadStoreType(I);
4982 const Align Alignment = getLoadStoreAlignment(I);
4983 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4984 : TTI.isLegalMaskedStore(Ty, Alignment);
4985}
4986
4987bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4988 Instruction *I, ElementCount VF) {
4989 // Get and ensure we have a valid memory instruction.
4990   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4991
4992 auto *Ptr = getLoadStorePointerOperand(I);
4993 auto *ScalarTy = getLoadStoreType(I);
4994
4995 // In order to be widened, the pointer should be consecutive, first of all.
4996 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4997 return false;
4998
4999 // If the instruction is a store located in a predicated block, it will be
5000 // scalarized.
5001 if (isScalarWithPredication(I, VF))
5002 return false;
5003
5004   // If the instruction's allocated size doesn't equal its type size, it
5005 // requires padding and will be scalarized.
5006 auto &DL = I->getModule()->getDataLayout();
5007 if (hasIrregularType(ScalarTy, DL))
5008 return false;
5009
5010 return true;
5011}
5012
5013void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5014 // We should not collect Uniforms more than once per VF. Right now,
5015 // this function is called from collectUniformsAndScalars(), which
5016 // already does this check. Collecting Uniforms for VF=1 does not make any
5017 // sense.
5018
5019   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5020          "This function should not be visited twice for the same VF");
5021
5022   // Visit the list of Uniforms. Even if we do not find any uniform value, we
5023   // will not analyze again: Uniforms.count(VF) will return 1.
5024 Uniforms[VF].clear();
5025
5026 // We now know that the loop is vectorizable!
5027 // Collect instructions inside the loop that will remain uniform after
5028 // vectorization.
5029
5030 // Global values, params and instructions outside of current loop are out of
5031 // scope.
5032 auto isOutOfScope = [&](Value *V) -> bool {
5033 Instruction *I = dyn_cast<Instruction>(V);
5034 return (!I || !TheLoop->contains(I));
5035 };
5036
5037 // Worklist containing uniform instructions demanding lane 0.
5038 SetVector<Instruction *> Worklist;
5039 BasicBlock *Latch = TheLoop->getLoopLatch();
5040
5041 // Add uniform instructions demanding lane 0 to the worklist. Instructions
5042 // that are scalar with predication must not be considered uniform after
5043 // vectorization, because that would create an erroneous replicating region
5044 // where only a single instance out of VF should be formed.
5045 // TODO: optimize such seldom cases if found important, see PR40816.
5046 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5047 if (isOutOfScope(I)) {
5048 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5049            << *I << "\n");
5050 return;
5051 }
5052 if (isScalarWithPredication(I, VF)) {
5053 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5054            << *I << "\n");
5055 return;
5056 }
5057 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5058 Worklist.insert(I);
5059 };
5060
5061 // Start with the conditional branch. If the branch condition is an
5062 // instruction contained in the loop that is only used by the branch, it is
5063 // uniform.
5064 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5065 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5066 addToWorklistIfAllowed(Cmp);
5067
5068 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5069 InstWidening WideningDecision = getWideningDecision(I, VF);
5070 assert(WideningDecision != CM_Unknown &&
5071        "Widening decision should be ready at this moment");
5072
5073 // A uniform memory op is itself uniform. We exclude uniform stores
5074 // here as they demand the last lane, not the first one.
5075 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5076 assert(WideningDecision == CM_Scalarize);
5077 return true;
5078 }
5079
5080 return (WideningDecision == CM_Widen ||
5081 WideningDecision == CM_Widen_Reverse ||
5082 WideningDecision == CM_Interleave);
5083 };
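  // Illustrative note (not part of this file): a "uniform memory op" is one
  // whose address is loop-invariant. A uniform *load* only needs lane 0, but
  // a uniform *store* such as
  //   for (i = 0; i < n; ++i)
  //     *last = a[i];        // after the loop, *last must equal a[n-1]
  // demands the last lane's value, which is why such stores are excluded
  // from the uniform-use logic above and below.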
5084
5085
5086 // Returns true if Ptr is the pointer operand of a memory access instruction
5087 // I, and I is known to not require scalarization.
5088 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5089 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5090 };
5091
5092 // Holds a list of values which are known to have at least one uniform use.
5093 // Note that there may be other uses which aren't uniform. A "uniform use"
5094 // here is something which only demands lane 0 of the unrolled iterations;
5095 // it does not imply that all lanes produce the same value (e.g. this is not
5096 // the usual meaning of uniform)
5097 SetVector<Value *> HasUniformUse;
5098
5099 // Scan the loop for instructions which are either a) known to have only
5100 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5101 for (auto *BB : TheLoop->blocks())
5102 for (auto &I : *BB) {
5103 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5104 switch (II->getIntrinsicID()) {
5105 case Intrinsic::sideeffect:
5106 case Intrinsic::experimental_noalias_scope_decl:
5107 case Intrinsic::assume:
5108 case Intrinsic::lifetime_start:
5109 case Intrinsic::lifetime_end:
5110 if (TheLoop->hasLoopInvariantOperands(&I))
5111 addToWorklistIfAllowed(&I);
5112 break;
5113 default:
5114 break;
5115 }
5116 }
5117
5118 // ExtractValue instructions must be uniform, because the operands are
5119 // known to be loop-invariant.
5120 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5121 assert(isOutOfScope(EVI->getAggregateOperand()) &&
5122        "Expected aggregate value to be loop invariant");
5123 addToWorklistIfAllowed(EVI);
5124 continue;
5125 }
5126
5127 // If there's no pointer operand, there's nothing to do.
5128 auto *Ptr = getLoadStorePointerOperand(&I);
5129 if (!Ptr)
5130 continue;
5131
5132 // A uniform memory op is itself uniform. We exclude uniform stores
5133 // here as they demand the last lane, not the first one.
5134 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5135 addToWorklistIfAllowed(&I);
5136
5137 if (isUniformDecision(&I, VF)) {
5138 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5139 HasUniformUse.insert(Ptr);
5140 }
5141 }
5142
5143 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5144 // demanding) users. Since loops are assumed to be in LCSSA form, this
5145 // disallows uses outside the loop as well.
5146 for (auto *V : HasUniformUse) {
5147 if (isOutOfScope(V))
5148 continue;
5149 auto *I = cast<Instruction>(V);
5150 auto UsersAreMemAccesses =
5151 llvm::all_of(I->users(), [&](User *U) -> bool {
5152 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5153 });
5154 if (UsersAreMemAccesses)
5155 addToWorklistIfAllowed(I);
5156 }
5157
5158 // Expand Worklist in topological order: whenever a new instruction
5159 // is added, its users should already be inside Worklist. This ensures
5160 // a uniform instruction will only be used by uniform instructions.
5161 unsigned idx = 0;
5162 while (idx != Worklist.size()) {
5163 Instruction *I = Worklist[idx++];
5164
5165 for (auto OV : I->operand_values()) {
5166 // isOutOfScope operands cannot be uniform instructions.
5167 if (isOutOfScope(OV))
5168 continue;
5169 // First order recurrence Phi's should typically be considered
5170 // non-uniform.
5171 auto *OP = dyn_cast<PHINode>(OV);
5172 if (OP && Legal->isFirstOrderRecurrence(OP))
5173 continue;
5174 // If all the users of the operand are uniform, then add the
5175 // operand into the uniform worklist.
5176 auto *OI = cast<Instruction>(OV);
5177 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5178 auto *J = cast<Instruction>(U);
5179 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5180 }))
5181 addToWorklistIfAllowed(OI);
5182 }
5183 }
5184
5185 // For an instruction to be added into Worklist above, all its users inside
5186 // the loop should also be in Worklist. However, this condition cannot be
5187 // true for phi nodes that form a cyclic dependence. We must process phi
5188 // nodes separately. An induction variable will remain uniform if all users
5189 // of the induction variable and induction variable update remain uniform.
5190 // The code below handles both pointer and non-pointer induction variables.
5191 for (auto &Induction : Legal->getInductionVars()) {
5192 auto *Ind = Induction.first;
5193 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5194
5195 // Determine if all users of the induction variable are uniform after
5196 // vectorization.
5197 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5198 auto *I = cast<Instruction>(U);
5199 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5200 isVectorizedMemAccessUse(I, Ind);
5201 });
5202 if (!UniformInd)
5203 continue;
5204
5205 // Determine if all users of the induction variable update instruction are
5206 // uniform after vectorization.
5207 auto UniformIndUpdate =
5208 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5209 auto *I = cast<Instruction>(U);
5210 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5211 isVectorizedMemAccessUse(I, IndUpdate);
5212 });
5213 if (!UniformIndUpdate)
5214 continue;
5215
5216 // The induction variable and its update instruction will remain uniform.
5217 addToWorklistIfAllowed(Ind);
5218 addToWorklistIfAllowed(IndUpdate);
5219 }
5220
5221 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5222}
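
To make the analysis above concrete, consider this hypothetical scalar loop (not taken from this file). Under the rules implemented in collectLoopUniforms, the induction variable, its update, and the address computations feeding the consecutive load and store are the values that typically end up in Worklist, because only lane 0 of each is demanded once the memory accesses are widened.

    // Hypothetical example loop. Values expected to remain uniform after
    // vectorization: i, i + 1, and the pointer arithmetic &a[i] / &b[i].
    // The loaded and stored float values themselves are widened, not uniform.
    void scale(float *a, const float *b, float k, int n) {
      for (int i = 0; i < n; ++i)
        a[i] = k * b[i];
    }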
5223
5224bool LoopVectorizationCostModel::runtimeChecksRequired() {
5225 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5226
5227 if (Legal->getRuntimePointerChecking()->Need) {
5228 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5229 "runtime pointer checks needed. Enable vectorization of this "
5230 "loop with '#pragma clang loop vectorize(enable)' when "
5231 "compiling with -Os/-Oz",
5232 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5233 return true;
5234 }
5235
5236 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5237 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5238 "runtime SCEV checks needed. Enable vectorization of this "
5239 "loop with '#pragma clang loop vectorize(enable)' when "
5240 "compiling with -Os/-Oz",
5241 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5242 return true;
5243 }
5244
5245 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5246 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5247 reportVectorizationFailure("Runtime stride check for small trip count",
5248 "runtime stride == 1 checks needed. Enable vectorization of "
5249 "this loop without such check by compiling with -Os/-Oz",
5250 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5251 return true;
5252 }
5253
5254 return false;
5255}
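
The remarks emitted above point the user at an explicit vectorization pragma; a minimal usage example of that workaround (illustrative source, not from this file) is:

    // When compiling with -Os/-Oz, runtime pointer/SCEV/stride checks are only
    // accepted if the user explicitly requests vectorization for the loop.
    #pragma clang loop vectorize(enable)
    for (int i = 0; i < n; ++i)
      out[i] = in[i] * scale;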
5256
5257ElementCount
5258LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5259 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5260 return ElementCount::getScalable(0);
5261
5262 if (Hints->isScalableVectorizationDisabled()) {
5263 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5264 "ScalableVectorizationDisabled", ORE, TheLoop);
5265 return ElementCount::getScalable(0);
5266 }
5267
5268 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5269
5270 auto MaxScalableVF = ElementCount::getScalable(
5271 std::numeric_limits<ElementCount::ScalarTy>::max());
5272
5273 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5274 // FIXME: While for scalable vectors this is currently sufficient, this should
5275 // be replaced by a more detailed mechanism that filters out specific VFs,
5276 // instead of invalidating vectorization for a whole set of VFs based on the
5277 // MaxVF.
5278
5279 // Disable scalable vectorization if the loop contains unsupported reductions.
5280 if (!canVectorizeReductions(MaxScalableVF)) {
5281 reportVectorizationInfo(
5282 "Scalable vectorization not supported for the reduction "
5283 "operations found in this loop.",
5284 "ScalableVFUnfeasible", ORE, TheLoop);
5285 return ElementCount::getScalable(0);
5286 }
5287
5288 // Disable scalable vectorization if the loop contains any instructions
5289 // with element types not supported for scalable vectors.
5290 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5291 return !Ty->isVoidTy() &&
5292 !this->TTI.isElementTypeLegalForScalableVector(Ty);
5293 })) {
5294 reportVectorizationInfo("Scalable vectorization is not supported "
5295 "for all element types found in this loop.",
5296 "ScalableVFUnfeasible", ORE, TheLoop);
5297 return ElementCount::getScalable(0);
5298 }
5299
5300 if (Legal->isSafeForAnyVectorWidth())
5301 return MaxScalableVF;
5302
5303 // Limit MaxScalableVF by the maximum safe dependence distance.
5304 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5305 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5306 MaxVScale =
5307 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5308 MaxScalableVF = ElementCount::getScalable(
5309 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
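  // Worked example with illustrative numbers (not from this run): if LAA
  // allows at most 64 safe elements and the target reports a maximum vscale
  // of 16, the clamp yields ElementCount::getScalable(64 / 16), i.e.
  // <vscale x 4> elements; a result of 0 triggers the remark below.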
5310 if (!MaxScalableVF)
5311 reportVectorizationInfo(
5312 "Max legal vector width too small, scalable vectorization "
5313 "unfeasible.",
5314 "ScalableVFUnfeasible", ORE, TheLoop);
5315
5316 return MaxScalableVF;
5317}
5318
5319FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5320 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5321 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5322 unsigned SmallestType, WidestType;
5323 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5324
5325 // Get the maximum safe dependence distance in bits computed by LAA.
5326 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5327 // the memory access that is most restrictive (involved in the smallest
5328 // dependence distance).
5329 unsigned MaxSafeElements =
5330 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
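  // Worked example with illustrative numbers (not from this run): a max safe
  // vector width of 384 bits with a widest loop type of 32 bits gives
  // 384 / 32 = 12, and PowerOf2Floor(12) = 8, so MaxSafeElements would be 8.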
5331
5332 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5333 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5334
5335 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5336            << ".\n");
5337 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5338            << ".\n");
5339
5340 // First analyze the UserVF, fall back if the UserVF should be ignored.
5341 if (UserVF) {
5342 auto MaxSafeUserVF =
5343 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5344
5345 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5346 // If `VF=vscale x N` is safe, then so is `VF=N`
5347 if (UserVF.isScalable())
5348 return FixedScalableVFPair(
5349 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5350 else
5351 return UserVF;
5352 }
5353
5354 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5355
5356 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5357 // is better to ignore the hint and let the compiler choose a suitable VF.
5358 if (!UserVF.isScalable()) {
5359 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5360            << " is unsafe, clamping to max safe VF="
5361            << MaxSafeFixedVF << ".\n");
5362 ORE->emit([&]() {
5363 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5364 TheLoop->getStartLoc(),
5365 TheLoop->getHeader())
5366 << "User-specified vectorization factor "
5367 << ore::NV("UserVectorizationFactor", UserVF)
5368 << " is unsafe, clamping to maximum safe vectorization factor "
5369 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5370 });
5371 return MaxSafeFixedVF;
5372 }
5373
5374 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5375 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5376            << " is ignored because scalable vectors are not "
5377               "available.\n");
5378 ORE->emit([&]() {
5379 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5380 TheLoop->getStartLoc(),
5381 TheLoop->getHeader())
5382 << "User-specified vectorization factor "
5383 << ore::NV("UserVectorizationFactor", UserVF)
5384 << " is ignored because the target does not support scalable "
5385 "vectors. The compiler will pick a more suitable value.";
5386 });
5387 } else {
5388 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5389            << " is unsafe. Ignoring scalable UserVF.\n");
5390 ORE->emit([&]() {
5391 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5392 TheLoop->getStartLoc(),
5393 TheLoop->getHeader())
5394 << "User-specified vectorization factor "
5395 << ore::NV("UserVectorizationFactor", UserVF)
5396 << " is unsafe. Ignoring the hint to let the compiler pick a "
5397 "more suitable value.";
5398 });
5399 }
5400 }
5401
5402 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5403            << " / " << WidestType << " bits.\n");
5404
5405 FixedScalableVFPair Result(ElementCount::getFixed(1),
5406 ElementCount::getScalable(0));
5407 if (auto MaxVF =
5408 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5409 MaxSafeFixedVF, FoldTailByMasking))
5410 Result.FixedVF = MaxVF;
5411
5412 if (auto MaxVF =
5413 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5414 MaxSafeScalableVF, FoldTailByMasking))
5415 if (MaxVF.isScalable()) {
5416 Result.ScalableVF = MaxVF;
5417 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5418            << "\n");
5419 }
5420
5421 return Result;
5422}
5423
5424FixedScalableVFPair
5425LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5426 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5427 // TODO: It may be useful to do this, since it's still likely to be
5428 // dynamically uniform if the target can skip.
5429 reportVectorizationFailure(
5430 "Not inserting runtime ptr check for divergent target",
5431 "runtime pointer checks needed. Not enabled for divergent target",
5432 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5433 return FixedScalableVFPair::getNone();
5434 }
5435
5436 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5437 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5438 if (TC == 1) {
5439 reportVectorizationFailure("Single iteration (non) loop",
5440 "loop trip count is one, irrelevant for vectorization",
5441 "SingleIterationLoop", ORE, TheLoop);
5442 return FixedScalableVFPair::getNone();
5443 }
5444
5445 switch (ScalarEpilogueStatus) {
5446 case CM_ScalarEpilogueAllowed:
5447 return computeFeasibleMaxVF(TC, UserVF, false);
5448 case CM_ScalarEpilogueNotAllowedUsePredicate:
5449 LLVM_FALLTHROUGH;
5450 case CM_ScalarEpilogueNotNeededUsePredicate:
5451 LLVM_DEBUG(
5452     dbgs() << "LV: vector predicate hint/switch found.\n"