Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8109, column 35
Potential leak of memory pointed to by 'BlockMask'
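For orientation, the following is a minimal, hypothetical sketch of the pattern the analyzer's new/delete checkers report as a potential leak; it is not the code at line 8109, and the names are invented:

// Illustrative only: a heap pointer whose last reference is lost on one path.
void example(bool Cond) {
  int *P = new int(42);   // allocation the analyzer starts tracking here
  if (Cond)
    return;               // P is never freed on this path -> "Potential leak of memory pointed to by 'P'"
  delete P;
}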

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize -I include -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-09-04-125545-48738-1 -x c++ /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
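For readers new to the pass, here is a rough sketch of the widening described in the header comment above. It is a hand-written illustration, not output of the vectorizer, and assumes a vectorization factor of 4 with the leftover iterations handled by a scalar epilogue:

// Original scalar loop: one element per iteration.
void scalarLoop(int *a, const int *b, const int *c, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + c[i];
}

// Shape after vectorization by VF = 4: the induction variable now steps by
// the vector width, and the four adds below become a single wide SIMD add.
void vectorizedShape(int *a, const int *b, const int *c, int n) {
  int i = 0;
  for (; i + 3 < n; i += 4) {
    a[i] = b[i] + c[i];
    a[i + 1] = b[i + 1] + c[i + 1];
    a[i + 2] = b[i + 2] + c[i + 2];
    a[i + 3] = b[i + 3] + c[i + 3];
  }
  for (; i < n; ++i)   // scalar epilogue for the remaining n % 4 iterations
    a[i] = b[i] + c[i];
}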
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/None.h"
69#include "llvm/ADT/Optional.h"
70#include "llvm/ADT/STLExtras.h"
71#include "llvm/ADT/SmallPtrSet.h"
72#include "llvm/ADT/SmallSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/OptimizationRemarkEmitter.h"
90#include "llvm/Analysis/ProfileSummaryInfo.h"
91#include "llvm/Analysis/ScalarEvolution.h"
92#include "llvm/Analysis/ScalarEvolutionExpressions.h"
93#include "llvm/Analysis/TargetLibraryInfo.h"
94#include "llvm/Analysis/TargetTransformInfo.h"
95#include "llvm/Analysis/ValueTracking.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/Metadata.h"
116#include "llvm/IR/Module.h"
117#include "llvm/IR/Operator.h"
118#include "llvm/IR/PatternMatch.h"
119#include "llvm/IR/Type.h"
120#include "llvm/IR/Use.h"
121#include "llvm/IR/User.h"
122#include "llvm/IR/Value.h"
123#include "llvm/IR/ValueHandle.h"
124#include "llvm/IR/Verifier.h"
125#include "llvm/InitializePasses.h"
126#include "llvm/Pass.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/InstructionCost.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <functional>
147#include <iterator>
148#include <limits>
149#include <map>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME "loop-vectorize"
158#define DEBUG_TYPE LV_NAME
159
160#ifndef NDEBUG
161const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162#endif
163
164/// @{
165/// Metadata attribute names
166const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167const char LLVMLoopVectorizeFollowupVectorized[] =
168 "llvm.loop.vectorize.followup_vectorized";
169const char LLVMLoopVectorizeFollowupEpilogue[] =
170 "llvm.loop.vectorize.followup_epilogue";
171/// @}
172
173STATISTIC(LoopsVectorized, "Number of loops vectorized");
174STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176
177static cl::opt<bool> EnableEpilogueVectorization(
178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179 cl::desc("Enable vectorization of epilogue loops."));
180
181static cl::opt<unsigned> EpilogueVectorizationForceVF(
182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183 cl::desc("When epilogue vectorization is enabled, and a value greater than "
184 "1 is specified, forces the given VF for all applicable epilogue "
185 "loops."));
186
187static cl::opt<unsigned> EpilogueVectorizationMinVF(
188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189 cl::desc("Only loops with vectorization factor equal to or larger than "
190 "the specified value are considered for epilogue vectorization."));
191
192/// Loops with a known constant trip count below this number are vectorized only
193/// if no scalar iteration overheads are incurred.
194static cl::opt<unsigned> TinyTripCountVectorThreshold(
195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196 cl::desc("Loops with a constant trip count that is smaller than this "
197 "value are vectorized only if no scalar iteration overheads "
198 "are incurred."));
199
200static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
201 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
202 cl::desc("The maximum allowed number of runtime memory checks"));
203
204// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
205// that predication is preferred, and this lists all options. I.e., the
206// vectorizer will try to fold the tail-loop (epilogue) into the vector body
207// and predicate the instructions accordingly. If tail-folding fails, there are
208// different fallback strategies depending on these values:
209namespace PreferPredicateTy {
210 enum Option {
211 ScalarEpilogue = 0,
212 PredicateElseScalarEpilogue,
213 PredicateOrDontVectorize
214 };
215} // namespace PreferPredicateTy
216
217static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218 "prefer-predicate-over-epilogue",
219 cl::init(PreferPredicateTy::ScalarEpilogue),
220 cl::Hidden,
221 cl::desc("Tail-folding and predication preferences over creating a scalar "
222 "epilogue loop."),
223 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224 "scalar-epilogue",
225 "Don't tail-predicate loops, create scalar epilogue"),
226 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227 "predicate-else-scalar-epilogue",
228 "prefer tail-folding, create scalar epilogue if tail "
229 "folding fails."),
230 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231 "predicate-dont-vectorize",
232 "prefers tail-folding, don't attempt vectorization if "
233 "tail-folding fails.")));
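The value names above can be passed straight to opt when experimenting, e.g. "opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S in.ll" (an illustrative invocation; the input file name is hypothetical).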
234
235static cl::opt<bool> MaximizeBandwidth(
236 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237 cl::desc("Maximize bandwidth when selecting vectorization factor which "
238 "will be determined by the smallest type in loop."));
239
240static cl::opt<bool> EnableInterleavedMemAccesses(
241 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243
244/// An interleave-group may need masking if it resides in a block that needs
245/// predication, or in order to mask away gaps.
246static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249
250static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252 cl::desc("We don't interleave loops with a estimated constant trip count "
253 "below this number"));
254
255static cl::opt<unsigned> ForceTargetNumScalarRegs(
256 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257 cl::desc("A flag that overrides the target's number of scalar registers."));
258
259static cl::opt<unsigned> ForceTargetNumVectorRegs(
260 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261 cl::desc("A flag that overrides the target's number of vector registers."));
262
263static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265 cl::desc("A flag that overrides the target's max interleave factor for "
266 "scalar loops."));
267
268static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270 cl::desc("A flag that overrides the target's max interleave factor for "
271 "vectorized loops."));
272
273static cl::opt<unsigned> ForceTargetInstructionCost(
274 "force-target-instruction-cost", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's expected cost for "
276 "an instruction to a single constant value. Mostly "
277 "useful for getting consistent testing."));
278
279static cl::opt<bool> ForceTargetSupportsScalableVectors(
280 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281 cl::desc(
282 "Pretend that scalable vectors are supported, even if the target does "
283 "not support them. This flag should only be used for testing."));
284
285static cl::opt<unsigned> SmallLoopCost(
286 "small-loop-cost", cl::init(20), cl::Hidden,
287 cl::desc(
288 "The cost of a loop that is considered 'small' by the interleaver."));
289
290static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292 cl::desc("Enable the use of the block frequency analysis to access PGO "
293 "heuristics minimizing code growth in cold regions and being more "
294 "aggressive in hot regions."));
295
296// Runtime interleave loops for load/store throughput.
297static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299 cl::desc(
300 "Enable runtime interleaving until load/store ports are saturated"));
301
302/// Interleave small loops with scalar reductions.
303static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305 cl::desc("Enable interleaving for loops with small iteration counts that "
306 "contain scalar reductions to expose ILP."));
307
308/// The number of stores in a loop that are allowed to need predication.
309static cl::opt<unsigned> NumberOfStoresToPredicate(
310 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311 cl::desc("Max number of stores to be predicated behind an if."));
312
313static cl::opt<bool> EnableIndVarRegisterHeur(
314 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315 cl::desc("Count the induction variable only once when interleaving"));
316
317static cl::opt<bool> EnableCondStoresVectorization(
318 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319 cl::desc("Enable if predication of stores during vectorization."));
320
321static cl::opt<unsigned> MaxNestedScalarReductionIC(
322 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323 cl::desc("The maximum interleave count to use when interleaving a scalar "
324 "reduction in a nested loop."));
325
326static cl::opt<bool>
327 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328 cl::Hidden,
329 cl::desc("Prefer in-loop vector reductions, "
330 "overriding the targets preference."));
331
332static cl::opt<bool> ForceOrderedReductions(
333 "force-ordered-reductions", cl::init(false), cl::Hidden,
334 cl::desc("Enable the vectorisation of loops with in-order (strict) "
335 "FP reductions"));
336
337static cl::opt<bool> PreferPredicatedReductionSelect(
338 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339 cl::desc(
340 "Prefer predicating a reduction operation over an after loop select."));
341
342cl::opt<bool> EnableVPlanNativePath(
343 "enable-vplan-native-path", cl::init(false), cl::Hidden,
344 cl::desc("Enable VPlan-native vectorization path with "
345 "support for outer loop vectorization."));
346
347// This flag enables the stress testing of the VPlan H-CFG construction in the
348// VPlan-native vectorization path. It must be used in conjunction with
349// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
350// verification of the H-CFGs built.
351static cl::opt<bool> VPlanBuildStressTest(
352 "vplan-build-stress-test", cl::init(false), cl::Hidden,
353 cl::desc(
354 "Build VPlan for every supported loop nest in the function and bail "
355 "out right after the build (stress test the VPlan H-CFG construction "
356 "in the VPlan-native vectorization path)."));
357
358cl::opt<bool> llvm::EnableLoopInterleaving(
359 "interleave-loops", cl::init(true), cl::Hidden,
360 cl::desc("Enable loop interleaving in Loop vectorization passes"));
361cl::opt<bool> llvm::EnableLoopVectorization(
362 "vectorize-loops", cl::init(true), cl::Hidden,
363 cl::desc("Run the Loop vectorization passes"));
364
365cl::opt<bool> PrintVPlansInDotFormat(
366 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
367 cl::desc("Use dot format instead of plain text when dumping VPlans"));
368
369/// A helper function that returns true if the given type is irregular. The
370/// type is irregular if its allocated size doesn't equal the store size of an
371/// element of the corresponding vector type.
372static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
373 // Determine if an array of N elements of type Ty is "bitcast compatible"
374 // with a <N x Ty> vector.
375 // This is only true if there is no padding between the array elements.
376 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
377}
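A minimal sketch, separate from the pass, of a type this helper treats as irregular. It assumes a typical x86-64 data-layout string, under which x86_fp80 stores 80 bits but is allocated in 128-bit slots, so an array of it is not bitcast-compatible with a vector of it:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

static bool fp80IsIrregular() {
  llvm::LLVMContext Ctx;
  // Hypothetical target description (standard x86-64 layout string).
  llvm::DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  llvm::Type *FP80 = llvm::Type::getX86_FP80Ty(Ctx);
  // Mirrors hasIrregularType: allocated size (128 bits) != type size (80 bits).
  return DL.getTypeAllocSizeInBits(FP80) != DL.getTypeSizeInBits(FP80);
}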
378
379/// A helper function that returns the reciprocal of the block probability of
380/// predicated blocks. If we return X, we are assuming the predicated block
381/// will execute once for every X iterations of the loop header.
382///
383/// TODO: We should use actual block probability here, if available. Currently,
384/// we always assume predicated blocks have a 50% chance of executing.
385static unsigned getReciprocalPredBlockProb() { return 2; }
386
387/// A helper function that returns an integer or floating-point constant with
388/// value C.
389static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391 : ConstantFP::get(Ty, C);
392}
393
394/// Returns "best known" trip count for the specified loop \p L as defined by
395/// the following procedure:
396/// 1) Returns exact trip count if it is known.
397/// 2) Returns expected trip count according to profile data if any.
398/// 3) Returns upper bound estimate if it is known.
399/// 4) Returns None if all of the above failed.
400static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401 // Check if exact trip count is known.
402 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403 return ExpectedTC;
404
405 // Check if there is an expected trip count available from profile data.
406 if (LoopVectorizeWithBlockFrequency)
407 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408 return EstimatedTC;
409
410 // Check if upper bound estimate is known.
411 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412 return ExpectedTC;
413
414 return None;
415}
416
417// Forward declare GeneratedRTChecks.
418class GeneratedRTChecks;
419
420namespace llvm {
421
422AnalysisKey ShouldRunExtraVectorPasses::Key;
423
424/// InnerLoopVectorizer vectorizes loops which contain only one basic
425/// block to a specified vectorization factor (VF).
426/// This class performs the widening of scalars into vectors, or multiple
427/// scalars. This class also implements the following features:
428/// * It inserts an epilogue loop for handling loops that don't have iteration
429/// counts that are known to be a multiple of the vectorization factor.
430/// * It handles the code generation for reduction variables.
431/// * Scalarization (implementation using scalars) of un-vectorizable
432/// instructions.
433/// InnerLoopVectorizer does not perform any vectorization-legality
434/// checks, and relies on the caller to check for the different legality
435/// aspects. The InnerLoopVectorizer relies on the
436/// LoopVectorizationLegality class to provide information about the induction
437/// and reduction variables that were found to a given vectorization factor.
438class InnerLoopVectorizer {
439public:
440 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
441 LoopInfo *LI, DominatorTree *DT,
442 const TargetLibraryInfo *TLI,
443 const TargetTransformInfo *TTI, AssumptionCache *AC,
444 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
445 ElementCount MinProfitableTripCount,
446 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
447 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
448 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
449 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
450 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
451 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
452 PSI(PSI), RTChecks(RTChecks) {
453 // Query this against the original loop and save it here because the profile
454 // of the original loop header may change as the transformation happens.
455 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
456 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
457
458 if (MinProfitableTripCount.isZero())
459 this->MinProfitableTripCount = VecWidth;
460 else
461 this->MinProfitableTripCount = MinProfitableTripCount;
462 }
463
464 virtual ~InnerLoopVectorizer() = default;
465
466 /// Create a new empty loop that will contain vectorized instructions later
467 /// on, while the old loop will be used as the scalar remainder. Control flow
468 /// is generated around the vectorized (and scalar epilogue) loops consisting
469 /// of various checks and bypasses. Return the pre-header block of the new
470 /// loop and the start value for the canonical induction, if it is != 0. The
471 /// latter is the case when vectorizing the epilogue loop. In the case of
472 /// epilogue vectorization, this function is overridden to handle the more
473 /// complex control flow around the loops.
474 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
475
476 /// Widen a single call instruction within the innermost loop.
477 void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
478 VPTransformState &State,
479 Intrinsic::ID VectorIntrinsicID);
480
481 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
482 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
483
484 // Return true if any runtime check is added.
485 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486
487 /// A type for vectorized values in the new loop. Each value from the
488 /// original loop, when vectorized, is represented by UF vector values in the
489 /// new unrolled loop, where UF is the unroll factor.
490 using VectorParts = SmallVector<Value *, 2>;
491
492 /// A helper function to scalarize a single Instruction in the innermost loop.
493 /// Generates a sequence of scalar instances for each lane between \p MinLane
494 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
495 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
496 /// Instr's operands.
497 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
498 const VPIteration &Instance, bool IfPredicateInstr,
499 VPTransformState &State);
500
501 /// Construct the vector value of a scalarized value \p V one lane at a time.
502 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
503 VPTransformState &State);
504
505 /// Try to vectorize interleaved access group \p Group with the base address
506 /// given in \p Addr, optionally masking the vector operations if \p
507 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
508 /// values in the vectorized loop.
509 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
510 ArrayRef<VPValue *> VPDefs,
511 VPTransformState &State, VPValue *Addr,
512 ArrayRef<VPValue *> StoredValues,
513 VPValue *BlockInMask = nullptr);
514
515 /// Fix the non-induction PHIs in \p Plan.
516 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
517
518 /// Returns true if the reordering of FP operations is not allowed, but we are
519 /// able to vectorize with strict in-order reductions for the given RdxDesc.
520 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
521
522 /// Create a broadcast instruction. This method generates a broadcast
523 /// instruction (shuffle) for loop invariant values and for the induction
524 /// value. If this is the induction variable then we extend it to N, N+1, ...
525 /// this is needed because each iteration in the loop corresponds to a SIMD
526 /// element.
527 virtual Value *getBroadcastInstrs(Value *V);
528
529 // Returns the resume value (bc.merge.rdx) for a reduction as
530 // generated by fixReduction.
531 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
532
533protected:
534 friend class LoopVectorizationPlanner;
535
536 /// A small list of PHINodes.
537 using PhiVector = SmallVector<PHINode *, 4>;
538
539 /// A type for scalarized values in the new loop. Each value from the
540 /// original loop, when scalarized, is represented by UF x VF scalar values
541 /// in the new unrolled loop, where UF is the unroll factor and VF is the
542 /// vectorization factor.
543 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
544
545 /// Set up the values of the IVs correctly when exiting the vector loop.
546 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
547 Value *VectorTripCount, Value *EndValue,
548 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
549 VPlan &Plan);
550
551 /// Handle all cross-iteration phis in the header.
552 void fixCrossIterationPHIs(VPTransformState &State);
553
554 /// Create the exit value of first order recurrences in the middle block and
555 /// update their users.
556 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
557 VPTransformState &State);
558
559 /// Create code for the loop exit value of the reduction.
560 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
561
562 /// Clear NSW/NUW flags from reduction instructions if necessary.
563 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
564 VPTransformState &State);
565
566 /// Iteratively sink the scalarized operands of a predicated instruction into
567 /// the block that was created for it.
568 void sinkScalarOperands(Instruction *PredInst);
569
570 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
571 /// represented as.
572 void truncateToMinimalBitwidths(VPTransformState &State);
573
574 /// Returns (and creates if needed) the original loop trip count.
575 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
576
577 /// Returns (and creates if needed) the trip count of the widened loop.
578 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
579
580 /// Returns a bitcasted value to the requested vector type.
581 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
582 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
583 const DataLayout &DL);
584
585 /// Emit a bypass check to see if the vector trip count is zero, including if
586 /// it overflows.
587 void emitIterationCountCheck(BasicBlock *Bypass);
588
589 /// Emit a bypass check to see if all of the SCEV assumptions we've
590 /// had to make are correct. Returns the block containing the checks or
591 /// nullptr if no checks have been added.
592 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
593
594 /// Emit bypass checks to check any memory assumptions we may have made.
595 /// Returns the block containing the checks or nullptr if no checks have been
596 /// added.
597 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
598
599 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
600 /// vector loop preheader, middle block and scalar preheader.
601 void createVectorLoopSkeleton(StringRef Prefix);
602
603 /// Create new phi nodes for the induction variables to resume iteration count
604 /// in the scalar epilogue, from where the vectorized loop left off.
605 /// In cases where the loop skeleton is more complicated (eg. epilogue
606 /// vectorization) and the resume values can come from an additional bypass
607 /// block, the \p AdditionalBypass pair provides information about the bypass
608 /// block and the end value on the edge from bypass to this loop.
609 void createInductionResumeValues(
610 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
611
612 /// Complete the loop skeleton by adding debug MDs, creating appropriate
613 /// conditional branches in the middle block, preparing the builder and
614 /// running the verifier. Return the preheader of the completed vector loop.
615 BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
616
617 /// Collect poison-generating recipes that may generate a poison value that is
618 /// used after vectorization, even when their operands are not poison. Those
619 /// recipes meet the following conditions:
620 /// * Contribute to the address computation of a recipe generating a widen
621 /// memory load/store (VPWidenMemoryInstructionRecipe or
622 /// VPInterleaveRecipe).
623 /// * Such a widen memory load/store has at least one underlying Instruction
624 /// that is in a basic block that needs predication and after vectorization
625 /// the generated instruction won't be predicated.
626 void collectPoisonGeneratingRecipes(VPTransformState &State);
627
628 /// Allow subclasses to override and print debug traces before/after vplan
629 /// execution, when trace information is requested.
630 virtual void printDebugTracesAtStart(){};
631 virtual void printDebugTracesAtEnd(){};
632
633 /// The original loop.
634 Loop *OrigLoop;
635
636 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
637 /// dynamic knowledge to simplify SCEV expressions and converts them to a
638 /// more usable form.
639 PredicatedScalarEvolution &PSE;
640
641 /// Loop Info.
642 LoopInfo *LI;
643
644 /// Dominator Tree.
645 DominatorTree *DT;
646
647 /// Alias Analysis.
648 AAResults *AA;
649
650 /// Target Library Info.
651 const TargetLibraryInfo *TLI;
652
653 /// Target Transform Info.
654 const TargetTransformInfo *TTI;
655
656 /// Assumption Cache.
657 AssumptionCache *AC;
658
659 /// Interface to emit optimization remarks.
660 OptimizationRemarkEmitter *ORE;
661
662 /// The vectorization SIMD factor to use. Each vector will have this many
663 /// vector elements.
664 ElementCount VF;
665
666 ElementCount MinProfitableTripCount;
667
668 /// The vectorization unroll factor to use. Each scalar is vectorized to this
669 /// many different vector instructions.
670 unsigned UF;
671
672 /// The builder that we use
673 IRBuilder<> Builder;
674
675 // --- Vectorization state ---
676
677 /// The vector-loop preheader.
678 BasicBlock *LoopVectorPreHeader;
679
680 /// The scalar-loop preheader.
681 BasicBlock *LoopScalarPreHeader;
682
683 /// Middle Block between the vector and the scalar.
684 BasicBlock *LoopMiddleBlock;
685
686 /// The unique ExitBlock of the scalar loop if one exists. Note that
687 /// there can be multiple exiting edges reaching this block.
688 BasicBlock *LoopExitBlock;
689
690 /// The scalar loop body.
691 BasicBlock *LoopScalarBody;
692
693 /// A list of all bypass blocks. The first block is the entry of the loop.
694 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
695
696 /// Store instructions that were predicated.
697 SmallVector<Instruction *, 4> PredicatedInstructions;
698
699 /// Trip count of the original loop.
700 Value *TripCount = nullptr;
701
702 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
703 Value *VectorTripCount = nullptr;
704
705 /// The legality analysis.
706 LoopVectorizationLegality *Legal;
707
708 /// The profitability analysis.
709 LoopVectorizationCostModel *Cost;
710
711 // Record whether runtime checks are added.
712 bool AddedSafetyChecks = false;
713
714 // Holds the end values for each induction variable. We save the end values
715 // so we can later fix-up the external users of the induction variables.
716 DenseMap<PHINode *, Value *> IVEndValues;
717
718 /// BFI and PSI are used to check for profile guided size optimizations.
719 BlockFrequencyInfo *BFI;
720 ProfileSummaryInfo *PSI;
721
722 // Whether this loop should be optimized for size based on profile guided size
723 // optimizations.
724 bool OptForSizeBasedOnProfile;
725
726 /// Structure to hold information about generated runtime checks, responsible
727 /// for cleaning the checks, if vectorization turns out unprofitable.
728 GeneratedRTChecks &RTChecks;
729
730 // Holds the resume values for reductions in the loops, used to set the
731 // correct start value of reduction PHIs when vectorizing the epilogue.
732 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
733 ReductionResumeValues;
734};
735
736class InnerLoopUnroller : public InnerLoopVectorizer {
737public:
738 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
739 LoopInfo *LI, DominatorTree *DT,
740 const TargetLibraryInfo *TLI,
741 const TargetTransformInfo *TTI, AssumptionCache *AC,
742 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
743 LoopVectorizationLegality *LVL,
744 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
745 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
746 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
747 ElementCount::getFixed(1),
748 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
749 BFI, PSI, Check) {}
750
751private:
752 Value *getBroadcastInstrs(Value *V) override;
753};
754
755/// Encapsulate information regarding vectorization of a loop and its epilogue.
756/// This information is meant to be updated and used across two stages of
757/// epilogue vectorization.
758struct EpilogueLoopVectorizationInfo {
759 ElementCount MainLoopVF = ElementCount::getFixed(0);
760 unsigned MainLoopUF = 0;
761 ElementCount EpilogueVF = ElementCount::getFixed(0);
762 unsigned EpilogueUF = 0;
763 BasicBlock *MainLoopIterationCountCheck = nullptr;
764 BasicBlock *EpilogueIterationCountCheck = nullptr;
765 BasicBlock *SCEVSafetyCheck = nullptr;
766 BasicBlock *MemSafetyCheck = nullptr;
767 Value *TripCount = nullptr;
768 Value *VectorTripCount = nullptr;
769
770 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
771 ElementCount EVF, unsigned EUF)
772 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
773 assert(EUF == 1 &&
774 "A high UF for the epilogue loop is likely not beneficial.");
776};
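A small, illustrative construction of this struct (values chosen arbitrarily, not taken from the pass); note the assert above pins the epilogue unroll factor to 1:

// Main loop: VF = 16, UF = 2; epilogue loop: VF = 8, UF = 1.
EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(16), /*MUF=*/2,
                                  ElementCount::getFixed(8), /*EUF=*/1);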
777
778/// An extension of the inner loop vectorizer that creates a skeleton for a
779/// vectorized loop that has its epilogue (residual) also vectorized.
780/// The idea is to run the vplan on a given loop twice, firstly to setup the
781/// skeleton and vectorize the main loop, and secondly to complete the skeleton
782/// from the first step and vectorize the epilogue. This is achieved by
783/// deriving two concrete strategy classes from this base class and invoking
784/// them in succession from the loop vectorizer planner.
785class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
786public:
787 InnerLoopAndEpilogueVectorizer(
788 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
789 DominatorTree *DT, const TargetLibraryInfo *TLI,
790 const TargetTransformInfo *TTI, AssumptionCache *AC,
791 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
792 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
793 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
794 GeneratedRTChecks &Checks)
795 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
796 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
797 CM, BFI, PSI, Checks),
798 EPI(EPI) {}
799
800 // Override this function to handle the more complex control flow around the
801 // three loops.
802 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
803 return createEpilogueVectorizedLoopSkeleton();
804 }
805
806 /// The interface for creating a vectorized skeleton using one of two
807 /// different strategies, each corresponding to one execution of the vplan
808 /// as described above.
809 virtual std::pair<BasicBlock *, Value *>
810 createEpilogueVectorizedLoopSkeleton() = 0;
811
812 /// Holds and updates state information required to vectorize the main loop
813 /// and its epilogue in two separate passes. This setup helps us avoid
814 /// regenerating and recomputing runtime safety checks. It also helps us to
815 /// shorten the iteration-count-check path length for the cases where the
816 /// iteration count of the loop is so small that the main vector loop is
817 /// completely skipped.
818 EpilogueLoopVectorizationInfo &EPI;
819};
820
821/// A specialized derived class of inner loop vectorizer that performs
822/// vectorization of *main* loops in the process of vectorizing loops and their
823/// epilogues.
824class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
825public:
826 EpilogueVectorizerMainLoop(
827 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
828 DominatorTree *DT, const TargetLibraryInfo *TLI,
829 const TargetTransformInfo *TTI, AssumptionCache *AC,
830 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
831 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
832 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
833 GeneratedRTChecks &Check)
834 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
835 EPI, LVL, CM, BFI, PSI, Check) {}
836 /// Implements the interface for creating a vectorized skeleton using the
837 /// *main loop* strategy (ie the first pass of vplan execution).
838 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
839
840protected:
841 /// Emits an iteration count bypass check once for the main loop (when \p
842 /// ForEpilogue is false) and once for the epilogue loop (when \p
843 /// ForEpilogue is true).
844 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
845 void printDebugTracesAtStart() override;
846 void printDebugTracesAtEnd() override;
847};
848
849// A specialized derived class of inner loop vectorizer that performs
850// vectorization of *epilogue* loops in the process of vectorizing loops and
851// their epilogues.
852class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
853public:
854 EpilogueVectorizerEpilogueLoop(
855 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
856 DominatorTree *DT, const TargetLibraryInfo *TLI,
857 const TargetTransformInfo *TTI, AssumptionCache *AC,
858 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
859 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
860 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
861 GeneratedRTChecks &Checks)
862 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
863 EPI, LVL, CM, BFI, PSI, Checks) {
864 TripCount = EPI.TripCount;
865 }
866 /// Implements the interface for creating a vectorized skeleton using the
867 /// *epilogue loop* strategy (ie the second pass of vplan execution).
868 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
869
870protected:
871 /// Emits an iteration count bypass check after the main vector loop has
872 /// finished to see if there are any iterations left to execute by either
873 /// the vector epilogue or the scalar epilogue.
874 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
875 BasicBlock *Bypass,
876 BasicBlock *Insert);
877 void printDebugTracesAtStart() override;
878 void printDebugTracesAtEnd() override;
879};
880} // end namespace llvm
881
882/// Look for a meaningful debug location on the instruction or its
883/// operands.
884static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
885 if (!I)
886 return I;
887
888 DebugLoc Empty;
889 if (I->getDebugLoc() != Empty)
890 return I;
891
892 for (Use &Op : I->operands()) {
893 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
894 if (OpInst->getDebugLoc() != Empty)
895 return OpInst;
896 }
897
898 return I;
899}
900
901/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
902/// is passed, the message relates to that particular instruction.
903#ifndef NDEBUG
904static void debugVectorizationMessage(const StringRef Prefix,
905 const StringRef DebugMsg,
906 Instruction *I) {
907 dbgs() << "LV: " << Prefix << DebugMsg;
908 if (I != nullptr)
909 dbgs() << " " << *I;
910 else
911 dbgs() << '.';
912 dbgs() << '\n';
913}
914#endif
915
916/// Create an analysis remark that explains why vectorization failed
917///
918/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
919/// RemarkName is the identifier for the remark. If \p I is passed it is an
920/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
921/// the location of the remark. \return the remark object that can be
922/// streamed to.
923static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
924 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
925 Value *CodeRegion = TheLoop->getHeader();
926 DebugLoc DL = TheLoop->getStartLoc();
927
928 if (I) {
929 CodeRegion = I->getParent();
930 // If there is no debug location attached to the instruction, revert back to
931 // using the loop's.
932 if (I->getDebugLoc())
933 DL = I->getDebugLoc();
934 }
935
936 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
937}
938
939namespace llvm {
940
941/// Return a value for Step multiplied by VF.
942Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
943 int64_t Step) {
944 assert(Ty->isIntegerTy() && "Expected an integer step");
945 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
946 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
947}
948
949/// Return the runtime value for VF.
950Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
951 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
952 return VF.isScalable() ? B.CreateVScale(EC) : EC;
953}
954
955static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
956 ElementCount VF) {
957 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
958 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
959 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
960 return B.CreateUIToFP(RuntimeVF, FTy);
961}
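A hedged sketch of a call site for the first of these helpers, written as it might appear elsewhere in this file (the wrapper name is invented):

// Computes the IR value for VF * UF, e.g. to advance the canonical IV.
// For a fixed VF of 8 and UF of 2 this folds to the constant i64 16; for a
// scalable VF it becomes vscale * 16.
static llvm::Value *computeWideStep(llvm::IRBuilderBase &B,
                                    llvm::ElementCount VF, int64_t UF) {
  return llvm::createStepForVF(B, B.getInt64Ty(), VF, UF);
}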
962
963void reportVectorizationFailure(const StringRef DebugMsg,
964 const StringRef OREMsg, const StringRef ORETag,
965 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
966 Instruction *I) {
967 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
968 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
969 ORE->emit(
970 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
971 << "loop not vectorized: " << OREMsg);
972}
973
974void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
975 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
976 Instruction *I) {
977 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
978 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
979 ORE->emit(
980 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
981 << Msg);
982}
983
984} // end namespace llvm
985
986#ifndef NDEBUG
987/// \return string containing a file name and a line # for the given loop.
988static std::string getDebugLocString(const Loop *L) {
989 std::string Result;
990 if (L) {
991 raw_string_ostream OS(Result);
992 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
993 LoopDbgLoc.print(OS);
994 else
995 // Just print the module name.
996 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
997 OS.flush();
998 }
999 return Result;
1000}
1001#endif
1002
1003void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1004 VPTransformState &State) {
1005
1006 // Collect recipes in the backward slice of `Root` that may generate a poison
1007 // value that is used after vectorization.
1008 SmallPtrSet<VPRecipeBase *, 16> Visited;
1009 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1010 SmallVector<VPRecipeBase *, 16> Worklist;
1011 Worklist.push_back(Root);
1012
1013 // Traverse the backward slice of Root through its use-def chain.
1014 while (!Worklist.empty()) {
1015 VPRecipeBase *CurRec = Worklist.back();
1016 Worklist.pop_back();
1017
1018 if (!Visited.insert(CurRec).second)
1019 continue;
1020
1021 // Prune search if we find another recipe generating a widen memory
1022 // instruction. Widen memory instructions involved in address computation
1023 // will lead to gather/scatter instructions, which don't need to be
1024 // handled.
1025 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1026 isa<VPInterleaveRecipe>(CurRec) ||
1027 isa<VPScalarIVStepsRecipe>(CurRec) ||
1028 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1029 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1030 continue;
1031
1032 // This recipe contributes to the address computation of a widen
1033 // load/store. Collect recipe if its underlying instruction has
1034 // poison-generating flags.
1035 Instruction *Instr = CurRec->getUnderlyingInstr();
1036 if (Instr && Instr->hasPoisonGeneratingFlags())
1037 State.MayGeneratePoisonRecipes.insert(CurRec);
1038
1039 // Add new definitions to the worklist.
1040 for (VPValue *operand : CurRec->operands())
1041 if (VPDef *OpDef = operand->getDef())
1042 Worklist.push_back(cast<VPRecipeBase>(OpDef));
1043 }
1044 });
1045
1046 // Traverse all the recipes in the VPlan and collect the poison-generating
1047 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1048 // VPInterleaveRecipe.
1049 auto Iter = depth_first(
1050 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1051 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1052 for (VPRecipeBase &Recipe : *VPBB) {
1053 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1054 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1055 VPDef *AddrDef = WidenRec->getAddr()->getDef();
1056 if (AddrDef && WidenRec->isConsecutive() &&
1057 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1058 collectPoisonGeneratingInstrsInBackwardSlice(
1059 cast<VPRecipeBase>(AddrDef));
1060 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1061 VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1062 if (AddrDef) {
1063 // Check if any member of the interleave group needs predication.
1064 const InterleaveGroup<Instruction> *InterGroup =
1065 InterleaveRec->getInterleaveGroup();
1066 bool NeedPredication = false;
1067 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1068 I < NumMembers; ++I) {
1069 Instruction *Member = InterGroup->getMember(I);
1070 if (Member)
1071 NeedPredication |=
1072 Legal->blockNeedsPredication(Member->getParent());
1073 }
1074
1075 if (NeedPredication)
1076 collectPoisonGeneratingInstrsInBackwardSlice(
1077 cast<VPRecipeBase>(AddrDef));
1078 }
1079 }
1080 }
1081 }
1082}
1083
1084PHINode *InnerLoopVectorizer::getReductionResumeValue(
1085 const RecurrenceDescriptor &RdxDesc) {
1086 auto It = ReductionResumeValues.find(&RdxDesc);
1087 assert(It != ReductionResumeValues.end() &&
1088 "Expected to find a resume value for the reduction.");
1089 return It->second;
1090}
1091
1092namespace llvm {
1093
1094// Loop vectorization cost-model hints how the scalar epilogue loop should be
1095// lowered.
1096enum ScalarEpilogueLowering {
1097
1098 // The default: allowing scalar epilogues.
1099 CM_ScalarEpilogueAllowed,
1100
1101 // Vectorization with OptForSize: don't allow epilogues.
1102 CM_ScalarEpilogueNotAllowedOptSize,
1103
1104 // A special case of vectorisation with OptForSize: loops with a very small
1105 // trip count are considered for vectorization under OptForSize, thereby
1106 // making sure the cost of their loop body is dominant, free of runtime
1107 // guards and scalar iteration overheads.
1108 CM_ScalarEpilogueNotAllowedLowTripLoop,
1109
1110 // Loop hint predicate indicating an epilogue is undesired.
1111 CM_ScalarEpilogueNotNeededUsePredicate,
1112
1113 // Directive indicating we must either tail fold or not vectorize
1114 CM_ScalarEpilogueNotAllowedUsePredicate
1115};
1116
1117/// ElementCountComparator creates a total ordering for ElementCount
1118/// for the purposes of using it in a set structure.
1119struct ElementCountComparator {
1120 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1121 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1122 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1123 }
1124};
1125using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
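A brief, illustrative use of this set type (not from the pass): the comparator gives ElementCount a strict weak ordering (fixed VFs compare before scalable ones, then by minimum element count) so the values can live in SmallSet's std::set fallback:

static ElementCountSet exampleCandidateVFs() {
  ElementCountSet VFs;
  VFs.insert(ElementCount::getFixed(4));
  VFs.insert(ElementCount::getFixed(8));
  VFs.insert(ElementCount::getScalable(4)); // compares after any fixed VF
  return VFs;
}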
1126
1127/// LoopVectorizationCostModel - estimates the expected speedups due to
1128/// vectorization.
1129/// In many cases vectorization is not profitable. This can happen for a
1130/// number of reasons. In this class we mainly attempt to predict the
1131/// expected speedup/slowdowns due to the supported instruction set. We use the
1132/// TargetTransformInfo to query the different backends for the cost of
1133/// different operations.
1134class LoopVectorizationCostModel {
1135public:
1136 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1137 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1138 LoopVectorizationLegality *Legal,
1139 const TargetTransformInfo &TTI,
1140 const TargetLibraryInfo *TLI, DemandedBits *DB,
1141 AssumptionCache *AC,
1142 OptimizationRemarkEmitter *ORE, const Function *F,
1143 const LoopVectorizeHints *Hints,
1144 InterleavedAccessInfo &IAI)
1145 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1146 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1147 Hints(Hints), InterleaveInfo(IAI) {}
1148
1149 /// \return An upper bound for the vectorization factors (both fixed and
1150 /// scalable). If the factors are 0, vectorization and interleaving should be
1151 /// avoided up front.
1152 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1153
1154 /// \return True if runtime checks are required for vectorization, and false
1155 /// otherwise.
1156 bool runtimeChecksRequired();
1157
1158 /// \return The most profitable vectorization factor and the cost of that VF.
1159 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1160 /// then this vectorization factor will be selected if vectorization is
1161 /// possible.
1162 VectorizationFactor
1163 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1164
1165 VectorizationFactor
1166 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1167 const LoopVectorizationPlanner &LVP);
1168
1169 /// Setup cost-based decisions for user vectorization factor.
1170 /// \return true if the UserVF is a feasible VF to be chosen.
1171 bool selectUserVectorizationFactor(ElementCount UserVF) {
1172 collectUniformsAndScalars(UserVF);
1173 collectInstsToScalarize(UserVF);
1174 return expectedCost(UserVF).first.isValid();
1175 }
1176
1177 /// \return The size (in bits) of the smallest and widest types in the code
1178 /// that needs to be vectorized. We ignore values that remain scalar such as
1179 /// 64 bit loop indices.
1180 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1181
1182 /// \return The desired interleave count.
1183 /// If interleave count has been specified by metadata it will be returned.
1184 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1185 /// are the selected vectorization factor and the cost of the selected VF.
1186 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1187
1188 /// A memory access instruction may be vectorized in more than one way.
1189 /// The form of the instruction after vectorization depends on cost.
1190 /// This function takes cost-based decisions for Load/Store instructions
1191 /// and collects them in a map. This decision map is used for building
1192 /// the lists of loop-uniform and loop-scalar instructions.
1193 /// The calculated cost is saved with widening decision in order to
1194 /// avoid redundant calculations.
1195 void setCostBasedWideningDecision(ElementCount VF);
1196
1197 /// A struct that represents some properties of the register usage
1198 /// of a loop.
1199 struct RegisterUsage {
1200 /// Holds the number of loop invariant values that are used in the loop.
1201 /// The key is ClassID of target-provided register class.
1202 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1203 /// Holds the maximum number of concurrent live intervals in the loop.
1204 /// The key is ClassID of target-provided register class.
1205 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1206 };
1207
1208 /// \return Returns information about the register usage of the loop for the
1209 /// given vectorization factors.
1210 SmallVector<RegisterUsage, 8>
1211 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1212
1213 /// Collect values we want to ignore in the cost model.
1214 void collectValuesToIgnore();
1215
1216 /// Collect all element types in the loop for which widening is needed.
1217 void collectElementTypesForWidening();
1218
1219 /// Split reductions into those that happen in the loop, and those that happen
1220 /// outside. In-loop reductions are collected into InLoopReductionChains.
1221 void collectInLoopReductions();
1222
1223 /// Returns true if we should use strict in-order reductions for the given
1224 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1225 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1226 /// of FP operations.
1227 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1228 return !Hints->allowReordering() && RdxDesc.isOrdered();
1229 }
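  // A hedged illustration (not part of this file) of the kind of reduction the
  // check above guards: a floating-point sum that must keep its original
  // evaluation order unless reordering is explicitly allowed.
  //
  //   float S = 0.0f;
  //   for (int I = 0; I < N; ++I)
  //     S += A[I];   // in-order (strict) FP reduction unless reassociation is
  //                  // permitted via fast-math flags or a loop hint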
1230
1231 /// \returns The smallest bitwidth each instruction can be represented with.
1232 /// The vector equivalents of these instructions should be truncated to this
1233 /// type.
1234 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1235 return MinBWs;
1236 }
1237
1238 /// \returns True if it is more profitable to scalarize instruction \p I for
1239 /// vectorization factor \p VF.
1240 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1241   assert(VF.isVector() &&
1242          "Profitable to scalarize relevant only for VF > 1.");
1243
1244 // Cost model is not run in the VPlan-native path - return conservative
1245 // result until this changes.
1246 if (EnableVPlanNativePath)
1247 return false;
1248
1249 auto Scalars = InstsToScalarize.find(VF);
1250   assert(Scalars != InstsToScalarize.end() &&
1251          "VF not yet analyzed for scalarization profitability");
1252 return Scalars->second.find(I) != Scalars->second.end();
1253 }
1254
1255 /// Returns true if \p I is known to be uniform after vectorization.
1256 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1257 if (VF.isScalar())
1258 return true;
1259
1260 // Cost model is not run in the VPlan-native path - return conservative
1261 // result until this changes.
1262 if (EnableVPlanNativePath)
1263 return false;
1264
1265 auto UniformsPerVF = Uniforms.find(VF);
1266   assert(UniformsPerVF != Uniforms.end() &&
1267          "VF not yet analyzed for uniformity");
1268 return UniformsPerVF->second.count(I);
1269 }
1270
1271 /// Returns true if \p I is known to be scalar after vectorization.
1272 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1273 if (VF.isScalar())
1274 return true;
1275
1276 // Cost model is not run in the VPlan-native path - return conservative
1277 // result until this changes.
1278 if (EnableVPlanNativePath)
1279 return false;
1280
1281 auto ScalarsPerVF = Scalars.find(VF);
1282   assert(ScalarsPerVF != Scalars.end() &&
1283          "Scalar values are not calculated for VF");
1284 return ScalarsPerVF->second.count(I);
1285 }
1286
1287 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1288 /// for vectorization factor \p VF.
1289 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1290 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1291 !isProfitableToScalarize(I, VF) &&
1292 !isScalarAfterVectorization(I, VF);
1293 }
1294
1295 /// Decision that was taken during cost calculation for memory instruction.
1296 enum InstWidening {
1297 CM_Unknown,
1298 CM_Widen, // For consecutive accesses with stride +1.
1299 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1300 CM_Interleave,
1301 CM_GatherScatter,
1302 CM_Scalarize
1303 };
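  // A hedged illustration (not from this file) of the decisions typical memory
  // accesses in a scalar loop tend to receive; the actual choice is made by
  // the cost model below and depends on the target:
  //
  //   for (int I = 0; I < N; ++I) {
  //     Sum += A[I];         // stride +1 access      -> likely CM_Widen
  //     Rev += A[N - 1 - I]; // stride -1 access      -> likely CM_Widen_Reverse
  //     Gat += A[Idx[I]];    // indexed/random access -> CM_GatherScatter or
  //                          //                          CM_Scalarize
  //   }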
1304
1305 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1306 /// instruction \p I and vector width \p VF.
1307 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1308 InstructionCost Cost) {
1309   assert(VF.isVector() && "Expected VF >=2");
1310 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1311 }
1312
1313 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1314 /// interleaving group \p Grp and vector width \p VF.
1315 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1316 ElementCount VF, InstWidening W,
1317 InstructionCost Cost) {
1318   assert(VF.isVector() && "Expected VF >=2");
1319   /// Broadcast this decision to all instructions inside the group.
1320 /// But the cost will be assigned to one instruction only.
1321 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1322 if (auto *I = Grp->getMember(i)) {
1323 if (Grp->getInsertPos() == I)
1324 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1325 else
1326 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1327 }
1328 }
1329 }
1330
1331 /// Return the cost model decision for the given instruction \p I and vector
1332 /// width \p VF. Return CM_Unknown if this instruction did not pass
1333 /// through the cost modeling.
1334 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1335   assert(VF.isVector() && "Expected VF to be a vector VF");
1336 // Cost model is not run in the VPlan-native path - return conservative
1337 // result until this changes.
1338 if (EnableVPlanNativePath)
1339 return CM_GatherScatter;
1340
1341 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1342 auto Itr = WideningDecisions.find(InstOnVF);
1343 if (Itr == WideningDecisions.end())
1344 return CM_Unknown;
1345 return Itr->second.first;
1346 }
1347
1348 /// Return the vectorization cost for the given instruction \p I and vector
1349 /// width \p VF.
1350 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1351   assert(VF.isVector() && "Expected VF >=2");
1352 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1353   assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1354          "The cost is not calculated");
1355 return WideningDecisions[InstOnVF].second;
1356 }
1357
1358 /// Return True if instruction \p I is an optimizable truncate whose operand
1359 /// is an induction variable. Such a truncate will be removed by adding a new
1360 /// induction variable with the destination type.
1361 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1362 // If the instruction is not a truncate, return false.
1363 auto *Trunc = dyn_cast<TruncInst>(I);
1364 if (!Trunc)
1365 return false;
1366
1367 // Get the source and destination types of the truncate.
1368 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1369 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1370
1371 // If the truncate is free for the given types, return false. Replacing a
1372 // free truncate with an induction variable would add an induction variable
1373 // update instruction to each iteration of the loop. We exclude from this
1374 // check the primary induction variable since it will need an update
1375 // instruction regardless.
1376 Value *Op = Trunc->getOperand(0);
1377 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1378 return false;
1379
1380 // If the truncated value is not an induction variable, return false.
1381 return Legal->isInductionPhi(Op);
1382 }
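  // A minimal illustrative sketch (not part of this file) of the pattern the
  // check above targets: the truncate of the induction variable below can be
  // removed by introducing a new induction variable of the destination type.
  //
  //   for (int64_t I = 0; I < N; ++I) // I is an induction PHI
  //     Out[I] = (int32_t)I;          // trunc of the IV; replaceable by an
  //                                   // i32 induction running alongside I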
1383
1384 /// Collects the instructions to scalarize for each predicated instruction in
1385 /// the loop.
1386 void collectInstsToScalarize(ElementCount VF);
1387
1388 /// Collect Uniform and Scalar values for the given \p VF.
1389 /// The sets depend on CM decision for Load/Store instructions
1390 /// that may be vectorized as interleave, gather-scatter or scalarized.
1391 void collectUniformsAndScalars(ElementCount VF) {
1392 // Do the analysis once.
1393 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1394 return;
1395 setCostBasedWideningDecision(VF);
1396 collectLoopUniforms(VF);
1397 collectLoopScalars(VF);
1398 }
1399
1400 /// Returns true if the target machine supports masked store operation
1401 /// for the given \p DataType and kind of access to \p Ptr.
1402 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1403 return Legal->isConsecutivePtr(DataType, Ptr) &&
1404 TTI.isLegalMaskedStore(DataType, Alignment);
1405 }
1406
1407 /// Returns true if the target machine supports masked load operation
1408 /// for the given \p DataType and kind of access to \p Ptr.
1409 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1410 return Legal->isConsecutivePtr(DataType, Ptr) &&
1411 TTI.isLegalMaskedLoad(DataType, Alignment);
1412 }
1413
1414 /// Returns true if the target machine can represent \p V as a masked gather
1415 /// or scatter operation.
1416 bool isLegalGatherOrScatter(Value *V,
1417 ElementCount VF = ElementCount::getFixed(1)) {
1418 bool LI = isa<LoadInst>(V);
1419 bool SI = isa<StoreInst>(V);
1420 if (!LI && !SI)
1421 return false;
1422 auto *Ty = getLoadStoreType(V);
1423 Align Align = getLoadStoreAlignment(V);
1424 if (VF.isVector())
1425 Ty = VectorType::get(Ty, VF);
1426 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1427 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1428 }
1429
1430 /// Returns true if the target machine supports all of the reduction
1431 /// variables found for the given VF.
1432 bool canVectorizeReductions(ElementCount VF) const {
1433 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1434 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1435 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1436 }));
1437 }
1438
1439 /// Returns true if \p I is an instruction which requires predication and
1440 /// for which our chosen predication strategy is scalarization (i.e. we
1441 /// don't have an alternate strategy such as masking available).
1442 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1443 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1444
1445 /// Returns true if \p I is an instruction that needs to be predicated
1446 /// at runtime. The result is independent of the predication mechanism.
1447 /// Superset of instructions that return true for isScalarWithPredication.
1448 bool isPredicatedInst(Instruction *I) const;
1449
1450 /// Returns true if \p I is a memory instruction with consecutive memory
1451 /// access that can be widened.
1452 bool
1453 memoryInstructionCanBeWidened(Instruction *I,
1454 ElementCount VF = ElementCount::getFixed(1));
1455
1456 /// Returns true if \p I is a memory instruction in an interleaved-group
1457 /// of memory accesses that can be vectorized with wide vector loads/stores
1458 /// and shuffles.
1459 bool
1460 interleavedAccessCanBeWidened(Instruction *I,
1461 ElementCount VF = ElementCount::getFixed(1));
1462
1463 /// Check if \p Instr belongs to any interleaved access group.
1464 bool isAccessInterleaved(Instruction *Instr) {
1465 return InterleaveInfo.isInterleaved(Instr);
1466 }
1467
1468 /// Get the interleaved access group that \p Instr belongs to.
1469 const InterleaveGroup<Instruction> *
1470 getInterleavedAccessGroup(Instruction *Instr) {
1471 return InterleaveInfo.getInterleaveGroup(Instr);
1472 }
1473
1474 /// Returns true if we're required to use a scalar epilogue for at least
1475 /// the final iteration of the original loop.
1476 bool requiresScalarEpilogue(ElementCount VF) const {
1477 if (!isScalarEpilogueAllowed())
1478 return false;
1479 // If we might exit from anywhere but the latch, must run the exiting
1480 // iteration in scalar form.
1481 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1482 return true;
1483 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1484 }
1485
1486   /// Returns true if a scalar epilogue is allowed, i.e. not disallowed due to
1487   /// optsize or a loop hint annotation.
1488 bool isScalarEpilogueAllowed() const {
1489 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1490 }
1491
1492 /// Returns true if all loop blocks should be masked to fold tail loop.
1493 bool foldTailByMasking() const { return FoldTailByMasking; }
1494
1495   /// Returns true if we're tail-folding and want to use the active lane mask
1496 /// for vector loop control flow.
1497 bool useActiveLaneMaskForControlFlow() const {
1498 return FoldTailByMasking &&
1499 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1500 }
1501
1502   /// Returns true if the instructions in this block require predication
1503 /// for any reason, e.g. because tail folding now requires a predicate
1504 /// or because the block in the original loop was predicated.
1505 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1506 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1507 }
1508
1509 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1510 /// nodes to the chain of instructions representing the reductions. Uses a
1511 /// MapVector to ensure deterministic iteration order.
1512 using ReductionChainMap =
1513 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1514
1515 /// Return the chain of instructions representing an inloop reduction.
1516 const ReductionChainMap &getInLoopReductionChains() const {
1517 return InLoopReductionChains;
1518 }
1519
1520 /// Returns true if the Phi is part of an inloop reduction.
1521 bool isInLoopReduction(PHINode *Phi) const {
1522 return InLoopReductionChains.count(Phi);
1523 }
1524
1525 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1526 /// with factor VF. Return the cost of the instruction, including
1527 /// scalarization overhead if it's needed.
1528 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1529
1530 /// Estimate cost of a call instruction CI if it were vectorized with factor
1531 /// VF. Return the cost of the instruction, including scalarization overhead
1532 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1533   /// scalarized -
1534   /// i.e. either a vector version isn't available, or it is too expensive.
1535 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1536 bool &NeedToScalarize) const;
1537
1538 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1539 /// that of B.
1540 bool isMoreProfitable(const VectorizationFactor &A,
1541 const VectorizationFactor &B) const;
1542
1543 /// Invalidates decisions already taken by the cost model.
1544 void invalidateCostModelingDecisions() {
1545 WideningDecisions.clear();
1546 Uniforms.clear();
1547 Scalars.clear();
1548 }
1549
1550   /// Convenience function that returns the value of vscale_range if
1551   /// vscale_range.min == vscale_range.max, and otherwise returns the value
1552 /// returned by the corresponding TLI method.
1553 Optional<unsigned> getVScaleForTuning() const;
1554
1555private:
1556 unsigned NumPredStores = 0;
1557
1558 /// \return An upper bound for the vectorization factors for both
1559 /// fixed and scalable vectorization, where the minimum-known number of
1560 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1561 /// disabled or unsupported, then the scalable part will be equal to
1562 /// ElementCount::getScalable(0).
1563 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1564 ElementCount UserVF,
1565 bool FoldTailByMasking);
1566
1567   /// \return the maximized element count based on the target's vector
1568 /// registers and the loop trip-count, but limited to a maximum safe VF.
1569 /// This is a helper function of computeFeasibleMaxVF.
1570 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1571 unsigned SmallestType,
1572 unsigned WidestType,
1573 ElementCount MaxSafeVF,
1574 bool FoldTailByMasking);
1575
1576 /// \return the maximum legal scalable VF, based on the safe max number
1577 /// of elements.
1578 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1579
1580 /// The vectorization cost is a combination of the cost itself and a boolean
1581 /// indicating whether any of the contributing operations will actually
1582 /// operate on vector values after type legalization in the backend. If this
1583 /// latter value is false, then all operations will be scalarized (i.e. no
1584 /// vectorization has actually taken place).
1585 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1586
1587 /// Returns the expected execution cost. The unit of the cost does
1588 /// not matter because we use the 'cost' units to compare different
1589 /// vector widths. The cost that is returned is *not* normalized by
1590 /// the factor width. If \p Invalid is not nullptr, this function
1591 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1592 /// each instruction that has an Invalid cost for the given VF.
1593 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1594 VectorizationCostTy
1595 expectedCost(ElementCount VF,
1596 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1597
1598 /// Returns the execution time cost of an instruction for a given vector
1599 /// width. Vector width of one means scalar.
1600 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1601
1602 /// The cost-computation logic from getInstructionCost which provides
1603 /// the vector type as an output parameter.
1604 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1605 Type *&VectorTy);
1606
1607 /// Return the cost of instructions in an inloop reduction pattern, if I is
1608 /// part of that pattern.
1609 Optional<InstructionCost>
1610 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1611 TTI::TargetCostKind CostKind);
1612
1613 /// Calculate vectorization cost of memory instruction \p I.
1614 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1615
1616 /// The cost computation for scalarized memory instruction.
1617 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1618
1619 /// The cost computation for interleaving group of memory instructions.
1620 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1621
1622 /// The cost computation for Gather/Scatter instruction.
1623 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1624
1625 /// The cost computation for widening instruction \p I with consecutive
1626 /// memory access.
1627 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1628
1629 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1630 /// Load: scalar load + broadcast.
1631 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1632 /// element)
1633 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1634
1635 /// Estimate the overhead of scalarizing an instruction. This is a
1636 /// convenience wrapper for the type-based getScalarizationOverhead API.
1637 InstructionCost getScalarizationOverhead(Instruction *I,
1638 ElementCount VF) const;
1639
1640 /// Returns true if an artificially high cost for emulated masked memrefs
1641 /// should be used.
1642 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1643
1644 /// Map of scalar integer values to the smallest bitwidth they can be legally
1645 /// represented as. The vector equivalents of these values should be truncated
1646 /// to this type.
1647 MapVector<Instruction *, uint64_t> MinBWs;
1648
1649 /// A type representing the costs for instructions if they were to be
1650 /// scalarized rather than vectorized. The entries are Instruction-Cost
1651 /// pairs.
1652 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1653
1654   /// A set containing all BasicBlocks that are known to be present after
1655   /// vectorization as predicated blocks.
1656 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1657 PredicatedBBsAfterVectorization;
1658
1659 /// Records whether it is allowed to have the original scalar loop execute at
1660 /// least once. This may be needed as a fallback loop in case runtime
1661 /// aliasing/dependence checks fail, or to handle the tail/remainder
1662 /// iterations when the trip count is unknown or doesn't divide by the VF,
1663 /// or as a peel-loop to handle gaps in interleave-groups.
1664 /// Under optsize and when the trip count is very small we don't allow any
1665 /// iterations to execute in the scalar loop.
1666 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1667
1668 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1669 bool FoldTailByMasking = false;
1670
1671 /// A map holding scalar costs for different vectorization factors. The
1672 /// presence of a cost for an instruction in the mapping indicates that the
1673 /// instruction will be scalarized when vectorizing with the associated
1674 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1675 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1676
1677 /// Holds the instructions known to be uniform after vectorization.
1678 /// The data is collected per VF.
1679 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1680
1681 /// Holds the instructions known to be scalar after vectorization.
1682 /// The data is collected per VF.
1683 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1684
1685 /// Holds the instructions (address computations) that are forced to be
1686 /// scalarized.
1687 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1688
1689 /// PHINodes of the reductions that should be expanded in-loop along with
1690 /// their associated chains of reduction operations, in program order from top
1691   /// (PHI) to bottom.
1692 ReductionChainMap InLoopReductionChains;
1693
1694 /// A Map of inloop reduction operations and their immediate chain operand.
1695 /// FIXME: This can be removed once reductions can be costed correctly in
1696 /// vplan. This was added to allow quick lookup to the inloop operations,
1697 /// without having to loop through InLoopReductionChains.
1698 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1699
1700 /// Returns the expected difference in cost from scalarizing the expression
1701 /// feeding a predicated instruction \p PredInst. The instructions to
1702 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1703 /// non-negative return value implies the expression will be scalarized.
1704 /// Currently, only single-use chains are considered for scalarization.
1705 InstructionCost computePredInstDiscount(Instruction *PredInst,
1706 ScalarCostsTy &ScalarCosts,
1707 ElementCount VF);
1708
1709 /// Collect the instructions that are uniform after vectorization. An
1710 /// instruction is uniform if we represent it with a single scalar value in
1711 /// the vectorized loop corresponding to each vector iteration. Examples of
1712 /// uniform instructions include pointer operands of consecutive or
1713 /// interleaved memory accesses. Note that although uniformity implies an
1714 /// instruction will be scalar, the reverse is not true. In general, a
1715 /// scalarized instruction will be represented by VF scalar values in the
1716 /// vectorized loop, each corresponding to an iteration of the original
1717 /// scalar loop.
1718 void collectLoopUniforms(ElementCount VF);
1719
1720 /// Collect the instructions that are scalar after vectorization. An
1721 /// instruction is scalar if it is known to be uniform or will be scalarized
1722 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1723 /// to the list if they are used by a load/store instruction that is marked as
1724 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1725 /// VF values in the vectorized loop, each corresponding to an iteration of
1726 /// the original scalar loop.
1727 void collectLoopScalars(ElementCount VF);
1728
1729 /// Keeps cost model vectorization decision and cost for instructions.
1730 /// Right now it is used for memory instructions only.
1731 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1732 std::pair<InstWidening, InstructionCost>>;
1733
1734 DecisionList WideningDecisions;
1735
1736 /// Returns true if \p V is expected to be vectorized and it needs to be
1737 /// extracted.
1738 bool needsExtract(Value *V, ElementCount VF) const {
1739 Instruction *I = dyn_cast<Instruction>(V);
1740 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1741 TheLoop->isLoopInvariant(I))
1742 return false;
1743
1744 // Assume we can vectorize V (and hence we need extraction) if the
1745 // scalars are not computed yet. This can happen, because it is called
1746 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1747 // the scalars are collected. That should be a safe assumption in most
1748 // cases, because we check if the operands have vectorizable types
1749 // beforehand in LoopVectorizationLegality.
1750 return Scalars.find(VF) == Scalars.end() ||
1751 !isScalarAfterVectorization(I, VF);
1752 };
1753
1754 /// Returns a range containing only operands needing to be extracted.
1755 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1756 ElementCount VF) const {
1757 return SmallVector<Value *, 4>(make_filter_range(
1758 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1759 }
1760
1761 /// Determines if we have the infrastructure to vectorize loop \p L and its
1762 /// epilogue, assuming the main loop is vectorized by \p VF.
1763 bool isCandidateForEpilogueVectorization(const Loop &L,
1764 const ElementCount VF) const;
1765
1766 /// Returns true if epilogue vectorization is considered profitable, and
1767 /// false otherwise.
1768 /// \p VF is the vectorization factor chosen for the original loop.
1769 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1770
1771public:
1772 /// The loop that we evaluate.
1773 Loop *TheLoop;
1774
1775 /// Predicated scalar evolution analysis.
1776 PredicatedScalarEvolution &PSE;
1777
1778 /// Loop Info analysis.
1779 LoopInfo *LI;
1780
1781 /// Vectorization legality.
1782 LoopVectorizationLegality *Legal;
1783
1784 /// Vector target information.
1785 const TargetTransformInfo &TTI;
1786
1787 /// Target Library Info.
1788 const TargetLibraryInfo *TLI;
1789
1790 /// Demanded bits analysis.
1791 DemandedBits *DB;
1792
1793 /// Assumption cache.
1794 AssumptionCache *AC;
1795
1796 /// Interface to emit optimization remarks.
1797 OptimizationRemarkEmitter *ORE;
1798
1799 const Function *TheFunction;
1800
1801 /// Loop Vectorize Hint.
1802 const LoopVectorizeHints *Hints;
1803
1804 /// The interleave access information contains groups of interleaved accesses
1805   /// with the same stride that are close to each other.
1806 InterleavedAccessInfo &InterleaveInfo;
1807
1808 /// Values to ignore in the cost model.
1809 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1810
1811 /// Values to ignore in the cost model when VF > 1.
1812 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1813
1814 /// All element types found in the loop.
1815 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1816
1817 /// Profitable vector factors.
1818 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1819};
1820} // end namespace llvm
1821
1822/// Helper struct to manage generating runtime checks for vectorization.
1823///
1824/// The runtime checks are created up-front in temporary blocks to allow better
1825/// estimation of the cost, and are un-linked from the existing IR. After deciding to
1826/// vectorize, the checks are moved back. If deciding not to vectorize, the
1827/// temporary blocks are completely removed.
1828class GeneratedRTChecks {
1829 /// Basic block which contains the generated SCEV checks, if any.
1830 BasicBlock *SCEVCheckBlock = nullptr;
1831
1832 /// The value representing the result of the generated SCEV checks. If it is
1833 /// nullptr, either no SCEV checks have been generated or they have been used.
1834 Value *SCEVCheckCond = nullptr;
1835
1836 /// Basic block which contains the generated memory runtime checks, if any.
1837 BasicBlock *MemCheckBlock = nullptr;
1838
1839 /// The value representing the result of the generated memory runtime checks.
1840 /// If it is nullptr, either no memory runtime checks have been generated or
1841 /// they have been used.
1842 Value *MemRuntimeCheckCond = nullptr;
1843
1844 DominatorTree *DT;
1845 LoopInfo *LI;
1846 TargetTransformInfo *TTI;
1847
1848 SCEVExpander SCEVExp;
1849 SCEVExpander MemCheckExp;
1850
1851 bool CostTooHigh = false;
1852
1853public:
1854 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1855 TargetTransformInfo *TTI, const DataLayout &DL)
1856 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1857 MemCheckExp(SE, DL, "scev.check") {}
1858
1859 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1860 /// accurately estimate the cost of the runtime checks. The blocks are
1861   /// un-linked from the IR and are added back during vector code generation. If
1862 /// there is no vector code generation, the check blocks are removed
1863 /// completely.
1864 void Create(Loop *L, const LoopAccessInfo &LAI,
1865 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1866
1867 // Hard cutoff to limit compile-time increase in case a very large number of
1868 // runtime checks needs to be generated.
1869 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1870 // profile info.
1871 CostTooHigh =
1872 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1873 if (CostTooHigh)
1874 return;
1875
1876 BasicBlock *LoopHeader = L->getHeader();
1877 BasicBlock *Preheader = L->getLoopPreheader();
1878
1879 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1880 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1881 // may be used by SCEVExpander. The blocks will be un-linked from their
1882 // predecessors and removed from LI & DT at the end of the function.
1883 if (!UnionPred.isAlwaysTrue()) {
1884 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1885 nullptr, "vector.scevcheck");
1886
1887 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1888 &UnionPred, SCEVCheckBlock->getTerminator());
1889 }
1890
1891 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1892 if (RtPtrChecking.Need) {
1893 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1894 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1895 "vector.memcheck");
1896
1897 auto DiffChecks = RtPtrChecking.getDiffChecks();
1898 if (DiffChecks) {
1899 Value *RuntimeVF = nullptr;
1900 MemRuntimeCheckCond = addDiffRuntimeChecks(
1901 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1902 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1903 if (!RuntimeVF)
1904 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1905 return RuntimeVF;
1906 },
1907 IC);
1908 } else {
1909 MemRuntimeCheckCond =
1910 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1911 RtPtrChecking.getChecks(), MemCheckExp);
1912 }
1913     assert(MemRuntimeCheckCond &&
1914            "no RT checks generated although RtPtrChecking "
1915            "claimed checks are required");
1916 }
1917
1918 if (!MemCheckBlock && !SCEVCheckBlock)
1919 return;
1920
1921 // Unhook the temporary block with the checks, update various places
1922 // accordingly.
1923 if (SCEVCheckBlock)
1924 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1925 if (MemCheckBlock)
1926 MemCheckBlock->replaceAllUsesWith(Preheader);
1927
1928 if (SCEVCheckBlock) {
1929 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1930 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1931 Preheader->getTerminator()->eraseFromParent();
1932 }
1933 if (MemCheckBlock) {
1934 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1935 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1936 Preheader->getTerminator()->eraseFromParent();
1937 }
1938
1939 DT->changeImmediateDominator(LoopHeader, Preheader);
1940 if (MemCheckBlock) {
1941 DT->eraseNode(MemCheckBlock);
1942 LI->removeBlock(MemCheckBlock);
1943 }
1944 if (SCEVCheckBlock) {
1945 DT->eraseNode(SCEVCheckBlock);
1946 LI->removeBlock(SCEVCheckBlock);
1947 }
1948 }
1949
1950 InstructionCost getCost() {
1951 if (SCEVCheckBlock || MemCheckBlock)
1952       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1953
1954 if (CostTooHigh) {
1955 InstructionCost Cost;
1956 Cost.setInvalid();
1957       LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1958 return Cost;
1959 }
1960
1961 InstructionCost RTCheckCost = 0;
1962 if (SCEVCheckBlock)
1963 for (Instruction &I : *SCEVCheckBlock) {
1964 if (SCEVCheckBlock->getTerminator() == &I)
1965 continue;
1966 InstructionCost C =
1967 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1968         LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1969 RTCheckCost += C;
1970 }
1971 if (MemCheckBlock)
1972 for (Instruction &I : *MemCheckBlock) {
1973 if (MemCheckBlock->getTerminator() == &I)
1974 continue;
1975 InstructionCost C =
1976 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1977         LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1978 RTCheckCost += C;
1979 }
1980
1981 if (SCEVCheckBlock || MemCheckBlock)
1982       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1983                         << "\n");
1984
1985 return RTCheckCost;
1986 }
1987
1988 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1989 /// unused.
1990 ~GeneratedRTChecks() {
1991 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1992 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1993 if (!SCEVCheckCond)
1994 SCEVCleaner.markResultUsed();
1995
1996 if (!MemRuntimeCheckCond)
1997 MemCheckCleaner.markResultUsed();
1998
1999 if (MemRuntimeCheckCond) {
2000 auto &SE = *MemCheckExp.getSE();
2001 // Memory runtime check generation creates compares that use expanded
2002 // values. Remove them before running the SCEVExpanderCleaners.
2003 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2004 if (MemCheckExp.isInsertedInstruction(&I))
2005 continue;
2006 SE.forgetValue(&I);
2007 I.eraseFromParent();
2008 }
2009 }
2010 MemCheckCleaner.cleanup();
2011 SCEVCleaner.cleanup();
2012
2013 if (SCEVCheckCond)
2014 SCEVCheckBlock->eraseFromParent();
2015 if (MemRuntimeCheckCond)
2016 MemCheckBlock->eraseFromParent();
2017 }
2018
2019 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2020 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2021 /// depending on the generated condition.
2022 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2023 BasicBlock *LoopVectorPreHeader,
2024 BasicBlock *LoopExitBlock) {
2025 if (!SCEVCheckCond)
2026 return nullptr;
2027
2028 Value *Cond = SCEVCheckCond;
2029 // Mark the check as used, to prevent it from being removed during cleanup.
2030 SCEVCheckCond = nullptr;
2031 if (auto *C = dyn_cast<ConstantInt>(Cond))
2032 if (C->isZero())
2033 return nullptr;
2034
2035 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2036
2037 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2038 // Create new preheader for vector loop.
2039 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2040 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2041
2042 SCEVCheckBlock->getTerminator()->eraseFromParent();
2043 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2044 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2045 SCEVCheckBlock);
2046
2047 DT->addNewBlock(SCEVCheckBlock, Pred);
2048 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2049
2050 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2051 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2052 return SCEVCheckBlock;
2053 }
2054
2055 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2056 /// the branches to branch to the vector preheader or \p Bypass, depending on
2057 /// the generated condition.
2058 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2059 BasicBlock *LoopVectorPreHeader) {
2060 // Check if we generated code that checks in runtime if arrays overlap.
2061 if (!MemRuntimeCheckCond)
2062 return nullptr;
2063
2064 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2065 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2066 MemCheckBlock);
2067
2068 DT->addNewBlock(MemCheckBlock, Pred);
2069 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2070 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2071
2072 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2073 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2074
2075 ReplaceInstWithInst(
2076 MemCheckBlock->getTerminator(),
2077 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2078 MemCheckBlock->getTerminator()->setDebugLoc(
2079 Pred->getTerminator()->getDebugLoc());
2080
2081 // Mark the check as used, to prevent it from being removed during cleanup.
2082 MemRuntimeCheckCond = nullptr;
2083 return MemCheckBlock;
2084 }
2085};
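// A hedged sketch (not part of this file) of the intended lifecycle of
// GeneratedRTChecks in the vectorizer: checks are created up front so their
// cost can feed the vectorization decision, and are only wired into the CFG
// if vectorization actually happens.
//
//   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
//   Checks.Create(L, LAI, UnionPred, VF, IC);   // build temporary check blocks
//   InstructionCost RTCost = Checks.getCost();  // fold into the cost decision
//   // If we decide to vectorize:
//   Checks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
//   Checks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
//   // Otherwise the destructor removes the unused temporary blocks.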
2086
2087// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2088// vectorization. The loop needs to be annotated with #pragma omp simd
2089// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2090// vector length information is not provided, vectorization is not considered
2091// explicit. Interleave hints are not allowed either. These limitations will be
2092// relaxed in the future.
2093// Please note that we are currently forced to abuse the pragma 'clang
2094// vectorize' semantics. This pragma provides *auto-vectorization hints*
2095// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2096// provides *explicit vectorization hints* (LV can bypass legal checks and
2097// assume that vectorization is legal). However, both hints are implemented
2098// using the same metadata (llvm.loop.vectorize, processed by
2099// LoopVectorizeHints). This will be fixed in the future when the native IR
2100// representation for pragma 'omp simd' is introduced.
2101static bool isExplicitVecOuterLoop(Loop *OuterLp,
2102 OptimizationRemarkEmitter *ORE) {
2103   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2104 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2105
2106 // Only outer loops with an explicit vectorization hint are supported.
2107 // Unannotated outer loops are ignored.
2108 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2109 return false;
2110
2111 Function *Fn = OuterLp->getHeader()->getParent();
2112 if (!Hints.allowVectorization(Fn, OuterLp,
2113 true /*VectorizeOnlyWhenForced*/)) {
2114     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2115 return false;
2116 }
2117
2118 if (Hints.getInterleave() > 1) {
2119 // TODO: Interleave support is future work.
2120     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2121                          "outer loops.\n");
2122 Hints.emitRemarkWithHints();
2123 return false;
2124 }
2125
2126 return true;
2127}
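// For illustration (not part of this file), an outer loop accepted by the
// check above must carry an explicit vectorization hint with a width, e.g.:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int I = 0; I < N; ++I)     // annotated outer loop
//     for (int J = 0; J < M; ++J)
//       A[I][J] += B[I][J];
//
// A hint without a vector length, or one that also requests interleaving, is
// rejected above.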
2128
2129static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2130 OptimizationRemarkEmitter *ORE,
2131 SmallVectorImpl<Loop *> &V) {
2132 // Collect inner loops and outer loops without irreducible control flow. For
2133 // now, only collect outer loops that have explicit vectorization hints. If we
2134 // are stress testing the VPlan H-CFG construction, we collect the outermost
2135 // loop of every loop nest.
2136 if (L.isInnermost() || VPlanBuildStressTest ||
2137 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2138 LoopBlocksRPO RPOT(&L);
2139 RPOT.perform(LI);
2140 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2141 V.push_back(&L);
2142 // TODO: Collect inner loops inside marked outer loops in case
2143 // vectorization fails for the outer loop. Do not invoke
2144 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2145 // already known to be reducible. We can use an inherited attribute for
2146 // that.
2147 return;
2148 }
2149 }
2150 for (Loop *InnerL : L)
2151 collectSupportedLoops(*InnerL, LI, ORE, V);
2152}
2153
2154namespace {
2155
2156/// The LoopVectorize Pass.
2157struct LoopVectorize : public FunctionPass {
2158 /// Pass identification, replacement for typeid
2159 static char ID;
2160
2161 LoopVectorizePass Impl;
2162
2163 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2164 bool VectorizeOnlyWhenForced = false)
2165 : FunctionPass(ID),
2166 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2167 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2168 }
2169
2170 bool runOnFunction(Function &F) override {
2171 if (skipFunction(F))
2172 return false;
2173
2174 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2175 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2176 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2177 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2178 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2179 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2180 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2181 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2182 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2183 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2184 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2185 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2186 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2187
2188 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2189 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2190
2191 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2192 GetLAA, *ORE, PSI).MadeAnyChange;
2193 }
2194
2195 void getAnalysisUsage(AnalysisUsage &AU) const override {
2196 AU.addRequired<AssumptionCacheTracker>();
2197 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2198 AU.addRequired<DominatorTreeWrapperPass>();
2199 AU.addRequired<LoopInfoWrapperPass>();
2200 AU.addRequired<ScalarEvolutionWrapperPass>();
2201 AU.addRequired<TargetTransformInfoWrapperPass>();
2202 AU.addRequired<AAResultsWrapperPass>();
2203 AU.addRequired<LoopAccessLegacyAnalysis>();
2204 AU.addRequired<DemandedBitsWrapperPass>();
2205 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2206 AU.addRequired<InjectTLIMappingsLegacy>();
2207
2208 // We currently do not preserve loopinfo/dominator analyses with outer loop
2209 // vectorization. Until this is addressed, mark these analyses as preserved
2210 // only for non-VPlan-native path.
2211 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2212 if (!EnableVPlanNativePath) {
2213 AU.addPreserved<LoopInfoWrapperPass>();
2214 AU.addPreserved<DominatorTreeWrapperPass>();
2215 }
2216
2217 AU.addPreserved<BasicAAWrapperPass>();
2218 AU.addPreserved<GlobalsAAWrapperPass>();
2219 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2220 }
2221};
2222
2223} // end anonymous namespace
2224
2225//===----------------------------------------------------------------------===//
2226// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2227// LoopVectorizationCostModel and LoopVectorizationPlanner.
2228//===----------------------------------------------------------------------===//
2229
2230Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2231 // We need to place the broadcast of invariant variables outside the loop,
2232   // but only if it's proven safe to do so. Otherwise, the broadcast will be
2233   // inside the vector loop body.
2234 Instruction *Instr = dyn_cast<Instruction>(V);
2235 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2236 (!Instr ||
2237 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2238 // Place the code for broadcasting invariant variables in the new preheader.
2239 IRBuilder<>::InsertPointGuard Guard(Builder);
2240 if (SafeToHoist)
2241 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2242
2243 // Broadcast the scalar into all locations in the vector.
2244 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2245
2246 return Shuf;
2247}
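// A small worked example (not from this file) of what the broadcast above
// produces: a loop-invariant scalar X is splatted into every lane, so with
// VF = 4 the vector body operates on <X, X, X, X>.
//
//   int X = ...;                   // loop-invariant scalar
//   for (int I = 0; I < N; ++I)
//     A[I] = B[I] + X;             // X is broadcast once (in the preheader if
//                                  // safe) and reused by every vector iteration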
2248
2249/// This function adds
2250/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2251/// to each vector element of Val. The sequence starts at StartIdx.
2252/// \p BinOp is relevant for FP induction variables.
2253static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2254 Instruction::BinaryOps BinOp, ElementCount VF,
2255 IRBuilderBase &Builder) {
2256   assert(VF.isVector() && "only vector VFs are supported");
2257
2258 // Create and check the types.
2259 auto *ValVTy = cast<VectorType>(Val->getType());
2260 ElementCount VLen = ValVTy->getElementCount();
2261
2262 Type *STy = Val->getType()->getScalarType();
2263   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2264          "Induction Step must be an integer or FP");
2265 assert(Step->getType() == STy && "Step has wrong type");
2266
2267 SmallVector<Constant *, 8> Indices;
2268
2269 // Create a vector of consecutive numbers from zero to VF.
2270 VectorType *InitVecValVTy = ValVTy;
2271 if (STy->isFloatingPointTy()) {
2272 Type *InitVecValSTy =
2273 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2274 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2275 }
2276 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2277
2278 // Splat the StartIdx
2279 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2280
2281 if (STy->isIntegerTy()) {
2282 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2283 Step = Builder.CreateVectorSplat(VLen, Step);
2284 assert(Step->getType() == Val->getType() && "Invalid step vec");
2285 // FIXME: The newly created binary instructions should contain nsw/nuw
2286 // flags, which can be found from the original scalar operations.
2287 Step = Builder.CreateMul(InitVec, Step);
2288 return Builder.CreateAdd(Val, Step, "induction");
2289 }
2290
2291 // Floating point induction.
2292 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2293 "Binary Opcode should be specified for FP induction");
2294 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2295 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2296
2297 Step = Builder.CreateVectorSplat(VLen, Step);
2298 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2299 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2300}
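
A minimal scalar sketch (plain C++, assuming the integer-induction case) of the vector this function builds: lane i of the result equals Val[i] + (StartIdx + i) * Step, matching the (StartIdx * Step, (StartIdx + 1) * Step, ...) pattern described in the comment above.

#include <cstdio>
#include <vector>

// Scalar model of getStepVector for integer inductions (illustrative only):
// result[i] = Val[i] + (StartIdx + i) * Step.
std::vector<long> stepVector(const std::vector<long> &Val, long StartIdx,
                             long Step) {
  std::vector<long> R(Val.size());
  for (size_t I = 0; I < Val.size(); ++I)
    R[I] = Val[I] + (StartIdx + (long)I) * Step;
  return R;
}

int main() {
  // Val = <0,0,0,0>, StartIdx = 4, Step = 2  ->  <8, 10, 12, 14>.
  for (long V : stepVector({0, 0, 0, 0}, 4, 2))
    std::printf("%ld ", V);
}
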
2301
2302/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2303/// variable on which to base the steps, \p Step is the size of the step.
2304static void buildScalarSteps(Value *ScalarIV, Value *Step,
2305 const InductionDescriptor &ID, VPValue *Def,
2306 VPTransformState &State) {
2307 IRBuilderBase &Builder = State.Builder;
2308 // We shouldn't have to build scalar steps if we aren't vectorizing.
2309 assert(State.VF.isVector() && "VF should be greater than one");
2310 // Get the value type and ensure it and the step have the same integer type.
2311 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2312 assert(ScalarIVTy == Step->getType() &&
2313 "Val and Step should have the same type");
2314
2315 // We build scalar steps for both integer and floating-point induction
2316 // variables. Here, we determine the kind of arithmetic we will perform.
2317 Instruction::BinaryOps AddOp;
2318 Instruction::BinaryOps MulOp;
2319 if (ScalarIVTy->isIntegerTy()) {
2320 AddOp = Instruction::Add;
2321 MulOp = Instruction::Mul;
2322 } else {
2323 AddOp = ID.getInductionOpcode();
2324 MulOp = Instruction::FMul;
2325 }
2326
2327 // Determine the number of scalars we need to generate for each unroll
2328 // iteration.
2329 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2330 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2331 // Compute the scalar steps and save the results in State.
2332 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2333 ScalarIVTy->getScalarSizeInBits());
2334 Type *VecIVTy = nullptr;
2335 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2336 if (!FirstLaneOnly && State.VF.isScalable()) {
2337 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2338 UnitStepVec =
2339 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2340 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2341 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2342 }
2343
2344 for (unsigned Part = 0; Part < State.UF; ++Part) {
2345 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2346
2347 if (!FirstLaneOnly && State.VF.isScalable()) {
2348 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2349 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2350 if (ScalarIVTy->isFloatingPointTy())
2351 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2352 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2353 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2354 State.set(Def, Add, Part);
2355 // It's useful to record the lane values too for the known minimum number
2356 // of elements so we do those below. This improves the code quality when
2357 // trying to extract the first element, for example.
2358 }
2359
2360 if (ScalarIVTy->isFloatingPointTy())
2361 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2362
2363 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2364 Value *StartIdx = Builder.CreateBinOp(
2365 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2366 // The step returned by `createStepForVF` is a runtime-evaluated value
2367 // when VF is scalable. Otherwise, it should be folded into a Constant.
2368 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2369 "Expected StartIdx to be folded to a constant when VF is not "
2370 "scalable");
2371 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2372 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2373 State.set(Def, Add, VPIteration(Part, Lane));
2374 }
2375 }
2376}
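
A standalone sketch (plain C++, integer case, fixed VF) of the per-lane values produced above: for unroll part P, the starting offset is P * VF, and lane L of that part gets ScalarIV + (P * VF + L) * Step. The UF/VF/ScalarIV/Step names are stand-ins for the corresponding state in the function.

#include <cstdio>

// Scalar model of buildScalarSteps for a fixed VF (illustrative only).
int main() {
  const unsigned UF = 2, VF = 4;
  const long ScalarIV = 100, Step = 3;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      long StartIdx = (long)(Part * VF + Lane); // createStepForVF + lane offset
      std::printf("part %u lane %u -> %ld\n", Part, Lane,
                  ScalarIV + StartIdx * Step);
    }
}
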
2377
2378// Generate code for the induction step. Note that induction steps are
2379 // required to be loop-invariant.
2380static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2381 Instruction *InsertBefore,
2382 Loop *OrigLoop = nullptr) {
2383 const DataLayout &DL = SE.getDataLayout();
2384 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2385 "Induction step should be loop invariant");
2386 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2387 return E->getValue();
2388
2389 SCEVExpander Exp(SE, DL, "induction");
2390 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2391}
2392
2393/// Compute the transformed value of Index at offset StartValue using step
2394/// StepValue.
2395/// For integer induction, returns StartValue + Index * StepValue.
2396/// For pointer induction, returns StartValue[Index * StepValue].
2397/// FIXME: The newly created binary instructions should contain nsw/nuw
2398/// flags, which can be found from the original scalar operations.
2399static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2400 Value *StartValue, Value *Step,
2401 const InductionDescriptor &ID) {
2402 assert(Index->getType()->getScalarType() == Step->getType() &&
2403 "Index scalar type does not match StepValue type");
2404
2405 // Note: the IR at this point is broken. We cannot use SE to create any new
2406 // SCEV and then expand it, hoping that SCEV's simplification will give us
2407 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2408 // lead to various SCEV crashes. So all we can do is to use builder and rely
2409 // on InstCombine for future simplifications. Here we handle some trivial
2410 // cases only.
2411 auto CreateAdd = [&B](Value *X, Value *Y) {
2412 assert(X->getType() == Y->getType() && "Types don't match!");
2413 if (auto *CX = dyn_cast<ConstantInt>(X))
2414 if (CX->isZero())
2415 return Y;
2416 if (auto *CY = dyn_cast<ConstantInt>(Y))
2417 if (CY->isZero())
2418 return X;
2419 return B.CreateAdd(X, Y);
2420 };
2421
2422 // We allow X to be a vector type, in which case Y will potentially be
2423 // splatted into a vector with the same element count.
2424 auto CreateMul = [&B](Value *X, Value *Y) {
2425 assert(X->getType()->getScalarType() == Y->getType() &&
2426 "Types don't match!");
2427 if (auto *CX = dyn_cast<ConstantInt>(X))
2428 if (CX->isOne())
2429 return Y;
2430 if (auto *CY = dyn_cast<ConstantInt>(Y))
2431 if (CY->isOne())
2432 return X;
2433 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2434 if (XVTy && !isa<VectorType>(Y->getType()))
2435 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2436 return B.CreateMul(X, Y);
2437 };
2438
2439 switch (ID.getKind()) {
2440 case InductionDescriptor::IK_IntInduction: {
2441 assert(!isa<VectorType>(Index->getType()) &&
2442 "Vector indices not supported for integer inductions yet");
2443 assert(Index->getType() == StartValue->getType() &&
2444 "Index type does not match StartValue type");
2445 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2446 return B.CreateSub(StartValue, Index);
2447 auto *Offset = CreateMul(Index, Step);
2448 return CreateAdd(StartValue, Offset);
2449 }
2450 case InductionDescriptor::IK_PtrInduction: {
2451 assert(isa<Constant>(Step) &&
2452 "Expected constant step for pointer induction");
2453 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2454 }
2455 case InductionDescriptor::IK_FpInduction: {
2456 assert(!isa<VectorType>(Index->getType()) &&
2457 "Vector indices not supported for FP inductions yet");
2458 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2459 auto InductionBinOp = ID.getInductionBinOp();
2460 assert(InductionBinOp &&
2461 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2462 InductionBinOp->getOpcode() == Instruction::FSub) &&
2463 "Original bin op should be defined for FP induction");
2464
2465 Value *MulExp = B.CreateFMul(Step, Index);
2466 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2467 "induction");
2468 }
2469 case InductionDescriptor::IK_NoInduction:
2470 return nullptr;
2471 }
2472 llvm_unreachable("invalid enum");
2473}
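
A standalone sketch (plain C++ values instead of LLVM IR; names are illustrative only) of the three induction kinds handled above: integer inductions compute StartValue + Index * Step, pointer inductions compute the address of StartValue[Index * Step], and FP inductions apply the original FAdd/FSub to StartValue and Index * Step.

#include <cstdio>

long transformedIntIndex(long Start, long Index, long Step) {
  return Start + Index * Step;            // IK_IntInduction
}
const int *transformedPtrIndex(const int *Start, long Index, long Step) {
  return Start + Index * Step;            // IK_PtrInduction: &Start[Index * Step]
}
double transformedFpIndex(double Start, double Index, double Step, bool IsFAdd) {
  double Mul = Step * Index;              // IK_FpInduction
  return IsFAdd ? Start + Mul : Start - Mul;
}

int main() {
  int A[16] = {};
  std::printf("%ld %ld %f\n", transformedIntIndex(10, 3, 2),
              (long)(transformedPtrIndex(A, 3, 2) - A), // element offset 6
              transformedFpIndex(1.5, 3.0, 0.5, true));
}
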
2474
2475void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2476 const VPIteration &Instance,
2477 VPTransformState &State) {
2478 Value *ScalarInst = State.get(Def, Instance);
2479 Value *VectorValue = State.get(Def, Instance.Part);
2480 VectorValue = Builder.CreateInsertElement(
2481 VectorValue, ScalarInst,
2482 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2483 State.set(Def, VectorValue, Instance.Part);
2484}
2485
2486// Return whether we allow using masked interleave-groups (for dealing with
2487// strided loads/stores that reside in predicated blocks, or for dealing
2488// with gaps).
2489static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2490 // If an override option has been passed in for interleaved accesses, use it.
2491 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2492 return EnableMaskedInterleavedMemAccesses;
2493
2494 return TTI.enableMaskedInterleavedAccessVectorization();
2495}
2496
2497// Try to vectorize the interleave group that \p Instr belongs to.
2498//
2499// E.g. Translate following interleaved load group (factor = 3):
2500// for (i = 0; i < N; i+=3) {
2501// R = Pic[i]; // Member of index 0
2502// G = Pic[i+1]; // Member of index 1
2503// B = Pic[i+2]; // Member of index 2
2504// ... // do something to R, G, B
2505// }
2506// To:
2507// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2508// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2509// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2510// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2511//
2512// Or translate following interleaved store group (factor = 3):
2513// for (i = 0; i < N; i+=3) {
2514// ... do something to R, G, B
2515// Pic[i] = R; // Member of index 0
2516// Pic[i+1] = G; // Member of index 1
2517// Pic[i+2] = B; // Member of index 2
2518// }
2519// To:
2520// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2521// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2522// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2523// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2524// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2525void InnerLoopVectorizer::vectorizeInterleaveGroup(
2526 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2527 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2528 VPValue *BlockInMask) {
2529 Instruction *Instr = Group->getInsertPos();
2530 const DataLayout &DL = Instr->getModule()->getDataLayout();
2531
2532 // Prepare for the vector type of the interleaved load/store.
2533 Type *ScalarTy = getLoadStoreType(Instr);
2534 unsigned InterleaveFactor = Group->getFactor();
2535 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2536 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2537
2538 // Prepare for the new pointers.
2539 SmallVector<Value *, 2> AddrParts;
2540 unsigned Index = Group->getIndex(Instr);
2541
2542 // TODO: extend the masked interleaved-group support to reversed access.
2543 assert((!BlockInMask || !Group->isReverse()) &&
2544 "Reversed masked interleave-group not supported.");
2545
2546 // If the group is reverse, adjust the index to refer to the last vector lane
2547 // instead of the first. We adjust the index from the first vector lane,
2548 // rather than directly getting the pointer for lane VF - 1, because the
2549 // pointer operand of the interleaved access is supposed to be uniform. For
2550 // uniform instructions, we're only required to generate a value for the
2551 // first vector lane in each unroll iteration.
2552 if (Group->isReverse())
2553 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2554
2555 for (unsigned Part = 0; Part < UF; Part++) {
2556 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2557 State.setDebugLocFromInst(AddrPart);
2558
2559 // Notice that the current instruction could be at any member index, so the
2560 // address needs to be adjusted to the member of index 0.
2561 //
2562 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2563 // b = A[i]; // Member of index 0
2564 // The current pointer points to A[i+1]; adjust it to A[i].
2565 //
2566 // E.g. A[i+1] = a; // Member of index 1
2567 // A[i] = b; // Member of index 0
2568 // A[i+2] = c; // Member of index 2 (Current instruction)
2569 // The current pointer points to A[i+2]; adjust it to A[i].
2570
2571 bool InBounds = false;
2572 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2573 InBounds = gep->isInBounds();
2574 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2575 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2576
2577 // Cast to the vector pointer type.
2578 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2579 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2580 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2581 }
2582
2583 State.setDebugLocFromInst(Instr);
2584 Value *PoisonVec = PoisonValue::get(VecTy);
2585
2586 Value *MaskForGaps = nullptr;
2587 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2588 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2589 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2590 }
2591
2592 // Vectorize the interleaved load group.
2593 if (isa<LoadInst>(Instr)) {
2594 // For each unroll part, create a wide load for the group.
2595 SmallVector<Value *, 2> NewLoads;
2596 for (unsigned Part = 0; Part < UF; Part++) {
2597 Instruction *NewLoad;
2598 if (BlockInMask || MaskForGaps) {
2599 assert(useMaskedInterleavedAccesses(*TTI) &&
2600 "masked interleaved groups are not allowed.");
2601 Value *GroupMask = MaskForGaps;
2602 if (BlockInMask) {
2603 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2604 Value *ShuffledMask = Builder.CreateShuffleVector(
2605 BlockInMaskPart,
2606 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2607 "interleaved.mask");
2608 GroupMask = MaskForGaps
2609 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2610 MaskForGaps)
2611 : ShuffledMask;
2612 }
2613 NewLoad =
2614 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2615 GroupMask, PoisonVec, "wide.masked.vec");
2616 }
2617 else
2618 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2619 Group->getAlign(), "wide.vec");
2620 Group->addMetadata(NewLoad);
2621 NewLoads.push_back(NewLoad);
2622 }
2623
2624 // For each member in the group, shuffle out the appropriate data from the
2625 // wide loads.
2626 unsigned J = 0;
2627 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2628 Instruction *Member = Group->getMember(I);
2629
2630 // Skip the gaps in the group.
2631 if (!Member)
2632 continue;
2633
2634 auto StrideMask =
2635 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2636 for (unsigned Part = 0; Part < UF; Part++) {
2637 Value *StridedVec = Builder.CreateShuffleVector(
2638 NewLoads[Part], StrideMask, "strided.vec");
2639
2641 // If this member has a different type, cast the result to that type.
2641 if (Member->getType() != ScalarTy) {
2642 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2643 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2644 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2645 }
2646
2647 if (Group->isReverse())
2648 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2649
2650 State.set(VPDefs[J], StridedVec, Part);
2651 }
2652 ++J;
2653 }
2654 return;
2655 }
2656
2657 // The sub vector type for current instruction.
2658 auto *SubVT = VectorType::get(ScalarTy, VF);
2659
2660 // Vectorize the interleaved store group.
2661 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2662 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2663 "masked interleaved groups are not allowed.");
2664 assert((!MaskForGaps || !VF.isScalable()) &&
2665 "masking gaps for scalable vectors is not yet supported.");
2666 for (unsigned Part = 0; Part < UF; Part++) {
2667 // Collect the stored vector from each member.
2668 SmallVector<Value *, 4> StoredVecs;
2669 for (unsigned i = 0; i < InterleaveFactor; i++) {
2670 assert((Group->getMember(i) || MaskForGaps) &&
2671 "Fail to get a member from an interleaved store group");
2672 Instruction *Member = Group->getMember(i);
2673
2674 // Skip the gaps in the group.
2675 if (!Member) {
2676 Value *Undef = PoisonValue::get(SubVT);
2677 StoredVecs.push_back(Undef);
2678 continue;
2679 }
2680
2681 Value *StoredVec = State.get(StoredValues[i], Part);
2682
2683 if (Group->isReverse())
2684 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2685
2686 // If this member has a different type, cast it to the unified type.
2687
2688 if (StoredVec->getType() != SubVT)
2689 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2690
2691 StoredVecs.push_back(StoredVec);
2692 }
2693
2694 // Concatenate all vectors into a wide vector.
2695 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2696
2697 // Interleave the elements in the wide vector.
2698 Value *IVec = Builder.CreateShuffleVector(
2699 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2700 "interleaved.vec");
2701
2702 Instruction *NewStoreInstr;
2703 if (BlockInMask || MaskForGaps) {
2704 Value *GroupMask = MaskForGaps;
2705 if (BlockInMask) {
2706 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2707 Value *ShuffledMask = Builder.CreateShuffleVector(
2708 BlockInMaskPart,
2709 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2710 "interleaved.mask");
2711 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2712 ShuffledMask, MaskForGaps)
2713 : ShuffledMask;
2714 }
2715 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2716 Group->getAlign(), GroupMask);
2717 } else
2718 NewStoreInstr =
2719 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2720
2721 Group->addMetadata(NewStoreInstr);
2722 }
2723}
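
The de-interleaving (load) and interleaving (store) shuffles above, together with the replicated block mask, are all simple index patterns. A standalone sketch in plain C++ of those mask shapes for the RGB example from the comment (factor 3, VF 4); the helper names only mirror createStrideMask, createReplicatedMask and createInterleaveMask for readability and are not the LLVM utilities themselves.

#include <cstdio>
#include <vector>

std::vector<int> strideMask(unsigned Member, unsigned Factor, unsigned VF) {
  std::vector<int> M;                       // member 1: <1, 4, 7, 10>
  for (unsigned I = 0; I < VF; ++I)
    M.push_back((int)(Member + I * Factor));
  return M;
}
std::vector<int> replicatedMask(unsigned Factor, unsigned VF) {
  std::vector<int> M;                       // <0,0,0, 1,1,1, 2,2,2, 3,3,3>
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      M.push_back((int)I);
  return M;
}
std::vector<int> interleaveMask(unsigned VF, unsigned Factor) {
  std::vector<int> M;                       // <0,4,8, 1,5,9, 2,6,10, 3,7,11>
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      M.push_back((int)(J * VF + I));
  return M;
}
int main() {
  for (int X : strideMask(1, 3, 4)) std::printf("%d ", X);
  std::printf("| ");
  for (int X : replicatedMask(3, 4)) std::printf("%d ", X);
  std::printf("| ");
  for (int X : interleaveMask(4, 3)) std::printf("%d ", X);
  std::printf("\n");
}
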
2724
2725void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2726 VPReplicateRecipe *RepRecipe,
2727 const VPIteration &Instance,
2728 bool IfPredicateInstr,
2729 VPTransformState &State) {
2730 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2731
2732 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2733 // the first lane and part.
2734 if (isa<NoAliasScopeDeclInst>(Instr))
2735 if (!Instance.isFirstIteration())
2736 return;
2737
2738 // Does this instruction return a value ?
2739 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2740
2741 Instruction *Cloned = Instr->clone();
2742 if (!IsVoidRetTy)
2743 Cloned->setName(Instr->getName() + ".cloned");
2744
2745 // If the scalarized instruction contributes to the address computation of a
2746 // widened masked load/store which was in a basic block that needed predication
2747 // and is not predicated after vectorization, we can't propagate
2748 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2749 // instruction could feed a poison value to the base address of the widened
2750 // load/store.
2751 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2752 Cloned->dropPoisonGeneratingFlags();
2753
2754 if (Instr->getDebugLoc())
2755 State.setDebugLocFromInst(Instr);
2756
2757 // Replace the operands of the cloned instructions with their scalar
2758 // equivalents in the new loop.
2759 for (const auto &I : enumerate(RepRecipe->operands())) {
2760 auto InputInstance = Instance;
2761 VPValue *Operand = I.value();
2762 if (vputils::isUniformAfterVectorization(Operand))
2763 InputInstance.Lane = VPLane::getFirstLane();
2764 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2765 }
2766 State.addNewMetadata(Cloned, Instr);
2767
2768 // Place the cloned scalar in the new loop.
2769 State.Builder.Insert(Cloned);
2770
2771 State.set(RepRecipe, Cloned, Instance);
2772
2773 // If we just cloned a new assumption, add it the assumption cache.
2774 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2775 AC->registerAssumption(II);
2776
2777 // End if-block.
2778 if (IfPredicateInstr)
2779 PredicatedInstructions.push_back(Cloned);
2780}
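
A standalone sketch (plain C++, not the recipe machinery above) of the per-lane replication that scalarization performs: one clone of the scalar operation is emitted for each (part, lane) pair, while operands that are uniform-after-vectorization always read lane 0 of their part. The addition stands in for an arbitrary cloned scalar instruction.

#include <cstdio>
#include <vector>

int main() {
  const unsigned UF = 2, VF = 4;
  std::vector<std::vector<int>> In(UF, std::vector<int>(VF));
  std::vector<int> UniformBase = {100, 200};           // one value per part
  for (unsigned P = 0; P < UF; ++P)
    for (unsigned L = 0; L < VF; ++L)
      In[P][L] = (int)(P * VF + L);
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      int Cloned = UniformBase[Part] + In[Part][Lane]; // the "cloned" scalar op
      std::printf("part %u lane %u -> %d\n", Part, Lane, Cloned);
    }
}
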
2781
2782Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2783 if (TripCount)
2784 return TripCount;
2785
2786 assert(InsertBlock);
2787 IRBuilder<> Builder(InsertBlock->getTerminator());
2788 // Find the loop boundaries.
2789 ScalarEvolution *SE = PSE.getSE();
2790 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2791 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2792 "Invalid loop count");
2793
2794 Type *IdxTy = Legal->getWidestInductionType();
2795 assert(IdxTy && "No type for induction");
2796
2797 // The exit count might have the type of i64 while the phi is i32. This can
2798 // happen if we have an induction variable that is sign extended before the
2799 // compare. The only way that we get a backedge taken count is that the
2800 // induction variable was signed and as such will not overflow. In such a case
2801 // truncation is legal.
2802 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2803 IdxTy->getPrimitiveSizeInBits())
2804 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2805 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2806
2807 // Get the total trip count from the count by adding 1.
2808 const SCEV *ExitCount = SE->getAddExpr(
2809 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2810
2811 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2812
2813 // Expand the trip count and place the new instructions in the preheader.
2814 // Notice that the pre-header does not change, only the loop body.
2815 SCEVExpander Exp(*SE, DL, "induction");
2816
2817 // Count holds the overall loop count (N).
2818 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2819 InsertBlock->getTerminator());
2820
2821 if (TripCount->getType()->isPointerTy())
2822 TripCount =
2823 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2824 InsertBlock->getTerminator());
2825
2826 return TripCount;
2827}
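
A standalone sketch (plain C++) of the arithmetic above: the trip count is the backedge-taken count plus one, evaluated in the width of the widest induction type, so it can wrap to zero when the loop runs for the full range of that type. That wrapped-to-zero case is what the later minimum-iterations check (ICMP_ULE) sends to the scalar loop.

#include <cstdint>
#include <cstdio>

// Illustrative only: trip count = backedge-taken count + 1 in a fixed width.
std::uint32_t tripCountFromBTC(std::uint32_t BackedgeTakenCount) {
  return BackedgeTakenCount + 1; // wraps to 0 when BTC == UINT32_MAX
}

int main() {
  std::printf("%u %u\n", tripCountFromBTC(9), tripCountFromBTC(UINT32_MAX));
}
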
2828
2829Value *
2830InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2831 if (VectorTripCount)
2832 return VectorTripCount;
2833
2834 Value *TC = getOrCreateTripCount(InsertBlock);
2835 IRBuilder<> Builder(InsertBlock->getTerminator());
2836
2837 Type *Ty = TC->getType();
2838 // This is where we can make the step a runtime constant.
2839 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2840
2841 // If the tail is to be folded by masking, round the number of iterations N
2842 // up to a multiple of Step instead of rounding down. This is done by first
2843 // adding Step-1 and then rounding down. Note that it's ok if this addition
2844 // overflows: the vector induction variable will eventually wrap to zero given
2845 // that it starts at zero and its Step is a power of two; the loop will then
2846 // exit, with the last early-exit vector comparison also producing all-true.
2847 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2848 // is accounted for in emitIterationCountCheck that adds an overflow check.
2849 if (Cost->foldTailByMasking()) {
2850 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2851 "VF*UF must be a power of 2 when folding tail by masking");
2852 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2853 TC = Builder.CreateAdd(
2854 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2855 }
2856
2857 // Now we need to generate the expression for the part of the loop that the
2858 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2859 // iterations are not required for correctness, or N - Step, otherwise. Step
2860 // is equal to the vectorization factor (number of SIMD elements) times the
2861 // unroll factor (number of SIMD instructions).
2862 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2863
2864 // There are cases where we *must* run at least one iteration in the remainder
2865 // loop. See the cost model for when this can happen. If the step evenly
2866 // divides the trip count, we set the remainder to be equal to the step. If
2867 // the step does not evenly divide the trip count, no adjustment is necessary
2868 // since there will already be scalar iterations. Note that the minimum
2869 // iterations check ensures that N >= Step.
2870 if (Cost->requiresScalarEpilogue(VF)) {
2871 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2872 R = Builder.CreateSelect(IsZero, Step, R);
2873 }
2874
2875 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2876
2877 return VectorTripCount;
2878}
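
A standalone sketch (plain unsigned C++ arithmetic) of the vector trip-count computation above. Step stands in for VF * UF, and the two flags stand in for Cost->foldTailByMasking() and Cost->requiresScalarEpilogue(VF).

#include <cstdio>

unsigned vectorTripCount(unsigned TC, unsigned Step, bool FoldTail,
                         bool NeedsScalarEpilogue) {
  if (FoldTail)
    TC = TC + (Step - 1);        // n.rnd.up: round up to a multiple of Step
  unsigned R = TC % Step;        // n.mod.vf
  if (NeedsScalarEpilogue && R == 0)
    R = Step;                    // force at least one remainder iteration
  return TC - R;                 // n.vec
}

int main() {
  std::printf("%u %u %u\n",
              vectorTripCount(17, 8, false, false),  // 16
              vectorTripCount(17, 8, true, false),   // 24 (tail folded)
              vectorTripCount(16, 8, false, true));  // 8 (epilogue forced)
}
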
2879
2880Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2881 const DataLayout &DL) {
2882 // Verify that V is a vector type with same number of elements as DstVTy.
2883 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2884 unsigned VF = DstFVTy->getNumElements();
2885 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2886 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2887 Type *SrcElemTy = SrcVecTy->getElementType();
2888 Type *DstElemTy = DstFVTy->getElementType();
2889 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2890 "Vector elements must have same size");
2891
2892 // Do a direct cast if element types are castable.
2893 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2894 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2895 }
2896 // V cannot be directly casted to desired vector type.
2897 // May happen when V is a floating point vector but DstVTy is a vector of
2898 // pointers or vice-versa. Handle this using a two-step bitcast using an
2899 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2900 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2901 "Only one type should be a pointer type");
2902 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2903 "Only one type should be a floating point type");
2904 Type *IntTy =
2905 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2906 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2907 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2908 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2909}
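
A standalone sketch (plain C++, shown for one element rather than a whole vector) of the two-step cast idea above: a floating-point value cannot be reinterpreted as a pointer directly, so it is funnelled through a same-width integer (double <-> i64 <-> ptr). This assumes 64-bit pointers and mirrors only the concept, not the IRBuilder calls.

#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename To, typename From> To bitCast(From V) {
  static_assert(sizeof(To) == sizeof(From), "elements must have same size");
  To R;
  std::memcpy(&R, &V, sizeof(To));
  return R;
}

int main() {
  int Object = 42;
  int *Ptr = &Object;
  std::uint64_t AsInt = bitCast<std::uint64_t>(Ptr); // ptr -> i64
  double AsFP = bitCast<double>(AsInt);              // i64 -> double
  int *Back = bitCast<int *>(bitCast<std::uint64_t>(AsFP));
  std::printf("%d\n", *Back);                        // prints 42
}
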
2910
2911void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2912 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2913 // Reuse existing vector loop preheader for TC checks.
2914 // Note that new preheader block is generated for vector loop.
2915 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2916 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2917
2918 // Generate code to check if the loop's trip count is less than VF * UF, or
2919 // equal to it in case a scalar epilogue is required; this implies that the
2920 // vector trip count is zero. This check also covers the case where adding one
2921 // to the backedge-taken count overflowed leading to an incorrect trip count
2922 // of zero. In this case we will also jump to the scalar loop.
2923 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2924 : ICmpInst::ICMP_ULT;
2925
2926 // If tail is to be folded, vector loop takes care of all iterations.
2927 Type *CountTy = Count->getType();
2928 Value *CheckMinIters = Builder.getFalse();
2929 auto CreateStep = [&]() -> Value * {
2930 // Create step with max(MinProTripCount, UF * VF).
2931 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2932 return createStepForVF(Builder, CountTy, VF, UF);
2933
2934 Value *MinProfTC =
2935 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2936 if (!VF.isScalable())
2937 return MinProfTC;
2938 return Builder.CreateBinaryIntrinsic(
2939 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2940 };
2941
2942 if (!Cost->foldTailByMasking())
2943 CheckMinIters =
2944 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2945 else if (VF.isScalable()) {
2946 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2947 // an overflow to zero when updating induction variables and so an
2948 // additional overflow check is required before entering the vector loop.
2949
2950 // Get the maximum unsigned value for the type.
2951 Value *MaxUIntTripCount =
2952 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2953 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2954
2955 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2956 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2957 }
2958
2959 // Create new preheader for vector loop.
2960 LoopVectorPreHeader =
2961 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2962 "vector.ph");
2963
2964 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2965 DT->getNode(Bypass)->getIDom()) &&
2966 "TC check is expected to dominate Bypass");
2967
2968 // Update dominator for Bypass & LoopExit (if needed).
2969 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2970 if (!Cost->requiresScalarEpilogue(VF))
2971 // If there is an epilogue which must run, there's no edge from the
2972 // middle block to exit blocks and thus no need to update the immediate
2973 // dominator of the exit blocks.
2974 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2975
2976 ReplaceInstWithInst(
2977 TCCheckBlock->getTerminator(),
2978 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2979 LoopBypassBlocks.push_back(TCCheckBlock);
2980}
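
A standalone sketch (plain C++) of the CheckMinIters predicate built above: when it evaluates to true, the branch takes the bypass to the scalar loop. Count and Step are stand-ins for the trip count and max(VF * UF, MinProfitableTripCount); the flags stand in for Cost->foldTailByMasking(), Cost->requiresScalarEpilogue(VF) and VF.isScalable().

#include <cstdint>
#include <cstdio>

bool checkMinIters(std::uint64_t Count, std::uint64_t Step, bool FoldTail,
                   bool NeedsScalarEpilogue, bool ScalableVF) {
  if (!FoldTail)
    return NeedsScalarEpilogue ? Count <= Step   // ICMP_ULE
                               : Count < Step;   // ICMP_ULT
  if (ScalableVF)                                // overflow guard for the
    return (UINT64_MAX - Count) < Step;          // rounded-up trip count
  return false;                                  // tail folded, fixed VF
}

int main() {
  std::printf("%d %d %d\n",
              checkMinIters(7, 8, false, false, false),      // 1: scalar loop
              checkMinIters(8, 8, false, true, false),       // 1: epilogue needed
              checkMinIters(100, 8, true, false, true));     // 0: vector loop
}
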
2981
2982BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2983 BasicBlock *const SCEVCheckBlock =
2984 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2985 if (!SCEVCheckBlock)
2986 return nullptr;
2987
2988 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2989 (OptForSizeBasedOnProfile &&
2990 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2991 "Cannot SCEV check stride or overflow when optimizing for size");
2992
2993
2994 // Update dominator only if this is first RT check.
2995 if (LoopBypassBlocks.empty()) {
2996 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2997 if (!Cost->requiresScalarEpilogue(VF))
2998 // If there is an epilogue which must run, there's no edge from the
2999 // middle block to exit blocks and thus no need to update the immediate
3000 // dominator of the exit blocks.
3001 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3002 }
3003
3004 LoopBypassBlocks.push_back(SCEVCheckBlock);
3005 AddedSafetyChecks = true;
3006 return SCEVCheckBlock;
3007}
3008
3009BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3010 // VPlan-native path does not do any analysis for runtime checks currently.
3011 if (EnableVPlanNativePath)
3012 return nullptr;
3013
3014 BasicBlock *const MemCheckBlock =
3015 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3016
3017 // Check if we generated code that checks at runtime whether the arrays
3018 // overlap. We put the checks into a separate block to make the more common
3019 // case of few elements faster.
3020 if (!MemCheckBlock)
3021 return nullptr;
3022
3023 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3024 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3025 "Cannot emit memory checks when optimizing for size, unless forced "
3026 "to vectorize.");
3027 ORE->emit([&]() {
3028 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3029 OrigLoop->getStartLoc(),
3030 OrigLoop->getHeader())
3031 << "Code-size may be reduced by not forcing "
3032 "vectorization, or by source-code modifications "
3033 "eliminating the need for runtime checks "
3034 "(e.g., adding 'restrict').";
3035 });
3036 }
3037
3038 LoopBypassBlocks.push_back(MemCheckBlock);
3039
3040 AddedSafetyChecks = true;
3041
3042 return MemCheckBlock;
3043}
3044
3045void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3046 LoopScalarBody = OrigLoop->getHeader();
3047 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3048 assert(LoopVectorPreHeader && "Invalid loop structure");
3049 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3050 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3051 "multiple exit loop without required epilogue?");
3052
3053 LoopMiddleBlock =
3054 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3055 LI, nullptr, Twine(Prefix) + "middle.block");
3056 LoopScalarPreHeader =
3057 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3058 nullptr, Twine(Prefix) + "scalar.ph");
3059
3060 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3061
3062 // Set up the middle block terminator. Two cases:
3063 // 1) If we know that we must execute the scalar epilogue, emit an
3064 // unconditional branch.
3065 // 2) Otherwise, we must have a single unique exit block (due to how we
3066 // implement the multiple exit case). In this case, set up a conditional
3067 // branch from the middle block to the loop scalar preheader, and the
3068 // exit block. completeLoopSkeleton will update the condition to use an
3069 // iteration check, if required to decide whether to execute the remainder.
3070 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3071 BranchInst::Create(LoopScalarPreHeader) :
3072 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3073 Builder.getTrue());
3074 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3075 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3076
3077 // Update dominator for loop exit. During skeleton creation, only the vector
3078 // pre-header and the middle block are created. The vector loop is entirely
3079 // created during VPlan execution.
3080 if (!Cost->requiresScalarEpilogue(VF))
3081 // If there is an epilogue which must run, there's no edge from the
3082 // middle block to exit blocks and thus no need to update the immediate
3083 // dominator of the exit blocks.
3084 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3085}
3086
3087void InnerLoopVectorizer::createInductionResumeValues(
3088 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3089 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3090 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3091 "Inconsistent information about additional bypass.");
3092
3093 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3094 assert(VectorTripCount && "Expected valid arguments");
3095 // We are going to resume the execution of the scalar loop.
3096 // Go over all of the induction variables that we found and fix the
3097 // PHIs that are left in the scalar version of the loop.
3098 // The starting values of PHI nodes depend on the counter of the last
3099 // iteration in the vectorized loop.
3100 // If we come from a bypass edge then we need to start from the original
3101 // start value.
3102 Instruction *OldInduction = Legal->getPrimaryInduction();
3103 for (const auto &InductionEntry : Legal->getInductionVars()) {
3104 PHINode *OrigPhi = InductionEntry.first;
3105 InductionDescriptor II = InductionEntry.second;
3106
3107 Value *&EndValue = IVEndValues[OrigPhi];
3108 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3109 if (OrigPhi == OldInduction) {
3110 // We know what the end value is.
3111 EndValue = VectorTripCount;
3112 } else {
3113 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3114
3115 // Fast-math-flags propagate from the original induction instruction.
3116 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3117 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3118
3119 Type *StepType = II.getStep()->getType();
3120 Instruction::CastOps CastOp =
3121 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3122 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3123 Value *Step =
3124 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3125 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3126 EndValue->setName("ind.end");
3127
3128 // Compute the end value for the additional bypass (if applicable).
3129 if (AdditionalBypass.first) {
3130 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3131 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3132 StepType, true);
3133 Value *Step =
3134 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3135 VTC =
3136 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3137 EndValueFromAdditionalBypass =
3138 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3139 EndValueFromAdditionalBypass->setName("ind.end");
3140 }
3141 }
3142
3143 // Create phi nodes to merge from the backedge-taken check block.
3144 PHINode *BCResumeVal =
3145 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3146 LoopScalarPreHeader->getTerminator());
3147 // Copy original phi DL over to the new one.
3148 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3149
3150 // The new PHI merges the original incoming value, in case of a bypass,
3151 // or the value at the end of the vectorized loop.
3152 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3153
3154 // Fix the scalar body counter (PHI node).
3155 // The old induction's phi node in the scalar body needs the truncated
3156 // value.
3157 for (BasicBlock *BB : LoopBypassBlocks)
3158 BCResumeVal->addIncoming(II.getStartValue(), BB);
3159
3160 if (AdditionalBypass.first)
3161 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3162 EndValueFromAdditionalBypass);
3163
3164 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3165 }
3166}
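// Editor's note: an illustrative shorthand of the resume phi created above;
// this is not part of LoopVectorize.cpp, and the block and value names are
// hypothetical. For a canonical induction starting at 0, the scalar preheader
// typically ends up with:
//
//   scalar.ph:
//     %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                              [ 0, %vector.scevcheck ],
//                              [ 0, %vector.memcheck ]
//
// i.e. the scalar loop resumes at the vector trip count when reached via the
// middle block, and at the original start value when reached via a bypass.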
3167
3168BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3169 // The trip counts should be cached by now.
3170 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3171 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3172
3173 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3174
3175 // Add a check in the middle block to see if we have completed
3176 // all of the iterations in the first vector loop. Three cases:
3177 // 1) If we require a scalar epilogue, there is no conditional branch as
3178 // we unconditionally branch to the scalar preheader. Do nothing.
3179 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3180 // Thus if tail is to be folded, we know we don't need to run the
3181 // remainder and we can use the previous value for the condition (true).
3182 // 3) Otherwise, construct a runtime check.
3183 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3184 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3185 Count, VectorTripCount, "cmp.n",
3186 LoopMiddleBlock->getTerminator());
3187
3188 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3189 // of the corresponding compare because they may have ended up with
3190 // different line numbers and we want to avoid awkward line stepping while
3191 // debugging. E.g., if the compare has a line number inside the loop.
3192 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3193 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3194 }
3195
3196#ifdef EXPENSIVE_CHECKS
3197 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3198#endif
3199
3200 return LoopVectorPreHeader;
3201}
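// Editor's note: an illustrative shorthand of the check inserted above for
// case 3; this is not part of LoopVectorize.cpp and the block names are
// hypothetical.
//
//   middle.block:
//     %cmp.n = icmp eq i64 %N, %n.vec
//     br i1 %cmp.n, label %exit, label %scalar.ph
//
// With a required scalar epilogue (case 1) the branch stays unconditional, and
// with tail folding (case 2) the condition remains the constant 'true' set up
// in createVectorLoopSkeleton.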
3202
3203std::pair<BasicBlock *, Value *>
3204InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3205 /*
3206 In this function we generate a new loop. The new loop will contain
3207 the vectorized instructions while the old loop will continue to run the
3208 scalar remainder.
3209
3210 [ ] <-- loop iteration number check.
3211 / |
3212 / v
3213 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3214 | / |
3215 | / v
3216 || [ ] <-- vector pre header.
3217 |/ |
3218 | v
3219 | [ ] \
3220 | [ ]_| <-- vector loop (created during VPlan execution).
3221 | |
3222 | v
3223 \ -[ ] <--- middle-block.
3224 \/ |
3225 /\ v
3226 | ->[ ] <--- new preheader.
3227 | |
3228 (opt) v <-- edge from middle to exit iff epilogue is not required.
3229 | [ ] \
3230 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3231 \ |
3232 \ v
3233 >[ ] <-- exit block(s).
3234 ...
3235 */
3236
3237 // Get the metadata of the original loop before it gets modified.
3238 MDNode *OrigLoopID = OrigLoop->getLoopID();
3239
3240 // Workaround! Compute the trip count of the original loop and cache it
3241 // before we start modifying the CFG. This code has a systemic problem
3242 // wherein it tries to run analysis over partially constructed IR; this is
3243 // wrong, and not simply for SCEV. The trip count of the original loop
3244 // simply happens to be prone to hitting this in practice. In theory, we
3245 // can hit the same issue for any SCEV, or ValueTracking query done during
3246 // mutation. See PR49900.
3247 getOrCreateTripCount(OrigLoop->getLoopPreheader());
3248
3249 // Create an empty vector loop, and prepare basic blocks for the runtime
3250 // checks.
3251 createVectorLoopSkeleton("");
3252
3253 // Now, compare the new count to zero. If it is zero skip the vector loop and
3254 // jump to the scalar loop. This check also covers the case where the
3255 // backedge-taken count is uint##_max: adding one to it will overflow leading
3256 // to an incorrect trip count of zero. In this (rare) case we will also jump
3257 // to the scalar loop.
3258 emitIterationCountCheck(LoopScalarPreHeader);
3259
3260 // Generate the code to check any assumptions that we've made for SCEV
3261 // expressions.
3262 emitSCEVChecks(LoopScalarPreHeader);
3263
3264 // Generate the code that checks at runtime whether arrays overlap. We put
3265 // the checks into a separate block to make the more common case of few
3266 // elements faster.
3267 emitMemRuntimeChecks(LoopScalarPreHeader);
3268
3269 // Emit phis for the new starting index of the scalar loop.
3270 createInductionResumeValues();
3271
3272 return {completeLoopSkeleton(OrigLoopID), nullptr};
3273}
3274
3275// Fix up external users of the induction variable. At this point, we are
3276// in LCSSA form, with all external PHIs that use the IV having one input value,
3277// coming from the remainder loop. We need those PHIs to also have a correct
3278// value for the IV when arriving directly from the middle block.
3279void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3280 const InductionDescriptor &II,
3281 Value *VectorTripCount, Value *EndValue,
3282 BasicBlock *MiddleBlock,
3283 BasicBlock *VectorHeader, VPlan &Plan) {
3284 // There are two kinds of external IV usages - those that use the value
3285 // computed in the last iteration (the PHI) and those that use the penultimate
3286 // value (the value that feeds into the phi from the loop latch).
3287 // We allow both, but they, obviously, have different values.
3288
3289 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3290
3291 DenseMap<Value *, Value *> MissingVals;
3292
3293 // An external user of the last iteration's value should see the value that
3294 // the remainder loop uses to initialize its own IV.
3295 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3296 for (User *U : PostInc->users()) {
3297 Instruction *UI = cast<Instruction>(U);
3298 if (!OrigLoop->contains(UI)) {
3299 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3300 MissingVals[UI] = EndValue;
3301 }
3302 }
3303
3304 // An external user of the penultimate value needs to see EndValue - Step.
3305 // The simplest way to get this is to recompute it from the constituent SCEVs,
3306 // that is Start + (Step * (CRD - 1)).
3307 for (User *U : OrigPhi->users()) {
3308 auto *UI = cast<Instruction>(U);
3309 if (!OrigLoop->contains(UI)) {
3310 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3311
3312 IRBuilder<> B(MiddleBlock->getTerminator());
3313
3314 // Fast-math-flags propagate from the original induction instruction.
3315 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3316 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3317
3318 Value *CountMinusOne = B.CreateSub(
3319 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3320 Value *CMO =
3321 !II.getStep()->getType()->isIntegerTy()
3322 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3323 II.getStep()->getType())
3324 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3325 CMO->setName("cast.cmo");
3326
3327 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3328 VectorHeader->getTerminator());
3329 Value *Escape =
3330 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3331 Escape->setName("ind.escape");
3332 MissingVals[UI] = Escape;
3333 }
3334 }
3335
3336 for (auto &I : MissingVals) {
3337 PHINode *PHI = cast<PHINode>(I.first);
3338 // One corner case we have to handle is two IVs "chasing" each other,
3339 // that is %IV2 = phi [...], [ %IV1, %latch ]
3340 // In this case, if IV1 has an external use, we need to avoid adding both
3341 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3342 // don't already have an incoming value for the middle block.
3343 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3344 PHI->addIncoming(I.second, MiddleBlock);
3345 Plan.removeLiveOut(PHI);
3346 }
3347 }
3348}
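// Editor's note: a hedged worked example of the "penultimate value" fix-up
// above; the numbers are hypothetical and not taken from the report. For an
// add induction i = Start + Step * iv with Start = 3, Step = 2 and a vector
// trip count of 8, the value the remainder loop resumes from is 3 + 2*8 = 19
// (EndValue), while an external user of the pre-increment (penultimate) value
// must see 3 + 2*(8-1) = 17, which is exactly the "ind.escape" value emitted
// via emitTransformedIndex on the vector trip count minus one.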
3349
3350namespace {
3351
3352struct CSEDenseMapInfo {
3353 static bool canHandle(const Instruction *I) {
3354 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3355 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3356 }
3357
3358 static inline Instruction *getEmptyKey() {
3359 return DenseMapInfo<Instruction *>::getEmptyKey();
3360 }
3361
3362 static inline Instruction *getTombstoneKey() {
3363 return DenseMapInfo<Instruction *>::getTombstoneKey();
3364 }
3365
3366 static unsigned getHashValue(const Instruction *I) {
3367 assert(canHandle(I) && "Unknown instruction!");
3368 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3369 I->value_op_end()));
3370 }
3371
3372 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3373 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3374 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3375 return LHS == RHS;
3376 return LHS->isIdenticalTo(RHS);
3377 }
3378};
3379
3380} // end anonymous namespace
3381
3382 /// Perform CSE of induction variable instructions.
3383static void cse(BasicBlock *BB) {
3384 // Perform simple cse.
3385 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3386 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3387 if (!CSEDenseMapInfo::canHandle(&In))
3388 continue;
3389
3390 // Check if we can replace this instruction with any of the
3391 // visited instructions.
3392 if (Instruction *V = CSEMap.lookup(&In)) {
3393 In.replaceAllUsesWith(V);
3394 In.eraseFromParent();
3395 continue;
3396 }
3397
3398 CSEMap[&In] = &In;
3399 }
3400}
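// Editor's note: illustrative only, not part of LoopVectorize.cpp. The map
// above keys instructions by opcode plus operand values, so two identical
// address computations such as
//
//   %gep.a = getelementptr inbounds i32, ptr %base, i64 %idx
//   %gep.b = getelementptr inbounds i32, ptr %base, i64 %idx
//
// hash equally and compare equal via isIdenticalTo(); the second one is
// replaced by the first (replaceAllUsesWith) and erased.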
3401
3402InstructionCost
3403LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3404 bool &NeedToScalarize) const {
3405 Function *F = CI->getCalledFunction();
3406 Type *ScalarRetTy = CI->getType();
3407 SmallVector<Type *, 4> Tys, ScalarTys;
3408 for (auto &ArgOp : CI->args())
3409 ScalarTys.push_back(ArgOp->getType());
3410
3411 // Estimate cost of scalarized vector call. The source operands are assumed
3412 // to be vectors, so we need to extract individual elements from there,
3413 // execute VF scalar calls, and then gather the result into the vector return
3414 // value.
3415 InstructionCost ScalarCallCost =
3416 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3417 if (VF.isScalar())
3418 return ScalarCallCost;
3419
3420 // Compute corresponding vector type for return value and arguments.
3421 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3422 for (Type *ScalarTy : ScalarTys)
3423 Tys.push_back(ToVectorTy(ScalarTy, VF));
3424
3425 // Compute costs of unpacking argument values for the scalar calls and
3426 // packing the return values to a vector.
3427 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3428
3429 InstructionCost Cost =
3430 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3431
3432 // If we can't emit a vector call for this function, then the currently found
3433 // cost is the cost we need to return.
3434 NeedToScalarize = true;
3435 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3436 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3437
3438 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3439 return Cost;
3440
3441 // If the corresponding vector cost is cheaper, return its cost.
3442 InstructionCost VectorCallCost =
3443 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3444 if (VectorCallCost < Cost) {
3445 NeedToScalarize = false;
3446 Cost = VectorCallCost;
3447 }
3448 return Cost;
3449}
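// Editor's note: a minimal standalone sketch of the cost comparison performed
// above, assuming hypothetical numbers; the helper below is not LLVM API.
// Scalarized cost = ScalarCallCost * VF + ScalarizationCost, and a declared
// vector variant is only used when it is strictly cheaper.
#include <cstdint>
static uint64_t pickCallCost(uint64_t ScalarCallCost, uint64_t VF,
                             uint64_t ScalarizationCost,
                             bool HasVectorVariant, uint64_t VectorCallCost,
                             bool &NeedToScalarize) {
  // Cost of extracting operands, running VF scalar calls, and repacking.
  uint64_t Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}
// E.g. VF = 4, ScalarCallCost = 10, ScalarizationCost = 12 gives 52; a vector
// variant costing 20 wins, so NeedToScalarize becomes false.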
3450
3451static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3452 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3453 return Elt;
3454 return VectorType::get(Elt, VF);
3455}
3456
3457InstructionCost
3458LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3459 ElementCount VF) const {
3460 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3461 assert(ID && "Expected intrinsic call!");
3462 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3463 FastMathFlags FMF;
3464 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3465 FMF = FPMO->getFastMathFlags();
3466
3467 SmallVector<const Value *> Arguments(CI->args());
3468 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3469 SmallVector<Type *> ParamTys;
3470 std::transform(FTy->param_begin(), FTy->param_end(),
3471 std::back_inserter(ParamTys),
3472 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3473
3474 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3475 dyn_cast<IntrinsicInst>(CI));
3476 return TTI.getIntrinsicInstrCost(CostAttrs,
3477 TargetTransformInfo::TCK_RecipThroughput);
3478}
3479
3480static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3481 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3482 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3483 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3484}
3485
3486static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3487 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3488 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3489 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3490}
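// Editor's note: illustrative only. Given T1 = <4 x i16> and T2 = <4 x i32>,
// smallestIntegerVectorType returns <4 x i16> and largestIntegerVectorType
// returns <4 x i32>; only the element bit widths are compared.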
3491
3492void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3493 // For every instruction `I` in MinBWs, truncate the operands, create a
3494 // truncated version of `I` and reextend its result. InstCombine runs
3495 // later and will remove any ext/trunc pairs.
3496 SmallPtrSet<Value *, 4> Erased;
3497 for (const auto &KV : Cost->getMinimalBitwidths()) {
3498 // If the value wasn't vectorized, we must maintain the original scalar
3499 // type. The absence of the value from State indicates that it
3500 // wasn't vectorized.
3501 // FIXME: Should not rely on getVPValue at this point.
3502 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3503 if (!State.hasAnyVectorValue(Def))
3504 continue;
3505 for (unsigned Part = 0; Part < UF; ++Part) {
3506 Value *I = State.get(Def, Part);
3507 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3508 continue;
3509 Type *OriginalTy = I->getType();
3510 Type *ScalarTruncatedTy =
3511 IntegerType::get(OriginalTy->getContext(), KV.second);
3512 auto *TruncatedTy = VectorType::get(
3513 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3514 if (TruncatedTy == OriginalTy)
3515 continue;
3516
3517 IRBuilder<> B(cast<Instruction>(I));
3518 auto ShrinkOperand = [&](Value *V) -> Value * {
3519 if (auto *ZI = dyn_cast<ZExtInst>(V))
3520 if (ZI->getSrcTy() == TruncatedTy)
3521 return ZI->getOperand(0);
3522 return B.CreateZExtOrTrunc(V, TruncatedTy);
3523 };
3524
3525 // The actual instruction modification depends on the instruction type,
3526 // unfortunately.
3527 Value *NewI = nullptr;
3528 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3529 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3530 ShrinkOperand(BO->getOperand(1)));
3531
3532 // Any wrapping introduced by shrinking this operation shouldn't be
3533 // considered undefined behavior. So, we can't unconditionally copy
3534 // arithmetic wrapping flags to NewI.
3535 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3536 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3537 NewI =
3538 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3539 ShrinkOperand(CI->getOperand(1)));
3540 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3541 NewI = B.CreateSelect(SI->getCondition(),
3542 ShrinkOperand(SI->getTrueValue()),
3543 ShrinkOperand(SI->getFalseValue()));
3544 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3545 switch (CI->getOpcode()) {
3546 default:
3547 llvm_unreachable("Unhandled cast!");
3548 case Instruction::Trunc:
3549 NewI = ShrinkOperand(CI->getOperand(0));
3550 break;
3551 case Instruction::SExt:
3552 NewI = B.CreateSExtOrTrunc(
3553 CI->getOperand(0),
3554 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3555 break;
3556 case Instruction::ZExt:
3557 NewI = B.CreateZExtOrTrunc(
3558 CI->getOperand(0),
3559 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3560 break;
3561 }
3562 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3563 auto Elements0 =
3564 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3565 auto *O0 = B.CreateZExtOrTrunc(
3566 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3567 auto Elements1 =
3568 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3569 auto *O1 = B.CreateZExtOrTrunc(
3570 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3571
3572 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3573 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3574 // Don't do anything with the operands, just extend the result.
3575 continue;
3576 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3577 auto Elements =
3578 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3579 auto *O0 = B.CreateZExtOrTrunc(
3580 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3581 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3582 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3583 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3584 auto Elements =
3585 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3586 auto *O0 = B.CreateZExtOrTrunc(
3587 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3588 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3589 } else {
3590 // If we don't know what to do, be conservative and don't do anything.
3591 continue;
3592 }
3593
3594 // Lastly, extend the result.
3595 NewI->takeName(cast<Instruction>(I));
3596 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3597 I->replaceAllUsesWith(Res);
3598 cast<Instruction>(I)->eraseFromParent();
3599 Erased.insert(I);
3600 State.reset(Def, Res, Part);
3601 }
3602 }
3603
3604 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3605 for (const auto &KV : Cost->getMinimalBitwidths()) {
3606 // If the value wasn't vectorized, we must maintain the original scalar
3607 // type. The absence of the value from State indicates that it
3608 // wasn't vectorized.
3609 // FIXME: Should not rely on getVPValue at this point.
3610 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3611 if (!State.hasAnyVectorValue(Def))
3612 continue;
3613 for (unsigned Part = 0; Part < UF; ++Part) {
3614 Value *I = State.get(Def, Part);
3615 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3616 if (Inst && Inst->use_empty()) {
3617 Value *NewI = Inst->getOperand(0);
3618 Inst->eraseFromParent();
3619 State.reset(Def, NewI, Part);
3620 }
3621 }
3622 }
3623}
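// Editor's note: an illustrative shorthand of the rewrite performed above;
// this is not part of LoopVectorize.cpp and the value names are hypothetical.
// If MinBWs says an i32 operation only needs 8 bits, a widened
//
//   %add = add <4 x i32> %a, %b
//
// becomes
//
//   %a.tr  = trunc <4 x i32> %a to <4 x i8>
//   %b.tr  = trunc <4 x i32> %b to <4 x i8>
//   %add.t = add <4 x i8> %a.tr, %b.tr          ; wrap flags dropped
//   %res   = zext <4 x i8> %add.t to <4 x i32>
//
// leaving InstCombine to remove redundant ext/trunc pairs later.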
3624
3625void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3626 VPlan &Plan) {
3627 // Insert truncates and extends for any truncated instructions as hints to
3628 // InstCombine.
3629 if (VF.isVector())
3630 truncateToMinimalBitwidths(State);
3631
3632 // Fix widened non-induction PHIs by setting up the PHI operands.
3633 if (EnableVPlanNativePath)
3634 fixNonInductionPHIs(Plan, State);
3635
3636 // At this point every instruction in the original loop is widened to a
3637 // vector form. Now we need to fix the recurrences in the loop. These PHI
3638 // nodes are currently empty because we did not want to introduce cycles.
3639 // This is the second stage of vectorizing recurrences.
3640 fixCrossIterationPHIs(State);
3641
3642 // Forget the original basic block.
3643 PSE.getSE()->forgetLoop(OrigLoop);
3644
3645 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3646 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3647 if (Cost->requiresScalarEpilogue(VF)) {
3648 // No edge from the middle block to the unique exit block has been inserted
3650 // and there is nothing to fix from the vector loop; phis should have
3651 // incoming values from the scalar loop only.
3651 Plan.clearLiveOuts();
3652 } else {
3653 // If we inserted an edge from the middle block to the unique exit block,
3654 // update uses outside the loop (phis) to account for the newly inserted
3655 // edge.
3656
3657 // Fix-up external users of the induction variables.
3658 for (const auto &Entry : Legal->getInductionVars())
3659 fixupIVUsers(Entry.first, Entry.second,
3660 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3661 IVEndValues[Entry.first], LoopMiddleBlock,
3662 VectorLoop->getHeader(), Plan);
3663 }
3664
3665 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3666 // in the exit block, so update the builder.
3667 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3668 for (const auto &KV : Plan.getLiveOuts())
3669 KV.second->fixPhi(Plan, State);
3670
3671 for (Instruction *PI : PredicatedInstructions)
3672 sinkScalarOperands(&*PI);
3673
3674 // Remove redundant induction instructions.
3675 cse(VectorLoop->getHeader());
3676
3677 // Set/update profile weights for the vector and remainder loops as original
3678 // loop iterations are now distributed among them. Note that original loop
3679 // represented by LoopScalarBody becomes remainder loop after vectorization.
3680 //
3681 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3682 // end up with a slightly less accurate result, but that should be OK since
3683 // the profile is not inherently precise anyway. Note also that any possible
3684 // bypass of the vector code caused by legality checks is ignored, so all the
3685 // weight is optimistically assigned to the vector loop.
3686 //
3687 // For scalable vectorization we can't know at compile time how many iterations
3688 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3689 // vscale of '1'.
3690 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3691 LI->getLoopFor(LoopScalarBody),
3692 VF.getKnownMinValue() * UF);
3693}
3694
3695void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3696 // In order to support recurrences we need to be able to vectorize Phi nodes.
3697 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3698 // stage #2: We now need to fix the recurrences by adding incoming edges to
3699 // the currently empty PHI nodes. At this point every instruction in the
3700 // original loop is widened to a vector form so we can use them to construct
3701 // the incoming edges.
3702 VPBasicBlock *Header =
3703 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3704 for (VPRecipeBase &R : Header->phis()) {
3705 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3706 fixReduction(ReductionPhi, State);
3707 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3708 fixFixedOrderRecurrence(FOR, State);
3709 }
3710}
3711
3712void InnerLoopVectorizer::fixFixedOrderRecurrence(
3713 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3714 // This is the second phase of vectorizing first-order recurrences. An
3715 // overview of the transformation is described below. Suppose we have the
3716 // following loop.
3717 //
3718 // for (int i = 0; i < n; ++i)
3719 // b[i] = a[i] - a[i - 1];
3720 //
3721 // There is a first-order recurrence on "a". For this loop, the shorthand
3722 // scalar IR looks like:
3723 //
3724 // scalar.ph:
3725 // s_init = a[-1]
3726 // br scalar.body
3727 //
3728 // scalar.body:
3729 // i = phi [0, scalar.ph], [i+1, scalar.body]
3730 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3731 // s2 = a[i]
3732 // b[i] = s2 - s1
3733 // br cond, scalar.body, ...
3734 //
3735 // In this example, s1 is a recurrence because its value depends on the
3736 // previous iteration. In the first phase of vectorization, we created a
3737 // vector phi v1 for s1. We now complete the vectorization and produce the
3738 // shorthand vector IR shown below (for VF = 4, UF = 1).
3739 //
3740 // vector.ph:
3741 // v_init = vector(..., ..., ..., a[-1])
3742 // br vector.body
3743 //
3744 // vector.body
3745 // i = phi [0, vector.ph], [i+4, vector.body]
3746 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3747 // v2 = a[i, i+1, i+2, i+3];
3748 // v3 = vector(v1(3), v2(0, 1, 2))
3749 // b[i, i+1, i+2, i+3] = v2 - v3
3750 // br cond, vector.body, middle.block
3751 //
3752 // middle.block:
3753 // x = v2(3)
3754 // br scalar.ph
3755 //
3756 // scalar.ph:
3757 // s_init = phi [x, middle.block], [a[-1], otherwise]
3758 // br scalar.body
3759 //
3760 // After execution completes the vector loop, we extract the next value of
3761 // the recurrence (x) to use as the initial value in the scalar loop.
3762
3763 // Extract the last vector element in the middle block. This will be the
3764 // initial value for the recurrence when jumping to the scalar loop.
3765 VPValue *PreviousDef = PhiR->getBackedgeValue();
3766 Value *Incoming = State.get(PreviousDef, UF - 1);
3767 auto *ExtractForScalar = Incoming;
3768 auto *IdxTy = Builder.getInt32Ty();
3769 if (VF.isVector()) {
3770 auto *One = ConstantInt::get(IdxTy, 1);
3771 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3772 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3773 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3774 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3775 "vector.recur.extract");
3776 }
3777 // Extract the second last element in the middle block if the
3778 // Phi is used outside the loop. We need to extract the phi itself
3779 // and not the last element (the phi update in the current iteration). This
3780 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3781 // when the scalar loop is not run at all.
3782 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3783 if (VF.isVector()) {
3784 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3785 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3786 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3787 Incoming, Idx, "vector.recur.extract.for.phi");
3788 } else if (UF > 1)
3789 // When loop is unrolled without vectorizing, initialize
3790 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3791 // of `Incoming`. This is analogous to the vectorized case above: extracting
3792 // the second last element when VF > 1.
3793 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3794
3795 // Fix the initial value of the original recurrence in the scalar loop.
3796 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3797 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3798 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3799 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3800 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3801 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3802 Start->addIncoming(Incoming, BB);
3803 }
3804
3805 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3806 Phi->setName("scalar.recur");
3807
3808 // Finally, fix users of the recurrence outside the loop. The users will need
3809 // either the last value of the scalar recurrence or the last value of the
3810 // vector recurrence we extracted in the middle block. Since the loop is in
3811 // LCSSA form, we just need to find all the phi nodes for the original scalar
3812 // recurrence in the exit block, and then add an edge for the middle block.
3813 // Note that LCSSA does not imply single entry when the original scalar loop
3814 // had multiple exiting edges (as we always run the last iteration in the
3815 // scalar epilogue); in that case, there is no edge from the middle block to
3816 // the exit and thus no phis which need to be updated.
3817 if (!Cost->requiresScalarEpilogue(VF))
3818 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3819 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3820 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3821 State.Plan->removeLiveOut(&LCSSAPhi);
3822 }
3823}
3824
3825void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3826 VPTransformState &State) {
3827 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3828 // Get its reduction variable descriptor.
3829 assert(Legal->isReductionVariable(OrigPhi) &&
3830 "Unable to find the reduction variable");
3831 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3832
3833 RecurKind RK = RdxDesc.getRecurrenceKind();
3834 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3835 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3836 State.setDebugLocFromInst(ReductionStartValue);
3837
3838 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3839 // This is the vector-clone of the value that leaves the loop.
3840 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3841
3842 // Wrap flags are in general invalid after vectorization, clear them.
3843 clearReductionWrapFlags(PhiR, State);
3844
3845 // Before each round, move the insertion point right between
3846 // the PHIs and the values we are going to write.
3847 // This allows us to write both PHINodes and the extractelement
3848 // instructions.
3849 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3850
3851 State.setDebugLocFromInst(LoopExitInst);
3852
3853 Type *PhiTy = OrigPhi->getType();
3854
3855 VPBasicBlock *LatchVPBB =
3856 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3857 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3858 // If tail is folded by masking, the vector value to leave the loop should be
3859 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3860 // instead of the former. For an inloop reduction the reduction will already
3861 // be predicated, and does not need to be handled here.
3862 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3863 for (unsigned Part = 0; Part < UF; ++Part) {
3864 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3865 SelectInst *Sel = nullptr;
3866 for (User *U : VecLoopExitInst->users()) {
3867 if (isa<SelectInst>(U)) {
3868 assert(!Sel && "Reduction exit feeding two selects");
3869 Sel = cast<SelectInst>(U);
3870 } else
3871 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3872 }
3873 assert(Sel && "Reduction exit feeds no select");
3874 State.reset(LoopExitInstDef, Sel, Part);
3875
3876 if (isa<FPMathOperator>(Sel))
3877 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3878
3879 // If the target can create a predicated operator for the reduction at no
3880 // extra cost in the loop (for example a predicated vadd), it can be
3881 // cheaper for the select to remain in the loop than be sunk out of it,
3882 // and so use the select value for the phi instead of the old
3883 // LoopExitValue.
3884 if (PreferPredicatedReductionSelect ||
3885 TTI->preferPredicatedReductionSelect(
3886 RdxDesc.getOpcode(), PhiTy,
3887 TargetTransformInfo::ReductionFlags())) {
3888 auto *VecRdxPhi =
3889 cast<PHINode>(State.get(PhiR, Part));
3890 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3891 }
3892 }
3893 }
3894
3895 // If the vector reduction can be performed in a smaller type, we truncate
3896 // then extend the loop exit value to enable InstCombine to evaluate the
3897 // entire expression in the smaller type.
3898 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3899 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3900 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3901 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3902 VectorParts RdxParts(UF);
3903 for (unsigned Part = 0; Part < UF; ++Part) {
3904 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3905 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3906 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3907 : Builder.CreateZExt(Trunc, VecTy);
3908 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3909 if (U != Trunc) {
3910 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3911 RdxParts[Part] = Extnd;
3912 }
3913 }
3914 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3915 for (unsigned Part = 0; Part < UF; ++Part) {
3916 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3917 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3918 }
3919 }
3920
3921 // Reduce all of the unrolled parts into a single vector.
3922 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3923 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3924
3925 // The middle block terminator has already been assigned a DebugLoc here (the
3926 // OrigLoop's single latch terminator). We want the whole middle block to
3927 // appear to execute on this line because: (a) it is all compiler generated,
3928 // (b) these instructions are always executed after evaluating the latch
3929 // conditional branch, and (c) other passes may add new predecessors which
3930 // terminate on this line. This is the easiest way to ensure we don't
3931 // accidentally cause an extra step back into the loop while debugging.
3932 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3933 if (PhiR->isOrdered())
3934 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3935 else {
3936 // Floating-point operations should have some FMF to enable the reduction.
3937 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3938 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3939 for (unsigned Part = 1; Part < UF; ++Part) {
3940 Value *RdxPart = State.get(LoopExitInstDef, Part);
3941 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3942 ReducedPartRdx = Builder.CreateBinOp(
3943 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3944 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3945 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3946 ReducedPartRdx, RdxPart);
3947 else
3948 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3949 }
3950 }
3951
3952 // Create the reduction after the loop. Note that inloop reductions create the
3953 // target reduction in the loop using a Reduction recipe.
3954 if (VF.isVector() && !PhiR->isInLoop()) {
3955 ReducedPartRdx =
3956 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3957 // If the reduction can be performed in a smaller type, we need to extend
3958 // the reduction to the wider type before we branch to the original loop.
3959 if (PhiTy != RdxDesc.getRecurrenceType())
3960 ReducedPartRdx = RdxDesc.isSigned()
3961 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3962 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3963 }
3964
3965 PHINode *ResumePhi =
3966 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3967
3968 // Create a phi node that merges control-flow from the backedge-taken check
3969 // block and the middle block.
3970 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3971 LoopScalarPreHeader->getTerminator());
3972
3973 // If we are fixing reductions in the epilogue loop then we should already
3974 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3975 // we carry over the incoming values correctly.
3976 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3977 if (Incoming == LoopMiddleBlock)
3978 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3979 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3980 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3981 Incoming);
3982 else
3983 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3984 }
3985
3986 // Set the resume value for this reduction
3987 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3988
3989 // If there were stores of the reduction value to a uniform memory address
3990 // inside the loop, create the final store here.
3991 if (StoreInst *SI = RdxDesc.IntermediateStore) {
3992 StoreInst *NewSI =
3993 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
3994 propagateMetadata(NewSI, SI);
3995
3996 // If the reduction value is used in other places,
3997 // then let the code below create PHI's for that.
3998 }
3999
4000 // Now, we need to fix the users of the reduction variable
4001 // inside and outside of the scalar remainder loop.
4002
4003 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4004 // in the exit blocks. See comment on analogous loop in
4005 // fixFixedOrderRecurrence for a more complete explanation of the logic.
4006 if (!Cost->requiresScalarEpilogue(VF))
4007 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4008 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4009 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4010 State.Plan->removeLiveOut(&LCSSAPhi);
4011 }
4012
4013 // Fix the scalar loop reduction variable with the incoming reduction sum
4014 // from the vector body and from the backedge value.
4015 int IncomingEdgeBlockIdx =
4016 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4017 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4018 // Pick the other block.
4019 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4020 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4021 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4022}
4023
4024void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4025 VPTransformState &State) {
4026 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4027 RecurKind RK = RdxDesc.getRecurrenceKind();
4028 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4029 return;
4030
4031 SmallVector<VPValue *, 8> Worklist;
4032 SmallPtrSet<VPValue *, 8> Visited;
4033 Worklist.push_back(PhiR);
4034 Visited.insert(PhiR);
4035
4036 while (!Worklist.empty()) {
4037 VPValue *Cur = Worklist.pop_back_val();
4038 for (unsigned Part = 0; Part < UF; ++Part) {
4039 Value *V = State.get(Cur, Part);
4040 if (!isa<OverflowingBinaryOperator>(V))
4041 break;
4042 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4043 }
4044
4045 for (VPUser *U : Cur->users()) {
4046 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4047 if (!UserRecipe)
4048 continue;
4049 for (VPValue *V : UserRecipe->definedValues())
4050 if (Visited.insert(V).second)
4051 Worklist.push_back(V);
4052 }
4053 }
4054}
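// Editor's note: illustrative only, not part of LoopVectorize.cpp. For an
// integer add reduction, a scalar
//   %sum.next = add nsw i32 %sum, %x
// is widened to a vector add whose nsw proof no longer holds once the partial
// sums are distributed across lanes, so the walk above rewrites it to a plain
//   %sum.next = add <4 x i32> %sum, %x
// by dropping the poison-generating flags on every unrolled part.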
4055
4056void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4057 // The basic block and loop containing the predicated instruction.
4058 auto *PredBB = PredInst->getParent();
4059 auto *VectorLoop = LI->getLoopFor(PredBB);
4060
4061 // Initialize a worklist with the operands of the predicated instruction.
4062 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4063
4064 // Holds instructions that we need to analyze again. An instruction may be
4065 // reanalyzed if we don't yet know if we can sink it or not.
4066 SmallVector<Instruction *, 8> InstsToReanalyze;
4067
4068 // Returns true if a given use occurs in the predicated block. Phi nodes use
4069 // their operands in their corresponding predecessor blocks.
4070 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4071 auto *I = cast<Instruction>(U.getUser());
4072 BasicBlock *BB = I->getParent();
4073 if (auto *Phi = dyn_cast<PHINode>(I))
4074 BB = Phi->getIncomingBlock(
4075 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4076 return BB == PredBB;
4077 };
4078
4079 // Iteratively sink the scalarized operands of the predicated instruction
4080 // into the block we created for it. When an instruction is sunk, its
4081 // operands are then added to the worklist. The algorithm ends when one pass
4082 // through the worklist sinks no instruction.
4083 bool Changed;
4084 do {
4085 // Add the instructions that need to be reanalyzed to the worklist, and
4086 // reset the changed indicator.
4087 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4088 InstsToReanalyze.clear();
4089 Changed = false;
4090
4091 while (!Worklist.empty()) {
4092 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4093
4094 // We can't sink an instruction if it is a phi node, is not in the loop,
4095 // or may have side effects.
4096 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4097 I->mayHaveSideEffects())
4098 continue;
4099
4100 // If the instruction is already in PredBB, check if we can sink its
4101 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4102 // sinking the scalar instruction I, hence it appears in PredBB; but it
4103 // may have failed to sink I's operands (recursively), which we try
4104 // (again) here.
4105 if (I->getParent() == PredBB) {
4106 Worklist.insert(I->op_begin(), I->op_end());
4107 continue;
4108 }
4109
4110 // It's legal to sink the instruction if all its uses occur in the
4111 // predicated block. Otherwise, there's nothing to do yet, and we may
4112 // need to reanalyze the instruction.
4113 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4114 InstsToReanalyze.push_back(I);
4115 continue;
4116 }
4117
4118 // Move the instruction to the beginning of the predicated block, and add
4119 // its operands to the worklist.
4120 I->moveBefore(&*PredBB->getFirstInsertionPt());
4121 Worklist.insert(I->op_begin(), I->op_end());
4122
4123 // The sinking may have enabled other instructions to be sunk, so we will
4124 // need to iterate.
4125 Changed = true;
4126 }
4127 } while (Changed);
4128}
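// Editor's note: illustrative only; block and value names are hypothetical.
// If a predicated store ended up in its own block
//
//   pred.store.if:
//     %addr = getelementptr inbounds i32, ptr %p, i64 %iv   ; only use is the store
//     store i32 %v, ptr %addr
//
// then an address computation like %addr, whose uses all lie in the predicated
// block and which has no side effects, is exactly the kind of scalar operand
// the worklist above sinks into pred.store.if.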
4129
4130void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4131 VPTransformState &State) {
4132 auto Iter = depth_first(
4133 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4134 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4135 for (VPRecipeBase &P : VPBB->phis()) {
4136 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4137 if (!VPPhi)
4138 continue;
4139 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4140 // Make sure the builder has a valid insert point.
4141 Builder.SetInsertPoint(NewPhi);
4142 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4143 VPValue *Inc = VPPhi->getIncomingValue(i);
4144 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4145 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4146 }
4147 }
4148 }
4149}
4150
4151bool InnerLoopVectorizer::useOrderedReductions(
4152 const RecurrenceDescriptor &RdxDesc) {
4153 return Cost->useOrderedReductions(RdxDesc);
4154}
4155
4156void InnerLoopVectorizer::widenCallInstruction(
4157 CallInst &CI, VPValue *Def, VPUser &ArgOperands, VPTransformState &State,
4158 Intrinsic::ID VectorIntrinsicID) {
4159 assert(!isa<DbgInfoIntrinsic>(CI) &&
4160 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4161 State.setDebugLocFromInst(&CI);
4162
4163 SmallVector<Type *, 4> Tys;
4164 for (Value *ArgOperand : CI.args())
4165 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4166
4167 for (unsigned Part = 0; Part < UF; ++Part) {
4168 SmallVector<Type *, 2> TysForDecl = {CI.getType()};
4169 SmallVector<Value *, 4> Args;
4170 for (const auto &I : enumerate(ArgOperands.operands())) {
4171 // Some intrinsics have a scalar argument - don't replace it with a
4172 // vector.
4173 Value *Arg;
4174 if (!VectorIntrinsicID ||
4175 !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
4176 Arg = State.get(I.value(), Part);
4177 else
4178 Arg = State.get(I.value(), VPIteration(0, 0));
4179 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
4180 TysForDecl.push_back(Arg->getType());
4181 Args.push_back(Arg);
4182 }
4183
4184 Function *VectorF;
4185 if (VectorIntrinsicID) {
4186 // Use vector version of the intrinsic.
4187 if (VF.isVector())
4188 TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
4189 Module *M = State.Builder.GetInsertBlock()->getModule();
4190 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
4191 assert(VectorF && "Can't retrieve vector intrinsic.");
4192 } else {
4193 // Use vector version of the function call.
4194 const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
4195#ifndef NDEBUG
4196 assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
4197 "Can't create vector function.");
4198#endif
4199 VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
4200 }
4201 SmallVector<OperandBundleDef, 1> OpBundles;
4202 CI.getOperandBundlesAsDefs(OpBundles);
4203 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4204
4205 if (isa<FPMathOperator>(V))
4206 V->copyFastMathFlags(&CI);
4207
4208 State.set(Def, V, Part);
4209 State.addMetadata(V, &CI);
4210 }
4211}
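
A minimal illustrative sketch (not part of LoopVectorize.cpp): the kind of source loop whose call the routine above widens, assuming the target lowers std::sqrt to the llvm.sqrt vector intrinsic.

#include <cmath>

// Each scalar call to std::sqrt can be replaced by one call to a vector
// intrinsic (e.g. llvm.sqrt.v4f32) per unroll part, with the argument
// vectors taken from VPTransformState as shown above.
void sqrt_all(float *__restrict__ dst, const float *__restrict__ src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = std::sqrt(src[i]);
}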
4212
4213void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4214 // We should not collect Scalars more than once per VF. Right now, this
4215 // function is called from collectUniformsAndScalars(), which already does
4216 // this check. Collecting Scalars for VF=1 does not make any sense.
4217 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4218 "This function should not be visited twice for the same VF");
4219
4220 // This avoids any chances of creating a REPLICATE recipe during planning
4221 // since that would result in generation of scalarized code during execution,
4222 // which is not supported for scalable vectors.
4223 if (VF.isScalable()) {
4224 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4225 return;
4226 }
4227
4228 SmallSetVector<Instruction *, 8> Worklist;
4229
4230 // These sets are used to seed the analysis with pointers used by memory
4231 // accesses that will remain scalar.
4232 SmallSetVector<Instruction *, 8> ScalarPtrs;
4233 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4234 auto *Latch = TheLoop->getLoopLatch();
4235
4236 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4237 // The pointer operands of loads and stores will be scalar as long as the
4238 // memory access is not a gather or scatter operation. The value operand of a
4239 // store will remain scalar if the store is scalarized.
4240 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4241 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4242 assert(WideningDecision != CM_Unknown &&
4243 "Widening decision should be ready at this moment");
4244 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4245 if (Ptr == Store->getValueOperand())
4246 return WideningDecision == CM_Scalarize;
4247 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4248 "Ptr is neither a value or pointer operand");
4249 return WideningDecision != CM_GatherScatter;
4250 };
4251
4252 // A helper that returns true if the given value is a bitcast or
4253 // getelementptr instruction contained in the loop.
4254 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4255 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4256 isa<GetElementPtrInst>(V)) &&
4257 !TheLoop->isLoopInvariant(V);
4258 };
4259
4260 // A helper that evaluates a memory access's use of a pointer. If the use will
4261 // be a scalar use and the pointer is only used by memory accesses, we place
4262 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4263 // PossibleNonScalarPtrs.
4264 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4265 // We only care about bitcast and getelementptr instructions contained in
4266 // the loop.
4267 if (!isLoopVaryingBitCastOrGEP(Ptr))
4268 return;
4269
4270 // If the pointer has already been identified as scalar (e.g., if it was
4271 // also identified as uniform), there's nothing to do.
4272 auto *I = cast<Instruction>(Ptr);
4273 if (Worklist.count(I))
4274 return;
4275
4276 // If the use of the pointer will be a scalar use, and all users of the
4277 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4278 // place the pointer in PossibleNonScalarPtrs.
4279 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4280 return isa<LoadInst>(U) || isa<StoreInst>(U);
4281 }))
4282 ScalarPtrs.insert(I);
4283 else
4284 PossibleNonScalarPtrs.insert(I);
4285 };
4286
4287 // We seed the scalars analysis with three classes of instructions: (1)
4288 // instructions marked uniform-after-vectorization and (2) bitcast,
4289 // getelementptr and (pointer) phi instructions used by memory accesses
4290 // requiring a scalar use.
4291 //
4292 // (1) Add to the worklist all instructions that have been identified as
4293 // uniform-after-vectorization.
4294 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4295
4296 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4297 // memory accesses requiring a scalar use. The pointer operands of loads and
4298 // stores will be scalar as long as the memory accesses is not a gather or
4299 // scatter operation. The value operand of a store will remain scalar if the
4300 // store is scalarized.
4301 for (auto *BB : TheLoop->blocks())
4302 for (auto &I : *BB) {
4303 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4304 evaluatePtrUse(Load, Load->getPointerOperand());
4305 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4306 evaluatePtrUse(Store, Store->getPointerOperand());
4307 evaluatePtrUse(Store, Store->getValueOperand());
4308 }
4309 }
4310 for (auto *I : ScalarPtrs)
4311 if (!PossibleNonScalarPtrs.count(I)) {
4312 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4313 Worklist.insert(I);
4314 }
4315
4316 // Insert the forced scalars.
4317 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4318 // induction variable when the PHI user is scalarized.
4319 auto ForcedScalar = ForcedScalars.find(VF);
4320 if (ForcedScalar != ForcedScalars.end())
4321 for (auto *I : ForcedScalar->second) {
4322 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4323 Worklist.insert(I);
4324 }
4325
4326 // Expand the worklist by looking through any bitcasts and getelementptr
4327 // instructions we've already identified as scalar. This is similar to the
4328 // expansion step in collectLoopUniforms(); however, here we're only
4329 // expanding to include additional bitcasts and getelementptr instructions.
4330 unsigned Idx = 0;
4331 while (Idx != Worklist.size()) {
4332 Instruction *Dst = Worklist[Idx++];
4333 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4334 continue;
4335 auto *Src = cast<Instruction>(Dst->getOperand(0));
4336 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4337 auto *J = cast<Instruction>(U);
4338 return !TheLoop->contains(J) || Worklist.count(J) ||
4339 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4340 isScalarUse(J, Src));
4341 })) {
4342 Worklist.insert(Src);
4343 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4344 }
4345 }
4346
4347 // An induction variable will remain scalar if all users of the induction
4348 // variable and induction variable update remain scalar.
4349 for (const auto &Induction : Legal->getInductionVars()) {
4350 auto *Ind = Induction.first;
4351 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4352
4353 // If tail-folding is applied, the primary induction variable will be used
4354 // to feed a vector compare.
4355 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4356 continue;
4357
4358 // Returns true if \p Indvar is a pointer induction that is used directly by
4359 // load/store instruction \p I.
4360 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4361 Instruction *I) {
4362 return Induction.second.getKind() ==
4363 InductionDescriptor::IK_PtrInduction &&
4364 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4365 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4366 };
4367
4368 // Determine if all users of the induction variable are scalar after
4369 // vectorization.
4370 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4371 auto *I = cast<Instruction>(U);
4372 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4373 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4374 });
4375 if (!ScalarInd)
4376 continue;
4377
4378 // Determine if all users of the induction variable update instruction are
4379 // scalar after vectorization.
4380 auto ScalarIndUpdate =
4381 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4382 auto *I = cast<Instruction>(U);
4383 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4384 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4385 });
4386 if (!ScalarIndUpdate)
4387 continue;
4388
4389 // The induction variable and its update instruction will remain scalar.
4390 Worklist.insert(Ind);
4391 Worklist.insert(IndUpdate);
4392 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4393 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4394 << "\n");
4395 }
4396
4397 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4398}
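
A minimal illustrative sketch (not part of LoopVectorize.cpp): a loop shape that exercises the scalars analysis above; the names are hypothetical.

// The getelementptr computing &a[i] feeds a consecutive store and is a
// candidate to remain scalar, while b[idx[i]] may become a gather, which
// needs a vector of pointers and so keeps its address computation vectorized.
void gather_copy(float *__restrict__ a, const float *__restrict__ b,
                 const int *__restrict__ idx, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[idx[i]];
}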
4399
4400bool LoopVectorizationCostModel::isScalarWithPredication(
4401 Instruction *I, ElementCount VF) const {
4402 if (!isPredicatedInst(I))
4403 return false;
4404
4405 // Do we have a non-scalar lowering for this predicated
4406 // instruction? No - it is scalar with predication.
4407 switch(I->getOpcode()) {
4408 default:
4409 return true;
4410 case Instruction::Load:
4411 case Instruction::Store: {
4412 auto *Ptr = getLoadStorePointerOperand(I);
4413 auto *Ty = getLoadStoreType(I);
4414 Type *VTy = Ty;
4415 if (VF.isVector())
4416 VTy = VectorType::get(Ty, VF);
4417 const Align Alignment = getLoadStoreAlignment(I);
4418 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4419 TTI.isLegalMaskedGather(VTy, Alignment))
4420 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4421 TTI.isLegalMaskedScatter(VTy, Alignment));
4422 }
4423 case Instruction::UDiv:
4424 case Instruction::SDiv:
4425 case Instruction::SRem:
4426 case Instruction::URem:
4427 // We have the option to use the safe-divisor idiom to avoid predication.
4428 // At the moment this is only used for scalable (which legally can't
4429 // scalarize), but long term we want to make a cost based decision
4430 // for fixed length vectors as well.
4431 return !VF.isScalable();
4432 }
4433}
4434
4435bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4436 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4437 return false;
4438
4439 // Can we prove this instruction is safe to unconditionally execute?
4440 // If not, we must use some form of predication.
4441 switch(I->getOpcode()) {
4442 default:
4443 return false;
4444 case Instruction::Load:
4445 case Instruction::Store: {
4446 if (!Legal->isMaskRequired(I))
4447 return false;
4448 // When we know the load's address is loop invariant and the instruction
4449 // in the original scalar loop was unconditionally executed then we
4450 // don't need to mark it as a predicated instruction. Tail folding may
4451 // introduce additional predication, but we're guaranteed to always have
4452 // at least one active lane. We call Legal->blockNeedsPredication here
4453 // because it doesn't query tail-folding. For stores, we must prove both
4454 // speculation safety (which follows from the same argument as loads) and
4455 // that the value being stored is correct. The easiest form of the latter
4456 // is to require that all values stored are the same.
4457 if (Legal->isUniformMemOp(*I) &&
4458 (isa<LoadInst>(I) ||
4459 (isa<StoreInst>(I) &&
4460 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4461 !Legal->blockNeedsPredication(I->getParent()))
4462 return false;
4463 return true;
4464 }
4465 case Instruction::UDiv:
4466 case Instruction::SDiv:
4467 case Instruction::SRem:
4468 case Instruction::URem:
4469 // TODO: We can use the loop-preheader as context point here and get
4470 // context sensitive reasoning
4471 return !isSafeToSpeculativelyExecute(I);
4472 }
4473}
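
A minimal illustrative sketch (not part of LoopVectorize.cpp): a guarded division of the kind isPredicatedInst() flags; the function name is hypothetical.

// The udiv only executes when b[i] != 0 in the scalar loop; speculating it
// for masked-off lanes could divide by zero, so it must stay predicated (or
// use the safe-divisor idiom mentioned above) when vectorized.
void guarded_div(unsigned *__restrict__ out, const unsigned *__restrict__ a,
                 const unsigned *__restrict__ b, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = b[i] != 0 ? a[i] / b[i] : 0u;
}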
4474
4475bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4476 Instruction *I, ElementCount VF) {
4477 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4478 assert(getWideningDecision(I, VF) == CM_Unknown &&
4479 "Decision should not be set yet.");
4480 auto *Group = getInterleavedAccessGroup(I);
4481 assert(Group && "Must have a group.");
4482
4483 // If the instruction's allocated size doesn't equal its type size, it
4484 // requires padding and will be scalarized.
4485 auto &DL = I->getModule()->getDataLayout();
4486 auto *ScalarTy = getLoadStoreType(I);
4487 if (hasIrregularType(ScalarTy, DL))
4488 return false;
4489
4490 // If the group involves a non-integral pointer, we may not be able to
4491 // losslessly cast all values to a common type.
4492 unsigned InterleaveFactor = Group->getFactor();
4493 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4494 for (unsigned i = 0; i < InterleaveFactor; i++) {
4495 Instruction *Member = Group->getMember(i);
4496 if (!Member)
4497 continue;
4498 auto *MemberTy = getLoadStoreType(Member);
4499 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4500 // Don't coerce non-integral pointers to integers or vice versa.
4501 if (MemberNI != ScalarNI) {
4502 // TODO: Consider adding special nullptr value case here
4503 return false;
4504 } else if (MemberNI && ScalarNI &&
4505 ScalarTy->getPointerAddressSpace() !=
4506 MemberTy->getPointerAddressSpace()) {
4507 return false;
4508 }
4509 }
4510
4511 // Check if masking is required.
4512 // A Group may need masking for one of two reasons: it resides in a block that
4513 // needs predication, or it was decided to use masking to deal with gaps
4514 // (either a gap at the end of a load-access that may result in a speculative
4515 // load, or any gaps in a store-access).
4516 bool PredicatedAccessRequiresMasking =
4517 blockNeedsPredicationForAnyReason(I->getParent()) &&
4518 Legal->isMaskRequired(I);
4519 bool LoadAccessWithGapsRequiresEpilogMasking =
4520 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4521 !isScalarEpilogueAllowed();
4522 bool StoreAccessWithGapsRequiresMasking =
4523 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4524 if (!PredicatedAccessRequiresMasking &&
4525 !LoadAccessWithGapsRequiresEpilogMasking &&
4526 !StoreAccessWithGapsRequiresMasking)
4527 return true;
4528
4529 // If masked interleaving is required, we expect that the user/target had
4530 // enabled it, because otherwise it either wouldn't have been created or
4531 // it should have been invalidated by the CostModel.
4532 assert(useMaskedInterleavedAccesses(TTI) &&
4533 "Masked interleave-groups for predicated accesses are not enabled.");
4534
4535 if (Group->isReverse())
4536 return false;
4537
4538 auto *Ty = getLoadStoreType(I);
4539 const Align Alignment = getLoadStoreAlignment(I);
4540 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4541 : TTI.isLegalMaskedStore(Ty, Alignment);
4542}
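
A minimal illustrative sketch (not part of LoopVectorize.cpp): an interleave group with a gap, which may hit the masking checks above; the struct layout and names are hypothetical.

struct Pixel { float r, g, b; };

// Loads of p[i].r and p[i].g can form an interleave group of factor 3 with a
// gap (the b member is never read); a load group with a gap at the end may
// need epilogue masking, as tested by LoadAccessWithGapsRequiresEpilogMasking.
float sum_rg(const Pixel *__restrict__ p, int n) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s += p[i].r + p[i].g;
  return s;
}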
4543
4544bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4545 Instruction *I, ElementCount VF) {
4546 // Get and ensure we have a valid memory instruction.
4547 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4548
4549 auto *Ptr = getLoadStorePointerOperand(I);
4550 auto *ScalarTy = getLoadStoreType(I);
4551
4552 // In order to be widened, the pointer should be consecutive, first of all.
4553 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4554 return false;
4555
4556 // If the instruction is a store located in a predicated block, it will be
4557 // scalarized.
4558 if (isScalarWithPredication(I, VF))
4559 return false;
4560
4561 // If the instruction's allocated size doesn't equal its type size, it
4562 // requires padding and will be scalarized.
4563 auto &DL = I->getModule()->getDataLayout();
4564 if (hasIrregularType(ScalarTy, DL))
4565 return false;
4566
4567 return true;
4568}
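
A minimal illustrative sketch (not part of LoopVectorize.cpp): consecutive versus strided accesses as distinguished by the check above; the names are hypothetical.

// a[i] is consecutive and passes Legal->isConsecutivePtr(), so the store can
// be widened into one vector store per part; b[2 * i] is strided, fails that
// check, and is handled as an interleaved or gathered access instead.
void copy_even(float *__restrict__ a, const float *__restrict__ b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[2 * i];
}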
4569
4570void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4571 // We should not collect Uniforms more than once per VF. Right now,
4572 // this function is called from collectUniformsAndScalars(), which
4573 // already does this check. Collecting Uniforms for VF=1 does not make any
4574 // sense.
4575
4576 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4577 "This function should not be visited twice for the same VF");
4578
4579 // Visit the list of Uniforms. If we don't find any uniform value, we won't
4580 // analyze again; Uniforms.count(VF) will still return 1.
4581 Uniforms[VF].clear();
4582
4583 // We now know that the loop is vectorizable!
4584 // Collect instructions inside the loop that will remain uniform after
4585 // vectorization.
4586
4587 // Global values, params and instructions outside of current loop are out of
4588 // scope.
4589 auto isOutOfScope = [&](Value *V) -> bool {
4590 Instruction *I = dyn_cast<Instruction>(V);
4591 return (!I || !TheLoop->contains(I));
4592 };
4593
4594 // Worklist containing uniform instructions demanding lane 0.
4595 SetVector<Instruction *> Worklist;
4596 BasicBlock *Latch = TheLoop->getLoopLatch();
4597
4598 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4599 // that are scalar with predication must not be considered uniform after
4600 // vectorization, because that would create an erroneous replicating region
4601 // where only a single instance out of VF should be formed.
4602 // TODO: optimize such seldom cases if found important, see PR40816.
4603 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4604 if (isOutOfScope(I)) {
4605 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4606 << *I << "\n");
4607 return;
4608 }
4609 if (isScalarWithPredication(I, VF)) {
4610 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4611 << *I << "\n");
4612 return;
4613 }
4614 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4615 Worklist.insert(I);
4616 };
4617
4618 // Start with the conditional branch. If the branch condition is an
4619 // instruction contained in the loop that is only used by the branch, it is
4620 // uniform.
4621 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4622 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4623 addToWorklistIfAllowed(Cmp);
4624
4625 // Return true if all lanes perform the same memory operation, and we can
4626 // thus choose to execute only one.
4627 auto isUniformMemOpUse = [&](Instruction *I) {
4628 if (!Legal->isUniformMemOp(*I))
4629 return false;
4630 if (isa<LoadInst>(I))
4631 // Loading the same address always produces the same result - at least
4632 // assuming aliasing and ordering which have already been checked.
4633 return true;
4634 // Storing the same value on every iteration.
4635 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4636 };
4637
4638 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4639 InstWidening WideningDecision = getWideningDecision(I, VF);
4640 assert(WideningDecision != CM_Unknown &&
4641 "Widening decision should be ready at this moment");
4642
4643 if (isUniformMemOpUse(I))
4644 return true;
4645
4646 return (WideningDecision == CM_Widen ||
4647 WideningDecision == CM_Widen_Reverse ||
4648 WideningDecision == CM_Interleave);
4649 };
4650
4651
4652 // Returns true if Ptr is the pointer operand of a memory access instruction
4653 // I, and I is known to not require scalarization.
4654 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4655 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4656 };
4657
4658 // Holds a list of values which are known to have at least one uniform use.
4659 // Note that there may be other uses which aren't uniform. A "uniform use"
4660 // here is something which only demands lane 0 of the unrolled iterations;
4661 // it does not imply that all lanes produce the same value (e.g. this is not
4662 // the usual meaning of uniform)
4663 SetVector<Value *> HasUniformUse;
4664
4665 // Scan the loop for instructions which are either a) known to have only
4666 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4667 for (auto *BB : TheLoop->blocks())
4668 for (auto &I : *BB) {
4669 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4670 switch (II->getIntrinsicID()) {
4671 case Intrinsic::sideeffect:
4672 case Intrinsic::experimental_noalias_scope_decl:
4673 case Intrinsic::assume:
4674 case Intrinsic::lifetime_start:
4675 case Intrinsic::lifetime_end:
4676 if (TheLoop->hasLoopInvariantOperands(&I))
4677 addToWorklistIfAllowed(&I);
4678 break;
4679 default:
4680 break;
4681 }
4682 }
4683
4684 // ExtractValue instructions must be uniform, because the operands are
4685 // known to be loop-invariant.
4686 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4687 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4688 "Expected aggregate value to be loop invariant");
4689 addToWorklistIfAllowed(EVI);
4690 continue;
4691 }
4692
4693 // If there's no pointer operand, there's nothing to do.
4694 auto *Ptr = getLoadStorePointerOperand(&I);
4695 if (!Ptr)
4696 continue;
4697
4698 if (isUniformMemOpUse(&I))
4699 addToWorklistIfAllowed(&I);
4700
4701 if (isUniformDecision(&I, VF)) {
4702 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4703 HasUniformUse.insert(Ptr);
4704 }
4705 }
4706
4707 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4708 // demanding) users. Since loops are assumed to be in LCSSA form, this
4709 // disallows uses outside the loop as well.
4710 for (auto *V : HasUniformUse) {
4711 if (isOutOfScope(V))
4712 continue;
4713 auto *I = cast<Instruction>(V);
4714 auto UsersAreMemAccesses =
4715 llvm::all_of(I->users(), [&](User *U) -> bool {
4716 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4717 });
4718 if (UsersAreMemAccesses)
4719 addToWorklistIfAllowed(I);
4720 }
4721
4722 // Expand Worklist in topological order: whenever a new instruction
4723 // is added, its users should already be inside Worklist. This ensures
4724 // a uniform instruction will only be used by uniform instructions.
4725 unsigned idx = 0;
4726 while (idx != Worklist.size()) {
4727 Instruction *I = Worklist[idx++];
4728
4729 for (auto *OV : I->operand_values()) {
4730 // isOutOfScope operands cannot be uniform instructions.
4731 if (isOutOfScope(OV))
4732 continue;
4733 // First order recurrence Phi's should typically be considered
4734 // non-uniform.
4735 auto *OP = dyn_cast<PHINode>(OV);
4736 if (OP && Legal->isFixedOrderRecurrence(OP))
4737 continue;
4738 // If all the users of the operand are uniform, then add the
4739 // operand into the uniform worklist.
4740 auto *OI = cast<Instruction>(OV);
4741 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4742 auto *J = cast<Instruction>(U);
4743 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4744 }))
4745 addToWorklistIfAllowed(OI);
4746 }
4747 }
4748
4749 // For an instruction to be added into Worklist above, all its users inside
4750 // the loop should also be in Worklist. However, this condition cannot be
4751 // true for phi nodes that form a cyclic dependence. We must process phi
4752 // nodes separately. An induction variable will remain uniform if all users
4753 // of the induction variable and induction variable update remain uniform.
4754 // The code below handles both pointer and non-pointer induction variables.
4755 for (const auto &Induction : Legal->getInductionVars()) {
4756 auto *Ind = Induction.first;
4757 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4758
4759 // Determine if all users of the induction variable are uniform after
4760 // vectorization.
4761 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4762 auto *I = cast<Instruction>(U);
4763 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4764 isVectorizedMemAccessUse(I, Ind);
4765 });
4766 if (!UniformInd)
4767 continue;
4768
4769 // Determine if all users of the induction variable update instruction are
4770 // uniform after vectorization.
4771 auto UniformIndUpdate =
4772 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4773 auto *I = cast<Instruction>(U);
4774 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4775 isVectorizedMemAccessUse(I, IndUpdate);
4776 });
4777 if (!UniformIndUpdate)
4778 continue;
4779
4780 // The induction variable and its update instruction will remain uniform.
4781 addToWorklistIfAllowed(Ind);
4782 addToWorklistIfAllowed(IndUpdate);
4783 }
4784
4785 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4786}
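
A minimal illustrative sketch (not part of LoopVectorize.cpp): a uniform memory operation of the kind isUniformMemOpUse() accepts; the names are hypothetical.

// Every lane reads the same address *scale, so the load only demands lane 0
// and both the address and the load can be treated as uniform after
// vectorization (one scalar load per unroll part, then a broadcast).
void scale_all(float *__restrict__ a, const float *__restrict__ scale, int n) {
  for (int i = 0; i < n; ++i)
    a[i] *= *scale;
}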
4787
4788bool LoopVectorizationCostModel::runtimeChecksRequired() {
4789 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4790
4791 if (Legal->getRuntimePointerChecking()->Need) {
4792 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4793 "runtime pointer checks needed. Enable vectorization of this "
4794 "loop with '#pragma clang loop vectorize(enable)' when "
4795 "compiling with -Os/-Oz",
4796 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4797 return true;
4798 }
4799
4800 if (!PSE.getPredicate().isAlwaysTrue()) {
4801 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4802 "runtime SCEV checks needed. Enable vectorization of this "
4803 "loop with '#pragma clang loop vectorize(enable)' when "
4804 "compiling with -Os/-Oz",
4805 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4806 return true;
4807 }
4808
4809 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4810 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4811 reportVectorizationFailure("Runtime stride check for small trip count",
4812 "runtime stride == 1 checks needed. Enable vectorization of "
4813 "this loop without such check by compiling with -Os/-Oz",
4814 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4815 return true;
4816 }
4817
4818 return false;
4819}
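
A minimal illustrative sketch (not part of LoopVectorize.cpp): a loop that needs the runtime pointer checks rejected above when optimizing for size; the names are hypothetical.

// Without restrict qualifiers, dst and src may alias, so vectorization must
// emit runtime pointer checks; under -Os/-Oz, runtimeChecksRequired() then
// blocks vectorization unless the user opts in with
// '#pragma clang loop vectorize(enable)'.
void add_arrays(float *dst, const float *src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] += src[i];
}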
4820
4821ElementCount
4822LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4823 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4824 return ElementCount::getScalable(0);
4825
4826 if (Hints->isScalableVectorizationDisabled()) {
4827 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4828 "ScalableVectorizationDisabled", ORE, TheLoop);
4829 return ElementCount::getScalable(0);
4830 }
4831
4832 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4833
4834 auto MaxScalableVF = ElementCount::getScalable(
4835 std::numeric_limits<ElementCount::ScalarTy>::max());
4836
4837 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4838 // FIXME: While for scalable vectors this is currently sufficient, this should
4839 // be replaced by a more detailed mechanism that filters out specific VFs,
4840 // instead of invalidating vectorization for a whole set of VFs based on the
4841 // MaxVF.
4842
4843 // Disable scalable vectorization if the loop contains unsupported reductions.
4844 if (!canVectorizeReductions(MaxScalableVF)) {
4845 reportVectorizationInfo(
4846 "Scalable vectorization not supported for the reduction "
4847 "operations found in this loop.",
4848 "ScalableVFUnfeasible", ORE, TheLoop);
4849 return ElementCount::getScalable(0);
4850 }
4851
4852 // Disable scalable vectorization if the loop contains any instructions
4853 // with element types not supported for scalable vectors.
4854 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4855 return !Ty->isVoidTy() &&
4856 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4857 })) {
4858 reportVectorizationInfo("Scalable vectorization is not supported "
4859 "for all element types found in this loop.",
4860 "ScalableVFUnfeasible", ORE, TheLoop);
4861 return ElementCount::getScalable(0);
4862 }
4863
4864 if (Legal->isSafeForAnyVectorWidth())
4865 return MaxScalableVF;
4866
4867 // Limit MaxScalableVF by the maximum safe dependence distance.
4868 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4869 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4870 MaxVScale =
4871 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4872 MaxScalableVF = ElementCount::getScalable(
4873 MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
4874 if (!MaxScalableVF)
4875 reportVectorizationInfo(
4876 "Max legal vector width too small, scalable vectorization "
4877 "unfeasible.",
4878 "ScalableVFUnfeasible", ORE, TheLoop);
4879
4880 return MaxScalableVF;
4881}
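
A worked example with hypothetical numbers, mirroring the clamp above (the helper is not part of LoopVectorize.cpp).

// With MaxSafeElements = 32 (from the dependence distance) and a target
// MaxVScale of 16, the clamp yields 32 / 16 = 2, i.e. a maximum legal
// scalable VF of "vscale x 2"; with no known vscale bound it degrades to 0.
unsigned maxScalableMinLanes(unsigned MaxSafeElements, unsigned MaxVScale) {
  return MaxVScale ? MaxSafeElements / MaxVScale : 0;
}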
4882
4883FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4884 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4885 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4886 unsigned SmallestType, WidestType;
4887 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4888
4889 // Get the maximum safe dependence distance in bits computed by LAA.
4890 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4891 // the memory accesses that is most restrictive (involved in the smallest
4892 // dependence distance).
4893 unsigned MaxSafeElements =
4894 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4895
4896 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4897 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4898
4899 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4900 << ".\n");
4901 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4902 << ".\n");
4903
4904 // First analyze the UserVF, fall back if the UserVF should be ignored.
4905 if (UserVF) {
4906 auto MaxSafeUserVF =
4907 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4908
4909 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4910 // If `VF=vscale x N` is safe, then so is `VF=N`
4911 if (UserVF.isScalable())
4912 return FixedScalableVFPair(
4913 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4914 else
4915 return UserVF;
4916 }
4917
4918 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4919
4920 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4921 // is better to ignore the hint and let the compiler choose a suitable VF.
4922 if (!UserVF.isScalable()) {
4923 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4924 << " is unsafe, clamping to max safe VF="
4925 << MaxSafeFixedVF << ".\n");
4926 ORE->emit([&]() {
4927 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4928 TheLoop->getStartLoc(),
4929 TheLoop->getHeader())
4930 << "User-specified vectorization factor "
4931 << ore::NV("UserVectorizationFactor", UserVF)
4932 << " is unsafe, clamping to maximum safe vectorization factor "
4933 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4934 });
4935 return MaxSafeFixedVF;
4936 }
4937
4938 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4939 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4940 << " is ignored because scalable vectors are not "
4941 "available.\n");
4942 ORE->emit([&]() {
4943 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4944 TheLoop->getStartLoc(),
4945 TheLoop->getHeader())
4946 << "User-specified vectorization factor "
4947 << ore::NV("UserVectorizationFactor", UserVF)
4948 << " is ignored because the target does not support scalable "
4949 "vectors. The compiler will pick a more suitable value.";
4950 });
4951 } else {
4952 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4953 << " is unsafe. Ignoring scalable UserVF.\n");
4954 ORE->emit([&]() {
4955 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4956 TheLoop->getStartLoc(),
4957 TheLoop->getHeader())
4958 << "User-specified vectorization factor "
4959 << ore::NV("UserVectorizationFactor", UserVF)
4960 << " is unsafe. Ignoring the hint to let the compiler pick a "
4961 "more suitable value.";
4962 });
4963 }
4964 }
4965
4966 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4967 << " / " << WidestType << " bits.\n");
4968
4969 FixedScalableVFPair Result(ElementCount::getFixed(1),
4970 ElementCount::getScalable(0));
4971 if (auto MaxVF =
4972 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4973 MaxSafeFixedVF, FoldTailByMasking))
4974 Result.FixedVF = MaxVF;
4975
4976 if (auto MaxVF =
4977 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4978 MaxSafeScalableVF, FoldTailByMasking))
4979 if (MaxVF.isScalable()) {
4980 Result.ScalableVF = MaxVF;
4981 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4982 << "\n");
4983 }
4984
4985 return Result;
4986}
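
A worked example with hypothetical numbers for the MaxSafeElements computation above (the helper is not part of LoopVectorize.cpp).

// If LAA reports a max safe vector width of 256 bits and the widest type in
// the loop is 32 bits, MaxSafeElements becomes PowerOf2Floor(256 / 32) == 8,
// i.e. a max safe fixed VF of 8 elements.
unsigned maxSafeElements(unsigned MaxSafeBits, unsigned WidestTypeBits) {
  unsigned Elements = MaxSafeBits / WidestTypeBits;
  // Round down to a power of two, mirroring PowerOf2Floor.
  while (Elements & (Elements - 1))
    Elements &= Elements - 1;
  return Elements;
}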
4987
4988FixedScalableVFPair
4989LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4990 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4991 // TODO: It may be useful to do so, since the check is still likely to be
4992 // dynamically uniform if the target can skip it.
4993 reportVectorizationFailure(
4994 "Not inserting runtime ptr check for divergent target",
4995 "runtime pointer checks needed. Not enabled for divergent target",
4996 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4997 return FixedScalableVFPair::getNone();
4998 }
4999
5000 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5001 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5002 if (TC == 1) {
5003 reportVectorizationFailure("Single iteration (non) loop",
5004 "loop trip count is one, irrelevant for vectorization",
5005 "SingleIterationLoop", ORE, TheLoop);
5006 return FixedScalableVFPair::getNone();
5007 }
5008
5009 switch (ScalarEpilogueStatus) {
5010 case CM_ScalarEpilogueAllowed:
5011 return computeFeasibleMaxVF(TC, UserVF, false);
5012 case CM_ScalarEpilogueNotAllowedUsePredicate:
5013 [[fallthrough]];
5014 case CM_ScalarEpilogueNotNeededUsePredicate:
5015 LLVM_DEBUG(
5016 dbgs() << "LV: vector predicate hint/switch found.\n"
5017 << "LV: Not allowing scalar epilogue, creating predicated "
5018 << "vector loop.\n");
5019 break;
5020 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5021 // fallthrough as a special case of OptForSize
5022 case CM_ScalarEpilogueNotAllowedOptSize:
5023 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5024 LLVM_DEBUG(
5025 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5026 else
5027 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5028 << "count.\n");
5029
5030 // Bail if runtime checks are required, which are not good when optimising
5031 // for size.
5032 if (runtimeChecksRequired())
5033 return FixedScalableVFPair::getNone();
5034
5035 break;
5036 }
5037
5038 // The only loops we can vectorize without a scalar epilogue, are loops with
5039 // a bottom-test and a single exiting block. We'd have to handle the fact
5040 // that not every instruction executes on the last iteration. This will
5041 // require a lane mask which varies through the vector loop body. (TODO)
5042 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5043 // If there was a tail-folding hint/switch, but we can't fold the tail by
5044 // masking, fall back to vectorization with a scalar epilogue.
5045 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5046 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5047 "scalar epilogue instead.\n");
5048 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5049 return computeFeasibleMaxVF(TC, UserVF, false);
5050 }
5051 return FixedScalableVFPair::getNone();
5052 }
5053
5054 // Now try the tail folding
5055
5056 // Invalidate interleave groups that require an epilogue if we can't mask
5057 // the interleave-group.
5058 if (!useMaskedInterleavedAccesses(TTI)) {
5059 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5060 "No decisions should have been taken at this point");
5061 // Note: There is no need to invalidate any cost modeling decisions here, as
5062 // none were taken so far.
5063 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5064 }
5065
5066 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5067 // Avoid tail folding if the trip count is known to be a multiple of any VF
5068 // we chose.
5069 // FIXME: The condition below pessimises the case for fixed-width vectors,
5070 // when scalable VFs are also candidates for vectorization.
5071 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5072 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5073 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5074 "MaxFixedVF must be a power of 2");
5075 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5076 : MaxFixedVF.getFixedValue();
5077 ScalarEvolution *SE = PSE.getSE();
5078 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5079 const SCEV *ExitCount = SE->getAddExpr(
5080 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5081 const SCEV *Rem = SE->getURemExpr(
5082 SE->applyLoopGuards(ExitCount, TheLoop),
5083 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5084 if (Rem->isZero()) {
5085 // Accept MaxFixedVF if we do not have a tail.
5086 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5087 return MaxFactors;
5088 }
5089 }
5090
5091 // If we don't know the precise trip count, or if the trip count that we
5092 // found modulo the vectorization factor is not zero, try to fold the tail
5093 // by masking.
5094 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5095 if (Legal->prepareToFoldTailByMasking()) {
5096 FoldTailByMasking = true;
5097 return MaxFactors;
5098 }
5099
5100 // If there was a tail-folding hint/switch, but we can't fold the tail by
5101 // masking, fall back to vectorization with a scalar epilogue.
5102 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5103 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5104 "scalar epilogue instead.\n");
5105 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5106 return MaxFactors;
5107 }
5108
5109 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5110 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5111 return FixedScalableVFPair::getNone();
5112 }
5113
5114 if (TC == 0) {
5115 reportVectorizationFailure(
5116 "Unable to calculate the loop count due to complex control flow",
5117 "unable to calculate the loop count due to complex control flow",
5118 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5119 return FixedScalableVFPair::getNone();
5120 }
5121
5122 reportVectorizationFailure(
5123 "Cannot optimize for size and vectorize at the same time.",
5124 "cannot optimize for size and vectorize at the same time. "
5125 "Enable vectorization of this loop with '#pragma clang loop "
5126 "vectorize(enable)' when compiling with -Os/-Oz",
5127 "NoTailLoopWithOptForSize", ORE, TheLoop);
5128 return FixedScalableVFPair::getNone();
5129}
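
A worked example with hypothetical numbers for the no-tail check above (the helper is not part of LoopVectorize.cpp).

// With TC = 128, MaxFixedVF = 8 and UserIC = 2, MaxVFtimesIC = 16 and
// 128 % 16 == 0, so no tail remains and MaxFactors is returned directly;
// with TC = 100 the remainder is non-zero and the code falls through to the
// tail-folding / scalar-epilogue decisions.
bool noTailRemains(unsigned TC, unsigned MaxFixedVF, unsigned UserIC) {
  unsigned MaxVFtimesIC = UserIC ? MaxFixedVF * UserIC : MaxFixedVF;
  return TC % MaxVFtimesIC == 0;
}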
5130
5131ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5132 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5133 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5134 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5135 TypeSize WidestRegister = TTI.getRegisterBitWidth(
5136 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5137 : TargetTransformInfo::RGK_FixedWidthVector);
5138
5139 // Convenience function to return the minimum of two ElementCounts.
5140 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5141 assert((LHS.isScalable() == RHS.isScalable()) &&
5142 "Scalable flags must match");
5143 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5144 };
5145
5146 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5147 // Note that both WidestRegister and WidestType may not be powers of 2.
5148 auto MaxVectorElementCount = ElementCount::get(
5149 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5150 ComputeScalableMaxVF);
5151 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5152 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5153 << (MaxVectorElementCount * WidestType) << " bits.\n");
5154
5155 if (!MaxVectorElementCount) {
5156 LLVM_DEBUG(dbgs() << "LV: The target has no "
5157 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5158 << " vector registers.\n");
5159 return ElementCount::getFixed(1);
5160 }
5161
5162 const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5163 if (ConstTripCount &&
5164 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5165 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5166 // If the loop trip count (TC) is known at compile time, there is no point
5167 // in choosing a VF greater than TC (as done in the loop below). Select the
5168 // maximum power of two which doesn't exceed TC.
5169 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5170 // when the TC is less than or equal to the known number of lanes.
5171 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5172 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5173 "exceeding the constant trip count: "
5174 << ClampedConstTripCount << "\n");
5175 return ElementCount::getFixed(ClampedConstTripCount);
5176 }
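// Illustrative aside (not part of the original LoopVectorize.cpp source):
// example of the clamping above, assuming ConstTripCount == 20, a fixed
// MaxVectorElementCount of 32, and no tail folding: PowerOf2Floor(20) == 16,
// so the function returns ElementCount::getFixed(16) rather than a VF larger
// than the trip count.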
5177
5178 TargetTransformInfo::RegisterKind RegKind =
5179 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5180 : TargetTransformInfo::RGK_FixedWidthVector;
5181 ElementCount MaxVF = MaxVectorElementCount;
5182 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5183 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5184 auto MaxVectorElementCountMaxBW = ElementCount::get(
5185 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5186 ComputeScalableMaxVF);
5187 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5188
5189 // Collect all viable vectorization factors larger than the default MaxVF
5190 // (i.e. MaxVectorElementCount).
5191 SmallVector<ElementCount, 8> VFs;
5192 for (ElementCount VS = MaxVectorElementCount * 2;
5193 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5194 VFs.push_back(VS);
5195
5196 // For each VF calculate its register usage.
5197 auto RUs = calculateRegisterUsage(VFs);
5198
5199 // Select the largest VF which doesn't require more registers than existing
5200 // ones.
5201 for (int i = RUs.size() - 1; i >= 0; --i) {
5202 bool Selected = true;
5203 for (auto &pair : RUs[i].MaxLocalUsers) {
5204 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5205 if (pair.second > TargetNumRegisters)
5206 Selected = false;
5207 }
5208 if (Selected) {
5209 MaxVF = VFs[i];
5210 break;
5211 }
5212 }
5213 if (ElementCount MinVF =
5214 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5215 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5216 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5217 << ") with target's minimum: " << MinVF << '\n');
5218 MaxVF = MinVF;
5219 }
5220 }
5221
5222 // Invalidate any widening decisions we might have made, in case the loop
5223 // requires predication (decided later), but we have already made some
5224 // load/store widening decisions.
5225 invalidateCostModelingDecisions();
5226 }
5227 return MaxVF;
5228}
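// Illustrative aside (not part of the original LoopVectorize.cpp source):
// sketch of the maximize-bandwidth path above, assuming a 512-bit fixed-width
// register, WidestType == 32, SmallestType == 8, and no clamping by MaxSafeVF:
//   MaxVectorElementCount      = PowerOf2Floor(512 / 32) = 16 lanes
//   MaxVectorElementCountMaxBW = PowerOf2Floor(512 /  8) = 64 lanes
//   VFs considered             = {32, 64}  (doubling from 16 * 2 up to 64)
// The largest candidate whose register usage (per calculateRegisterUsage)
// fits the target's register file becomes MaxVF.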
5229
5230Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5231 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5232 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5233 auto Min = Attr.getVScaleRangeMin();
5234 auto Max = Attr.getVScaleRangeMax();
5235 if (Max && Min == Max)
5236 return Max;
5237 }
5238
5239 return TTI.getVScaleForTuning();
5240}
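// Illustrative aside (not part of the original LoopVectorize.cpp source): a
// function carrying the IR attribute vscale_range(2,2) has Min == Max == 2,
// so the method above returns 2; with vscale_range(1,16) (Min != Max) it
// falls through to TTI.getVScaleForTuning().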
5241
5242bool LoopVectorizationCostModel::isMoreProfitable(
5243 const VectorizationFactor &A, const VectorizationFactor &B) const {
5244 InstructionCost CostA = A.Cost;
5245 InstructionCost CostB = B.Cost;
5246
5247 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5248
5249 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5250 MaxTripCount) {
5251 // If we are folding the tail and the trip count is a known (possibly small)
5252 // constant, the trip count will be rounded up to an integer number of
5253 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5254 // which we compare directly. When not folding the tail, the total cost will
5255 // be PerIterationCost*floor(TC/VF) + the scalar remainder cost, which is
5256 // instead approximated with the per-lane cost below rather than using the
5257 // trip count as done here.
5258 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5259 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5260 return RTCostA < RTCostB;
5261 }
5262
5263 // Improve estimate for the vector width if it is scalable.
5264 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5265 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5266 if (Optional<unsigned> VScale = getVScaleForTuning()) {
5267 if (A.Width.isScalable())
5268 EstimatedWidthA *= VScale.value();
5269 if (B.Width.isScalable())
5270 EstimatedWidthB *= VScale.value();
5271 }
5272
5273 // Assume vscale may be larger than 1 (or the value being tuned for),
5274 // so that scalable vectorization is slightly favorable over fixed-width
5275 // vectorization.
5276 if (A.Width.isScalable() && !B.Width.isScalable())
5277 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5278
5279 // To avoid the need for FP division:
5280 // (CostA / A.Width) < (CostB / B.Width)
5281 // <=> (CostA * B.Width) < (CostB * A.Width)
5282 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5283}
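// Illustrative aside (not part of the original LoopVectorize.cpp source):
// worked example of the division-free comparison above, assuming two
// fixed-width factors and no tail folding:
//   A: Cost 10 at VF 4  (per-lane cost 2.5)
//   B: Cost 12 at VF 8  (per-lane cost 1.5)
//   CostA * WidthB = 10 * 8 = 80;  CostB * WidthA = 12 * 4 = 48
// 80 < 48 is false, so isMoreProfitable(A, B) returns false, matching the
// intended comparison (CostA / WidthA) < (CostB / WidthB).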
5284
5285VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5286 const ElementCountSet &VFCandidates) {
5287 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5288 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5289 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5290 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5291 "Expected Scalar VF to be a candidate");
5292
5293 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5294 ExpectedCost);
5295 VectorizationFactor ChosenFactor = ScalarCost;
5296
5297 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5298 if (ForceVectorization && VFCandidates.size() > 1) {
5299 // Ignore scalar width, because the user explicitly wants vectorization.
5300 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5301 // evaluation.
5302 ChosenFactor.Cost = InstructionCost::getMax();
5303 }
5304
5305 SmallVector<InstructionVFPair> InvalidCosts;
5306 for (const auto &i : VFCandidates) {
5307 // The cost for scalar VF=1 is already calculated, so ignore it.
5308 if (i.isScalar())
5309 continue;
5310
5311 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5312 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5313
5314#ifndef NDEBUG
5315 unsigned AssumedMinimumVscale = 1;
5316 if (Optional<unsigned> VScale = getVScaleForTuning())
5317 AssumedMinimumVscale = *VScale;
5318 unsigned Width =
5319 Candidate.Width.isScalable()
5320 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5321 : Candidate.Width.getFixedValue();
5322 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5323 << " costs: " << (Candidate.Cost / Width));
5324 if (i.isScalable())
5325 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5326 << AssumedMinimumVscale << ")");
5327 LLVM_DEBUG(dbgs() << ".\n");
5328#endif
5329
5330 if (!C.second && !ForceVectorization) {
5331 LLVM_DEBUG(
5332 dbgs() << "LV: Not considering vector loop of width " << i
5333 << " because it will not generate any vector instructions.\n");
5334 continue;
5335 }
5336
5337 // If profitable, add it to the ProfitableVFs list.
5338 if (isMoreProfitable(Candidate, ScalarCost))
5339 ProfitableVFs.push_back(Candidate);
5340
5341 if (isMoreProfitable(Candidate, ChosenFactor))
5342 ChosenFactor = Candidate;
5343 }
5344
5345 // Emit a report of VFs with invalid costs in the loop.
5346 if (!InvalidCosts.empty()) {
5347 // Group the remarks per instruction, keeping the instruction order from
5348 // InvalidCosts.
5349 std::map<Instruction *, unsigned> Numbering;
5350 unsigned I = 0;
5351 for (auto &Pair : InvalidCosts)
5352 if (!Numbering.count(Pair.first))
5353 Numbering[Pair.first] = I++;
5354
5355 // Sort the list, first on instruction(number) then on VF.
5356 llvm::sort(InvalidCosts,
5357 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5358 if (Numbering[A.first] != Numbering[B.first])
5359 return Numbering[A.first] < Numbering[B.first];
5360 ElementCountComparator ECC;
5361 return ECC(A.second, B.second);
5362 });
5363
5364 // For a list of ordered instruction-vf pairs:
5365 // [(load, vf1), (load, vf2), (store, vf1)]
5366 // Group the instructions together to emit separate remarks for:
5367 // load (vf1, vf2)
5368 // store (vf1)
5369 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5370 auto Subset = ArrayRef<InstructionVFPair>();
5371 do {
5372 if (Subset.empty())
5373 Subset = Tail.take_front(1);
5374
5375 Instruction *I = Subset.front().first;
5376
5377 // If the next instruction is different, or if there are no other pairs,
5378 // emit a remark for the collated subset. e.g.
5379 // [(load, vf1), (load, vf2)]
5380 // to emit:
5381 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5382 if (Subset == Tail || Tail[Subset.size()].first != I) {
5383 std::string OutString;
5384 raw_string_ostream OS(OutString);
5385 assert(!Subset.empty() && "Unexpected empty range");
5386 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5387 for (const auto &Pair : Subset)
5388 OS << (Pair.second == Subset.front().second ? "" : ", ")
5389 << Pair.second;
5390 OS << "):";
5391 if (auto *CI = dyn_cast<CallInst>(I))
5392 OS << " call to " << CI->getCalledFunction()->getName();
5393 else
5394 OS << " " << I->getOpcodeName();
5395 OS.flush();
5396 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5397 Tail = Tail.drop_front(Subset.size());
5398 Subset = {};
5399 } else
5400 // Grow the subset by one element
5401 Subset = Tail.take_front(Subset.size() + 1);
5402 } while (!Tail.empty());
5403 }
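// Illustrative aside (not part of the original LoopVectorize.cpp source):
// example of the grouping above, using hypothetical instructions %load1 and
// %store1 with InvalidCosts ordered as
//   [(%load1, VF=4), (%load1, VF=8), (%store1, VF=4)].
// The loop emits one remark per instruction, roughly:
//   "Instruction with invalid costs prevented vectorization at VF=(4, 8): load"
//   "Instruction with invalid costs prevented vectorization at VF=(4): store"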
5404
5405 if (!EnableCondStoresVectorization && NumPredStores) {
5406 reportVectorizationFailure("There are conditional stores.",
5407 "store that is conditionally executed prevents vectorization",
5408 "ConditionalStore", ORE, TheLoop);
5409 ChosenFactor = ScalarCost;
5410 }
5411
5412 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5413 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5414 << "LV: Vectorization seems to be not beneficial, "
5415 << "but was forced by a user.\n");
5416 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5417 return ChosenFactor;
5418}
5419
5420bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5421 const Loop &L, ElementCount VF)