Bug Summary

File: build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8987, column 3
Use of memory after it is freed

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Transforms/Vectorize -I /build/source/llvm/lib/Transforms/Vectorize -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1671833309 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-12-24-002659-1137794-1 -x c++ /build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/STLExtras.h"
69#include "llvm/ADT/SmallPtrSet.h"
70#include "llvm/ADT/SmallSet.h"
71#include "llvm/ADT/SmallVector.h"
72#include "llvm/ADT/Statistic.h"
73#include "llvm/ADT/StringRef.h"
74#include "llvm/ADT/Twine.h"
75#include "llvm/ADT/iterator_range.h"
76#include "llvm/Analysis/AssumptionCache.h"
77#include "llvm/Analysis/BasicAliasAnalysis.h"
78#include "llvm/Analysis/BlockFrequencyInfo.h"
79#include "llvm/Analysis/CFG.h"
80#include "llvm/Analysis/CodeMetrics.h"
81#include "llvm/Analysis/DemandedBits.h"
82#include "llvm/Analysis/GlobalsModRef.h"
83#include "llvm/Analysis/LoopAccessAnalysis.h"
84#include "llvm/Analysis/LoopAnalysisManager.h"
85#include "llvm/Analysis/LoopInfo.h"
86#include "llvm/Analysis/LoopIterator.h"
87#include "llvm/Analysis/OptimizationRemarkEmitter.h"
88#include "llvm/Analysis/ProfileSummaryInfo.h"
89#include "llvm/Analysis/ScalarEvolution.h"
90#include "llvm/Analysis/ScalarEvolutionExpressions.h"
91#include "llvm/Analysis/TargetLibraryInfo.h"
92#include "llvm/Analysis/TargetTransformInfo.h"
93#include "llvm/Analysis/ValueTracking.h"
94#include "llvm/Analysis/VectorUtils.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/DebugLoc.h"
103#include "llvm/IR/DerivedTypes.h"
104#include "llvm/IR/DiagnosticInfo.h"
105#include "llvm/IR/Dominators.h"
106#include "llvm/IR/Function.h"
107#include "llvm/IR/IRBuilder.h"
108#include "llvm/IR/InstrTypes.h"
109#include "llvm/IR/Instruction.h"
110#include "llvm/IR/Instructions.h"
111#include "llvm/IR/IntrinsicInst.h"
112#include "llvm/IR/Intrinsics.h"
113#include "llvm/IR/Metadata.h"
114#include "llvm/IR/Module.h"
115#include "llvm/IR/Operator.h"
116#include "llvm/IR/PatternMatch.h"
117#include "llvm/IR/Type.h"
118#include "llvm/IR/Use.h"
119#include "llvm/IR/User.h"
120#include "llvm/IR/Value.h"
121#include "llvm/IR/ValueHandle.h"
122#include "llvm/IR/Verifier.h"
123#include "llvm/InitializePasses.h"
124#include "llvm/Pass.h"
125#include "llvm/Support/Casting.h"
126#include "llvm/Support/CommandLine.h"
127#include "llvm/Support/Compiler.h"
128#include "llvm/Support/Debug.h"
129#include "llvm/Support/ErrorHandling.h"
130#include "llvm/Support/InstructionCost.h"
131#include "llvm/Support/MathExtras.h"
132#include "llvm/Support/raw_ostream.h"
133#include "llvm/Transforms/Utils/BasicBlockUtils.h"
134#include "llvm/Transforms/Utils/InjectTLIMappings.h"
135#include "llvm/Transforms/Utils/LoopSimplify.h"
136#include "llvm/Transforms/Utils/LoopUtils.h"
137#include "llvm/Transforms/Utils/LoopVersioning.h"
138#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139#include "llvm/Transforms/Utils/SizeOpts.h"
140#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141#include <algorithm>
142#include <cassert>
143#include <cmath>
144#include <cstdint>
145#include <functional>
146#include <iterator>
147#include <limits>
148#include <map>
149#include <memory>
150#include <string>
151#include <tuple>
152#include <utility>
153
154using namespace llvm;
155
156#define LV_NAME "loop-vectorize"
157#define DEBUG_TYPE LV_NAME
158
159#ifndef NDEBUG
160const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161#endif
162
163/// @{
164/// Metadata attribute names
165const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166const char LLVMLoopVectorizeFollowupVectorized[] =
167 "llvm.loop.vectorize.followup_vectorized";
168const char LLVMLoopVectorizeFollowupEpilogue[] =
169 "llvm.loop.vectorize.followup_epilogue";
170/// @}
171
172STATISTIC(LoopsVectorized, "Number of loops vectorized");
173STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175
176static cl::opt<bool> EnableEpilogueVectorization(
177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178 cl::desc("Enable vectorization of epilogue loops."));
179
180static cl::opt<unsigned> EpilogueVectorizationForceVF(
181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182 cl::desc("When epilogue vectorization is enabled, and a value greater than "
183 "1 is specified, forces the given VF for all applicable epilogue "
184 "loops."));
185
186static cl::opt<unsigned> EpilogueVectorizationMinVF(
187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188 cl::desc("Only loops with vectorization factor equal to or larger than "
189 "the specified value are considered for epilogue vectorization."));
190
191/// Loops with a known constant trip count below this number are vectorized only
192/// if no scalar iteration overheads are incurred.
193static cl::opt<unsigned> TinyTripCountVectorThreshold(
194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195 cl::desc("Loops with a constant trip count that is smaller than this "
196 "value are vectorized only if no scalar iteration overheads "
197 "are incurred."));
198
199static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201 cl::desc("The maximum allowed number of runtime memory checks"));
202
203// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204// that predication is preferred, and this lists all options. I.e., the
205// vectorizer will try to fold the tail-loop (epilogue) into the vector body
206// and predicate the instructions accordingly. If tail-folding fails, there are
207// different fallback strategies depending on these values:
208namespace PreferPredicateTy {
209 enum Option {
210 ScalarEpilogue = 0,
211 PredicateElseScalarEpilogue,
212 PredicateOrDontVectorize
213 };
214} // namespace PreferPredicateTy
215
216static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217 "prefer-predicate-over-epilogue",
218 cl::init(PreferPredicateTy::ScalarEpilogue),
219 cl::Hidden,
220 cl::desc("Tail-folding and predication preferences over creating a scalar "
221 "epilogue loop."),
222 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223 "scalar-epilogue",
224 "Don't tail-predicate loops, create scalar epilogue"),
225 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226 "predicate-else-scalar-epilogue",
227 "prefer tail-folding, create scalar epilogue if tail "
228 "folding fails."),
229 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230 "predicate-dont-vectorize",
231 "prefers tail-folding, don't attempt vectorization if "
232 "tail-folding fails.")));
233
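// Editor's note (illustrative, not part of the original file): like the other
// cl::opt flags in this file, prefer-predicate-over-epilogue is an internal
// LLVM option, so it is usually exercised from the command line, e.g.
//   opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// or through clang with -mllvm -prefer-predicate-over-epilogue=predicate-dont-vectorize.
// The accepted value names are exactly the clEnumValN strings above.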
234static cl::opt<bool> MaximizeBandwidth(
235 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236 cl::desc("Maximize bandwidth when selecting vectorization factor which "
237 "will be determined by the smallest type in loop."));
238
239static cl::opt<bool> EnableInterleavedMemAccesses(
240 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242
243/// An interleave-group may need masking if it resides in a block that needs
244/// predication, or in order to mask away gaps.
245static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248
249static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251 cl::desc("We don't interleave loops with a estimated constant trip count "
252 "below this number"));
253
254static cl::opt<unsigned> ForceTargetNumScalarRegs(
255 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256 cl::desc("A flag that overrides the target's number of scalar registers."));
257
258static cl::opt<unsigned> ForceTargetNumVectorRegs(
259 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of vector registers."));
261
262static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's max interleave factor for "
265 "scalar loops."));
266
267static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's max interleave factor for "
270 "vectorized loops."));
271
272static cl::opt<unsigned> ForceTargetInstructionCost(
273 "force-target-instruction-cost", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's expected cost for "
275 "an instruction to a single constant value. Mostly "
276 "useful for getting consistent testing."));
277
278static cl::opt<bool> ForceTargetSupportsScalableVectors(
279 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280 cl::desc(
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
283
284static cl::opt<unsigned> SmallLoopCost(
285 "small-loop-cost", cl::init(20), cl::Hidden,
286 cl::desc(
287 "The cost of a loop that is considered 'small' by the interleaver."));
288
289static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291 cl::desc("Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
294
295// Runtime interleave loops for load/store throughput.
296static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298 cl::desc(
299 "Enable runtime interleaving until load/store ports are saturated"));
300
301/// Interleave small loops with scalar reductions.
302static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304 cl::desc("Enable interleaving for loops with small iteration counts that "
305 "contain scalar reductions to expose ILP."));
306
307/// The number of stores in a loop that are allowed to need predication.
308static cl::opt<unsigned> NumberOfStoresToPredicate(
309 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310 cl::desc("Max number of stores to be predicated behind an if."));
311
312static cl::opt<bool> EnableIndVarRegisterHeur(
313 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314 cl::desc("Count the induction variable only once when interleaving"));
315
316static cl::opt<bool> EnableCondStoresVectorization(
317 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318 cl::desc("Enable if predication of stores during vectorization."));
319
320static cl::opt<unsigned> MaxNestedScalarReductionIC(
321 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322 cl::desc("The maximum interleave count to use when interleaving a scalar "
323 "reduction in a nested loop."));
324
325static cl::opt<bool>
326 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327 cl::Hidden,
328 cl::desc("Prefer in-loop vector reductions, "
329 "overriding the targets preference."));
330
331static cl::opt<bool> ForceOrderedReductions(
332 "force-ordered-reductions", cl::init(false), cl::Hidden,
333 cl::desc("Enable the vectorisation of loops with in-order (strict) "
334 "FP reductions"));
335
336static cl::opt<bool> PreferPredicatedReductionSelect(
337 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338 cl::desc(
339 "Prefer predicating a reduction operation over an after loop select."));
340
341cl::opt<bool> EnableVPlanNativePath(
342 "enable-vplan-native-path", cl::init(false), cl::Hidden,
343 cl::desc("Enable VPlan-native vectorization path with "
344 "support for outer loop vectorization."));
345
346// This flag enables the stress testing of the VPlan H-CFG construction in the
347// VPlan-native vectorization path. It must be used in conjunction with
348// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349// verification of the H-CFGs built.
350static cl::opt<bool> VPlanBuildStressTest(
351 "vplan-build-stress-test", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
356
357cl::opt<bool> llvm::EnableLoopInterleaving(
358 "interleave-loops", cl::init(true), cl::Hidden,
359 cl::desc("Enable loop interleaving in Loop vectorization passes"));
360cl::opt<bool> llvm::EnableLoopVectorization(
361 "vectorize-loops", cl::init(true), cl::Hidden,
362 cl::desc("Run the Loop vectorization passes"));
363
364static cl::opt<bool> PrintVPlansInDotFormat(
365 "vplan-print-in-dot-format", cl::Hidden,
366 cl::desc("Use dot format instead of plain text when dumping VPlans"));
367
368static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369 "force-widen-divrem-via-safe-divisor", cl::Hidden,
370 cl::desc(
371 "Override cost based safe divisor widening for div/rem instructions"));
372
373/// A helper function that returns true if the given type is irregular. The
374/// type is irregular if its allocated size doesn't equal the store size of an
375/// element of the corresponding vector type.
376static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377 // Determine if an array of N elements of type Ty is "bitcast compatible"
378 // with a <N x Ty> vector.
379 // This is only true if there is no padding between the array elements.
380 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381}
382
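// Editor's note (illustrative, not part of the original file): with a typical
// x86-64 DataLayout the helper above behaves, for example, as follows:
//   hasIrregularType(Type::getInt32Ty(Ctx), DL)    -> false (32-bit size, 32-bit alloc size)
//   hasIrregularType(Type::getX86_FP80Ty(Ctx), DL) -> true  (80-bit size, 128-bit alloc size)
// i.e. an array of x86_fp80 values carries padding between elements, so it is
// not bitcast compatible with a <N x x86_fp80> vector.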
383/// A helper function that returns the reciprocal of the block probability of
384/// predicated blocks. If we return X, we are assuming the predicated block
385/// will execute once for every X iterations of the loop header.
386///
387/// TODO: We should use actual block probability here, if available. Currently,
388/// we always assume predicated blocks have a 50% chance of executing.
389static unsigned getReciprocalPredBlockProb() { return 2; }
390
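// Editor's note (illustrative, not part of the original file): the cost model
// later in this file uses the returned value as a divisor, so under the fixed
// 50% assumption a predicated block's cost is roughly halved, along the lines of
//   Cost /= getReciprocalPredBlockProb(); // charge the block once per two header iterations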
391/// A helper function that returns an integer or floating-point constant with
392/// value C.
393static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395 : ConstantFP::get(Ty, C);
396}
397
398/// Returns "best known" trip count for the specified loop \p L as defined by
399/// the following procedure:
400/// 1) Returns exact trip count if it is known.
401/// 2) Returns expected trip count according to profile data if any.
402/// 3) Returns upper bound estimate if it is known.
403/// 4) Returns std::nullopt if all of the above failed.
404static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405 Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return *EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return std::nullopt;
420}
421
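// Editor's note (illustrative sketch, not part of the original file): callers
// further down typically combine this helper with the trip-count thresholds
// defined above, roughly like:
//   if (auto ExpectedTC = getSmallBestKnownTC(*SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ... treat L as a tiny-trip-count loop (size-conscious decisions) ...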
422// Forward declare GeneratedRTChecks.
423class GeneratedRTChecks;
424
425namespace llvm {
426
427AnalysisKey ShouldRunExtraVectorPasses::Key;
428
429/// InnerLoopVectorizer vectorizes loops which contain only one basic
430/// block to a specified vectorization factor (VF).
431/// This class performs the widening of scalars into vectors, or multiple
432/// scalars. This class also implements the following features:
433/// * It inserts an epilogue loop for handling loops that don't have iteration
434/// counts that are known to be a multiple of the vectorization factor.
435/// * It handles the code generation for reduction variables.
436/// * Scalarization (implementation using scalars) of un-vectorizable
437/// instructions.
438/// InnerLoopVectorizer does not perform any vectorization-legality
439/// checks, and relies on the caller to check for the different legality
440/// aspects. The InnerLoopVectorizer relies on the
441/// LoopVectorizationLegality class to provide information about the induction
442/// and reduction variables that were found to a given vectorization factor.
443class InnerLoopVectorizer {
444public:
445 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
446 LoopInfo *LI, DominatorTree *DT,
447 const TargetLibraryInfo *TLI,
448 const TargetTransformInfo *TTI, AssumptionCache *AC,
449 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
450 ElementCount MinProfitableTripCount,
451 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
452 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
453 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
454 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
455 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
456 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
457 PSI(PSI), RTChecks(RTChecks) {
458 // Query this against the original loop and save it here because the profile
459 // of the original loop header may change as the transformation happens.
460 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
461 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
462
463 if (MinProfitableTripCount.isZero())
464 this->MinProfitableTripCount = VecWidth;
465 else
466 this->MinProfitableTripCount = MinProfitableTripCount;
467 }
468
469 virtual ~InnerLoopVectorizer() = default;
470
471 /// Create a new empty loop that will contain vectorized instructions later
472 /// on, while the old loop will be used as the scalar remainder. Control flow
473 /// is generated around the vectorized (and scalar epilogue) loops consisting
474 /// of various checks and bypasses. Return the pre-header block of the new
475 /// loop and the start value for the canonical induction, if it is != 0. The
476 /// latter is the case when vectorizing the epilogue loop. In the case of
477 /// epilogue vectorization, this function is overridden to handle the more
478 /// complex control flow around the loops.
479 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
480
481 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
482 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
483
484 // Return true if any runtime check is added.
485 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486
487 /// A type for vectorized values in the new loop. Each value from the
488 /// original loop, when vectorized, is represented by UF vector values in the
489 /// new unrolled loop, where UF is the unroll factor.
490 using VectorParts = SmallVector<Value *, 2>;
491
492 /// A helper function to scalarize a single Instruction in the innermost loop.
493 /// Generates a sequence of scalar instances for each lane between \p MinLane
494 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
495 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
496 /// Instr's operands.
497 void scalarizeInstruction(const Instruction *Instr,
498 VPReplicateRecipe *RepRecipe,
499 const VPIteration &Instance, bool IfPredicateInstr,
500 VPTransformState &State);
501
502 /// Construct the vector value of a scalarized value \p V one lane at a time.
503 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
504 VPTransformState &State);
505
506 /// Try to vectorize interleaved access group \p Group with the base address
507 /// given in \p Addr, optionally masking the vector operations if \p
508 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
509 /// values in the vectorized loop.
510 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
511 ArrayRef<VPValue *> VPDefs,
512 VPTransformState &State, VPValue *Addr,
513 ArrayRef<VPValue *> StoredValues,
514 VPValue *BlockInMask = nullptr);
515
516 /// Fix the non-induction PHIs in \p Plan.
517 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
518
519 /// Returns true if the reordering of FP operations is not allowed, but we are
520 /// able to vectorize with strict in-order reductions for the given RdxDesc.
521 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
522
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
529
530 // Returns the resume value (bc.merge.rdx) for a reduction as
531 // generated by fixReduction.
532 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
533
534 /// Create a new phi node for the induction variable \p OrigPhi to resume
535 /// iteration count in the scalar epilogue, from where the vectorized loop
536 /// left off. In cases where the loop skeleton is more complicated (eg.
537 /// epilogue vectorization) and the resume values can come from an additional
538 /// bypass block, the \p AdditionalBypass pair provides information about the
539 /// bypass block and the end value on the edge from bypass to this loop.
540 PHINode *createInductionResumeValue(
541 PHINode *OrigPhi, const InductionDescriptor &ID,
542 ArrayRef<BasicBlock *> BypassBlocks,
543 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
544
545protected:
546 friend class LoopVectorizationPlanner;
547
548 /// A small list of PHINodes.
549 using PhiVector = SmallVector<PHINode *, 4>;
550
551 /// A type for scalarized values in the new loop. Each value from the
552 /// original loop, when scalarized, is represented by UF x VF scalar values
553 /// in the new unrolled loop, where UF is the unroll factor and VF is the
554 /// vectorization factor.
555 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
556
557 /// Set up the values of the IVs correctly when exiting the vector loop.
558 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
559 Value *VectorTripCount, Value *EndValue,
560 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
561 VPlan &Plan);
562
563 /// Handle all cross-iteration phis in the header.
564 void fixCrossIterationPHIs(VPTransformState &State);
565
566 /// Create the exit value of first order recurrences in the middle block and
567 /// update their users.
568 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
569 VPTransformState &State);
570
571 /// Create code for the loop exit value of the reduction.
572 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
573
574 /// Clear NSW/NUW flags from reduction instructions if necessary.
575 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
576 VPTransformState &State);
577
578 /// Iteratively sink the scalarized operands of a predicated instruction into
579 /// the block that was created for it.
580 void sinkScalarOperands(Instruction *PredInst);
581
582 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
583 /// represented as.
584 void truncateToMinimalBitwidths(VPTransformState &State);
585
586 /// Returns (and creates if needed) the original loop trip count.
587 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
588
589 /// Returns (and creates if needed) the trip count of the widened loop.
590 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
591
592 /// Returns a bitcasted value to the requested vector type.
593 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
594 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
595 const DataLayout &DL);
596
597 /// Emit a bypass check to see if the vector trip count is zero, including if
598 /// it overflows.
599 void emitIterationCountCheck(BasicBlock *Bypass);
600
601 /// Emit a bypass check to see if all of the SCEV assumptions we've
602 /// had to make are correct. Returns the block containing the checks or
603 /// nullptr if no checks have been added.
604 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
605
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 /// Returns the block containing the checks or nullptr if no checks have been
608 /// added.
609 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
610
611 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
612 /// vector loop preheader, middle block and scalar preheader.
613 void createVectorLoopSkeleton(StringRef Prefix);
614
615 /// Create new phi nodes for the induction variables to resume iteration count
616 /// in the scalar epilogue, from where the vectorized loop left off.
617 /// In cases where the loop skeleton is more complicated (eg. epilogue
618 /// vectorization) and the resume values can come from an additional bypass
619 /// block, the \p AdditionalBypass pair provides information about the bypass
620 /// block and the end value on the edge from bypass to this loop.
621 void createInductionResumeValues(
622 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
623
624 /// Complete the loop skeleton by adding debug MDs, creating appropriate
625 /// conditional branches in the middle block, preparing the builder and
626 /// running the verifier. Return the preheader of the completed vector loop.
627 BasicBlock *completeLoopSkeleton();
628
629 /// Collect poison-generating recipes that may generate a poison value that is
630 /// used after vectorization, even when their operands are not poison. Those
631 /// recipes meet the following conditions:
632 /// * Contribute to the address computation of a recipe generating a widen
633 /// memory load/store (VPWidenMemoryInstructionRecipe or
634 /// VPInterleaveRecipe).
635 /// * Such a widen memory load/store has at least one underlying Instruction
636 /// that is in a basic block that needs predication and after vectorization
637 /// the generated instruction won't be predicated.
638 void collectPoisonGeneratingRecipes(VPTransformState &State);
639
640 /// Allow subclasses to override and print debug traces before/after vplan
641 /// execution, when trace information is requested.
642 virtual void printDebugTracesAtStart(){};
643 virtual void printDebugTracesAtEnd(){};
644
645 /// The original loop.
646 Loop *OrigLoop;
647
648 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
649 /// dynamic knowledge to simplify SCEV expressions and converts them to a
650 /// more usable form.
651 PredicatedScalarEvolution &PSE;
652
653 /// Loop Info.
654 LoopInfo *LI;
655
656 /// Dominator Tree.
657 DominatorTree *DT;
658
659 /// Target Library Info.
660 const TargetLibraryInfo *TLI;
661
662 /// Target Transform Info.
663 const TargetTransformInfo *TTI;
664
665 /// Assumption Cache.
666 AssumptionCache *AC;
667
668 /// Interface to emit optimization remarks.
669 OptimizationRemarkEmitter *ORE;
670
671 /// The vectorization SIMD factor to use. Each vector will have this many
672 /// vector elements.
673 ElementCount VF;
674
675 ElementCount MinProfitableTripCount;
676
677 /// The vectorization unroll factor to use. Each scalar is vectorized to this
678 /// many different vector instructions.
679 unsigned UF;
680
681 /// The builder that we use
682 IRBuilder<> Builder;
683
684 // --- Vectorization state ---
685
686 /// The vector-loop preheader.
687 BasicBlock *LoopVectorPreHeader;
688
689 /// The scalar-loop preheader.
690 BasicBlock *LoopScalarPreHeader;
691
692 /// Middle Block between the vector and the scalar.
693 BasicBlock *LoopMiddleBlock;
694
695 /// The unique ExitBlock of the scalar loop if one exists. Note that
696 /// there can be multiple exiting edges reaching this block.
697 BasicBlock *LoopExitBlock;
698
699 /// The scalar loop body.
700 BasicBlock *LoopScalarBody;
701
702 /// A list of all bypass blocks. The first block is the entry of the loop.
703 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
704
705 /// Store instructions that were predicated.
706 SmallVector<Instruction *, 4> PredicatedInstructions;
707
708 /// Trip count of the original loop.
709 Value *TripCount = nullptr;
710
711 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
712 Value *VectorTripCount = nullptr;
713
714 /// The legality analysis.
715 LoopVectorizationLegality *Legal;
716
717 /// The profitability analysis.
718 LoopVectorizationCostModel *Cost;
719
720 // Record whether runtime checks are added.
721 bool AddedSafetyChecks = false;
722
723 // Holds the end values for each induction variable. We save the end values
724 // so we can later fix-up the external users of the induction variables.
725 DenseMap<PHINode *, Value *> IVEndValues;
726
727 /// BFI and PSI are used to check for profile guided size optimizations.
728 BlockFrequencyInfo *BFI;
729 ProfileSummaryInfo *PSI;
730
731 // Whether this loop should be optimized for size based on profile guided size
732 // optimizations.
733 bool OptForSizeBasedOnProfile;
734
735 /// Structure to hold information about generated runtime checks, responsible
736 /// for cleaning the checks, if vectorization turns out unprofitable.
737 GeneratedRTChecks &RTChecks;
738
739 // Holds the resume values for reductions in the loops, used to set the
740 // correct start value of reduction PHIs when vectorizing the epilogue.
741 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
742 ReductionResumeValues;
743};
744
745class InnerLoopUnroller : public InnerLoopVectorizer {
746public:
747 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
748 LoopInfo *LI, DominatorTree *DT,
749 const TargetLibraryInfo *TLI,
750 const TargetTransformInfo *TTI, AssumptionCache *AC,
751 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
752 LoopVectorizationLegality *LVL,
753 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
754 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
755 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
756 ElementCount::getFixed(1),
757 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
758 BFI, PSI, Check) {}
759
760private:
761 Value *getBroadcastInstrs(Value *V) override;
762};
763
764/// Encapsulate information regarding vectorization of a loop and its epilogue.
765/// This information is meant to be updated and used across two stages of
766/// epilogue vectorization.
767struct EpilogueLoopVectorizationInfo {
768 ElementCount MainLoopVF = ElementCount::getFixed(0);
769 unsigned MainLoopUF = 0;
770 ElementCount EpilogueVF = ElementCount::getFixed(0);
771 unsigned EpilogueUF = 0;
772 BasicBlock *MainLoopIterationCountCheck = nullptr;
773 BasicBlock *EpilogueIterationCountCheck = nullptr;
774 BasicBlock *SCEVSafetyCheck = nullptr;
775 BasicBlock *MemSafetyCheck = nullptr;
776 Value *TripCount = nullptr;
777 Value *VectorTripCount = nullptr;
778
779 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
780 ElementCount EVF, unsigned EUF)
781 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
782 assert(EUF == 1 &&
783 "A high UF for the epilogue loop is likely not beneficial.");
784 }
785};
786
787/// An extension of the inner loop vectorizer that creates a skeleton for a
788/// vectorized loop that has its epilogue (residual) also vectorized.
789/// The idea is to run the vplan on a given loop twice, firstly to setup the
790/// skeleton and vectorize the main loop, and secondly to complete the skeleton
791/// from the first step and vectorize the epilogue. This is achieved by
792/// deriving two concrete strategy classes from this base class and invoking
793/// them in succession from the loop vectorizer planner.
794class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
795public:
796 InnerLoopAndEpilogueVectorizer(
797 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
798 DominatorTree *DT, const TargetLibraryInfo *TLI,
799 const TargetTransformInfo *TTI, AssumptionCache *AC,
800 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
801 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
802 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
803 GeneratedRTChecks &Checks)
804 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
806 CM, BFI, PSI, Checks),
807 EPI(EPI) {}
808
809 // Override this function to handle the more complex control flow around the
810 // three loops.
811 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
812 return createEpilogueVectorizedLoopSkeleton();
813 }
814
815 /// The interface for creating a vectorized skeleton using one of two
816 /// different strategies, each corresponding to one execution of the vplan
817 /// as described above.
818 virtual std::pair<BasicBlock *, Value *>
819 createEpilogueVectorizedLoopSkeleton() = 0;
820
821 /// Holds and updates state information required to vectorize the main loop
822 /// and its epilogue in two separate passes. This setup helps us avoid
823 /// regenerating and recomputing runtime safety checks. It also helps us to
824 /// shorten the iteration-count-check path length for the cases where the
825 /// iteration count of the loop is so small that the main vector loop is
826 /// completely skipped.
827 EpilogueLoopVectorizationInfo &EPI;
828};
829
830/// A specialized derived class of inner loop vectorizer that performs
831/// vectorization of *main* loops in the process of vectorizing loops and their
832/// epilogues.
833class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
834public:
835 EpilogueVectorizerMainLoop(
836 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
837 DominatorTree *DT, const TargetLibraryInfo *TLI,
838 const TargetTransformInfo *TTI, AssumptionCache *AC,
839 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
840 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
841 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
842 GeneratedRTChecks &Check)
843 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
844 EPI, LVL, CM, BFI, PSI, Check) {}
845 /// Implements the interface for creating a vectorized skeleton using the
846 /// *main loop* strategy (ie the first pass of vplan execution).
847 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
848
849protected:
850 /// Emits an iteration count bypass check once for the main loop (when \p
851 /// ForEpilogue is false) and once for the epilogue loop (when \p
852 /// ForEpilogue is true).
853 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
854 void printDebugTracesAtStart() override;
855 void printDebugTracesAtEnd() override;
856};
857
858// A specialized derived class of inner loop vectorizer that performs
859// vectorization of *epilogue* loops in the process of vectorizing loops and
860// their epilogues.
861class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
862public:
863 EpilogueVectorizerEpilogueLoop(
864 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
865 DominatorTree *DT, const TargetLibraryInfo *TLI,
866 const TargetTransformInfo *TTI, AssumptionCache *AC,
867 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
868 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
869 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
870 GeneratedRTChecks &Checks)
871 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
872 EPI, LVL, CM, BFI, PSI, Checks) {
873 TripCount = EPI.TripCount;
874 }
875 /// Implements the interface for creating a vectorized skeleton using the
876 /// *epilogue loop* strategy (ie the second pass of vplan execution).
877 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
878
879protected:
880 /// Emits an iteration count bypass check after the main vector loop has
881 /// finished to see if there are any iterations left to execute by either
882 /// the vector epilogue or the scalar epilogue.
883 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
884 BasicBlock *Bypass,
885 BasicBlock *Insert);
886 void printDebugTracesAtStart() override;
887 void printDebugTracesAtEnd() override;
888};
889} // end namespace llvm
890
891/// Look for a meaningful debug location on the instruction or its
892/// operands.
893static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
894 if (!I)
895 return I;
896
897 DebugLoc Empty;
898 if (I->getDebugLoc() != Empty)
899 return I;
900
901 for (Use &Op : I->operands()) {
902 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
903 if (OpInst->getDebugLoc() != Empty)
904 return OpInst;
905 }
906
907 return I;
908}
909
910/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
911/// is passed, the message relates to that particular instruction.
912#ifndef NDEBUG
913static void debugVectorizationMessage(const StringRef Prefix,
914 const StringRef DebugMsg,
915 Instruction *I) {
916 dbgs() << "LV: " << Prefix << DebugMsg;
917 if (I != nullptr)
918 dbgs() << " " << *I;
919 else
920 dbgs() << '.';
921 dbgs() << '\n';
922}
923#endif
924
925/// Create an analysis remark that explains why vectorization failed
926///
927/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
928/// RemarkName is the identifier for the remark. If \p I is passed it is an
929/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
930/// the location of the remark. \return the remark object that can be
931/// streamed to.
932static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
933 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
934 Value *CodeRegion = TheLoop->getHeader();
935 DebugLoc DL = TheLoop->getStartLoc();
936
937 if (I) {
938 CodeRegion = I->getParent();
939 // If there is no debug location attached to the instruction, revert back to
940 // using the loop's.
941 if (I->getDebugLoc())
942 DL = I->getDebugLoc();
943 }
944
945 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
946}
947
948namespace llvm {
949
950/// Return a value for Step multiplied by VF.
951Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
952 int64_t Step) {
953 assert(Ty->isIntegerTy() && "Expected an integer step");
954 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
955 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
956}
957
958/// Return the runtime value for VF.
959Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
960 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
961 return VF.isScalable() ? B.CreateVScale(EC) : EC;
962}
963
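// Editor's note (illustrative, not part of the original file): for a fixed VF
// such as 4, getRuntimeVF simply returns the constant 4. For a scalable VF
// (e.g. <vscale x 4 x i32>, known minimum 4) it builds a multiply of the
// llvm.vscale intrinsic by 4 via B.CreateVScale, so the actual factor is only
// known at run time. createStepForVF above is the same idea with an extra
// Step multiplier folded into the constant.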
964const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
965 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
966 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
967
968 ScalarEvolution &SE = *PSE.getSE();
969
970 // The exit count might have the type of i64 while the phi is i32. This can
971 // happen if we have an induction variable that is sign extended before the
972 // compare. The only way that we get a backedge taken count is that the
973 // induction variable was signed and as such will not overflow. In such a case
974 // truncation is legal.
975 if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
976 IdxTy->getPrimitiveSizeInBits())
977 BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
978 BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
979
980 // Get the total trip count from the count by adding 1.
981 return SE.getAddExpr(BackedgeTakenCount,
982 SE.getOne(BackedgeTakenCount->getType()));
983}
984
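// Editor's note (illustrative, not part of the original file): for a canonical
// loop "for (i = 0; i < n; ++i)" with n > 0, the backedge-taken count is n - 1,
// so the SCEV built above evaluates to (n - 1) + 1 = n iterations. The
// truncate/zero-extend step merely brings the count to the requested index
// type before the +1 is added.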
985static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
986 ElementCount VF) {
987 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
988 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
989 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
990 return B.CreateUIToFP(RuntimeVF, FTy);
991}
992
993void reportVectorizationFailure(const StringRef DebugMsg,
994 const StringRef OREMsg, const StringRef ORETag,
995 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
996 Instruction *I) {
997 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
998 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
999 ORE->emit(
1000 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1001 << "loop not vectorized: " << OREMsg);
1002}
1003
1004void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1005 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1006 Instruction *I) {
1007 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1008 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1009 ORE->emit(
1010 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1011 << Msg);
1012}
1013
1014} // end namespace llvm
1015
1016#ifndef NDEBUG
1017/// \return string containing a file name and a line # for the given loop.
1018static std::string getDebugLocString(const Loop *L) {
1019 std::string Result;
1020 if (L) {
1021 raw_string_ostream OS(Result);
1022 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1023 LoopDbgLoc.print(OS);
1024 else
1025 // Just print the module name.
1026 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1027 OS.flush();
1028 }
1029 return Result;
1030}
1031#endif
1032
1033void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1034 VPTransformState &State) {
1035
1036 // Collect recipes in the backward slice of `Root` that may generate a poison
1037 // value that is used after vectorization.
1038 SmallPtrSet<VPRecipeBase *, 16> Visited;
1039 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1040 SmallVector<VPRecipeBase *, 16> Worklist;
1041 Worklist.push_back(Root);
1042
1043 // Traverse the backward slice of Root through its use-def chain.
1044 while (!Worklist.empty()) {
1045 VPRecipeBase *CurRec = Worklist.back();
1046 Worklist.pop_back();
1047
1048 if (!Visited.insert(CurRec).second)
1049 continue;
1050
1051 // Prune search if we find another recipe generating a widen memory
1052 // instruction. Widen memory instructions involved in address computation
1053 // will lead to gather/scatter instructions, which don't need to be
1054 // handled.
1055 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1056 isa<VPInterleaveRecipe>(CurRec) ||
1057 isa<VPScalarIVStepsRecipe>(CurRec) ||
1058 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1059 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1060 continue;
1061
1062 // This recipe contributes to the address computation of a widen
1063 // load/store. Collect recipe if its underlying instruction has
1064 // poison-generating flags.
1065 Instruction *Instr = CurRec->getUnderlyingInstr();
1066 if (Instr && Instr->hasPoisonGeneratingFlags())
1067 State.MayGeneratePoisonRecipes.insert(CurRec);
1068
1069 // Add new definitions to the worklist.
1070 for (VPValue *operand : CurRec->operands())
1071 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1072 Worklist.push_back(OpDef);
1073 }
1074 });
1075
1076 // Traverse all the recipes in the VPlan and collect the poison-generating
1077 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1078 // VPInterleaveRecipe.
1079 auto Iter = depth_first(
1080 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1081 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1082 for (VPRecipeBase &Recipe : *VPBB) {
1083 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1084 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1085 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1086 if (AddrDef && WidenRec->isConsecutive() &&
1087 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1088 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1089 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1090 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1091 if (AddrDef) {
1092 // Check if any member of the interleave group needs predication.
1093 const InterleaveGroup<Instruction> *InterGroup =
1094 InterleaveRec->getInterleaveGroup();
1095 bool NeedPredication = false;
1096 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1097 I < NumMembers; ++I) {
1098 Instruction *Member = InterGroup->getMember(I);
1099 if (Member)
1100 NeedPredication |=
1101 Legal->blockNeedsPredication(Member->getParent());
1102 }
1103
1104 if (NeedPredication)
1105 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1106 }
1107 }
1108 }
1109 }
1110}
1111
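// Editor's note (illustrative, not part of the original file): a typical case
// caught by the traversal above is an "inbounds" GEP feeding the address of a
// load that is guarded by a condition in the scalar loop. If vectorization
// turns that load into an unconditional consecutive wide load, the address is
// computed on every iteration, so the GEP's poison-generating flag must be
// dropped later for the recipes recorded in MayGeneratePoisonRecipes.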
1112PHINode *InnerLoopVectorizer::getReductionResumeValue(
1113 const RecurrenceDescriptor &RdxDesc) {
1114 auto It = ReductionResumeValues.find(&RdxDesc);
1115 assert(It != ReductionResumeValues.end() &&
1116 "Expected to find a resume value for the reduction.");
1117 return It->second;
1118}
1119
1120namespace llvm {
1121
1122// Loop vectorization cost-model hints how the scalar epilogue loop should be
1123// lowered.
1124enum ScalarEpilogueLowering {
1125
1126 // The default: allowing scalar epilogues.
1127 CM_ScalarEpilogueAllowed,
1128
1129 // Vectorization with OptForSize: don't allow epilogues.
1130 CM_ScalarEpilogueNotAllowedOptSize,
1131
1132 // A special case of vectorization with OptForSize: loops with a very small
1133 // trip count are considered for vectorization under OptForSize, thereby
1134 // making sure the cost of their loop body is dominant, free of runtime
1135 // guards and scalar iteration overheads.
1136 CM_ScalarEpilogueNotAllowedLowTripLoop,
1137
1138 // Loop hint predicate indicating an epilogue is undesired.
1139 CM_ScalarEpilogueNotNeededUsePredicate,
1140
1141 // Directive indicating we must either tail fold or not vectorize
1142 CM_ScalarEpilogueNotAllowedUsePredicate
1143};
1144
1145/// ElementCountComparator creates a total ordering for ElementCount
1146/// for the purposes of using it in a set structure.
1147struct ElementCountComparator {
1148 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1149 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1150 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1151 }
1152};
1153using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
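// Editor's note: a minimal illustrative sketch, not part of the original
// source. The comparator yields a total order in which all fixed VFs sort
// before all scalable VFs, and ties are broken by the known-minimum lane
// count:
//
//   ElementCountComparator Cmp;
//   Cmp(ElementCount::getFixed(8), ElementCount::getScalable(2));  // true
//   Cmp(ElementCount::getFixed(2), ElementCount::getFixed(4));     // true
//   Cmp(ElementCount::getScalable(4), ElementCount::getFixed(16)); // false
//
// This total order is what lets ElementCount be used as a key in the
// SmallSet-based ElementCountSet of candidate vectorization factors.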
1154
1155/// LoopVectorizationCostModel - estimates the expected speedups due to
1156/// vectorization.
1157/// In many cases vectorization is not profitable. This can happen because of
1158/// a number of reasons. In this class we mainly attempt to predict the
1159/// expected speedup/slowdowns due to the supported instruction set. We use the
1160/// TargetTransformInfo to query the different backends for the cost of
1161/// different operations.
1162class LoopVectorizationCostModel {
1163public:
1164 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1165 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1166 LoopVectorizationLegality *Legal,
1167 const TargetTransformInfo &TTI,
1168 const TargetLibraryInfo *TLI, DemandedBits *DB,
1169 AssumptionCache *AC,
1170 OptimizationRemarkEmitter *ORE, const Function *F,
1171 const LoopVectorizeHints *Hints,
1172 InterleavedAccessInfo &IAI)
1173 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1174 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1175 Hints(Hints), InterleaveInfo(IAI) {}
1176
1177 /// \return An upper bound for the vectorization factors (both fixed and
1178 /// scalable). If the factors are 0, vectorization and interleaving should be
1179 /// avoided up front.
1180 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1181
1182 /// \return True if runtime checks are required for vectorization, and false
1183 /// otherwise.
1184 bool runtimeChecksRequired();
1185
1186 /// \return The most profitable vectorization factor and the cost of that VF.
1187 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1188 /// then this vectorization factor will be selected if vectorization is
1189 /// possible.
1190 VectorizationFactor
1191 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1192
1193 VectorizationFactor
1194 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1195 const LoopVectorizationPlanner &LVP);
1196
1197 /// Setup cost-based decisions for user vectorization factor.
1198 /// \return true if the UserVF is a feasible VF to be chosen.
1199 bool selectUserVectorizationFactor(ElementCount UserVF) {
1200 collectUniformsAndScalars(UserVF);
1201 collectInstsToScalarize(UserVF);
1202 return expectedCost(UserVF).first.isValid();
1203 }
1204
1205 /// \return The size (in bits) of the smallest and widest types in the code
1206 /// that needs to be vectorized. We ignore values that remain scalar such as
1207 /// 64 bit loop indices.
1208 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1209
1210 /// \return The desired interleave count.
1211 /// If interleave count has been specified by metadata it will be returned.
1212 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1213 /// are the selected vectorization factor and the cost of the selected VF.
1214 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1215
1216 /// Memory access instruction may be vectorized in more than one way.
1217 /// Form of instruction after vectorization depends on cost.
1218 /// This function takes cost-based decisions for Load/Store instructions
1219 /// and collects them in a map. This decisions map is used for building
1220 /// the lists of loop-uniform and loop-scalar instructions.
1221 /// The calculated cost is saved with widening decision in order to
1222 /// avoid redundant calculations.
1223 void setCostBasedWideningDecision(ElementCount VF);
1224
1225 /// A struct that represents some properties of the register usage
1226 /// of a loop.
1227 struct RegisterUsage {
1228 /// Holds the number of loop invariant values that are used in the loop.
1229 /// The key is ClassID of target-provided register class.
1230 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1231 /// Holds the maximum number of concurrent live intervals in the loop.
1232 /// The key is ClassID of target-provided register class.
1233 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1234 };
1235
1236 /// \return Returns information about the register usages of the loop for the
1237 /// given vectorization factors.
1238 SmallVector<RegisterUsage, 8>
1239 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1240
1241 /// Collect values we want to ignore in the cost model.
1242 void collectValuesToIgnore();
1243
1244 /// Collect all element types in the loop for which widening is needed.
1245 void collectElementTypesForWidening();
1246
1247 /// Split reductions into those that happen in the loop, and those that happen
1248 /// outside. In-loop reductions are collected into InLoopReductionChains.
1249 void collectInLoopReductions();
1250
1251 /// Returns true if we should use strict in-order reductions for the given
1252 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1253 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1254 /// of FP operations.
1255 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1256 return !Hints->allowReordering() && RdxDesc.isOrdered();
1257 }
1258
1259 /// \returns The smallest bitwidth each instruction can be represented with.
1260 /// The vector equivalents of these instructions should be truncated to this
1261 /// type.
1262 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1263 return MinBWs;
1264 }
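// Editor's note (illustrative only, not in the original source): MinBWs is
// driven by demanded-bits style reasoning. A typical case is arithmetic that
// is widened to i32 but only ever feeds a narrow result:
//
//   %a  = load i8, ptr %p
//   %aw = zext i8 %a to i32
//   %bw = add i32 %aw, 5
//   %b  = trunc i32 %bw to i8
//   store i8 %b, ptr %q
//
// Here %bw can be given a minimal bitwidth of 8, so its vector equivalent may
// be computed on <VF x i8> instead of <VF x i32>.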
1265
1266 /// \returns True if it is more profitable to scalarize instruction \p I for
1267 /// vectorization factor \p VF.
1268 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1269 assert(VF.isVector() &&
1270 "Profitable to scalarize relevant only for VF > 1.");
1271
1272 // Cost model is not run in the VPlan-native path - return conservative
1273 // result until this changes.
1274 if (EnableVPlanNativePath)
1275 return false;
1276
1277 auto Scalars = InstsToScalarize.find(VF);
1278 assert(Scalars != InstsToScalarize.end() &&
1279 "VF not yet analyzed for scalarization profitability");
1280 return Scalars->second.find(I) != Scalars->second.end();
1281 }
1282
1283 /// Returns true if \p I is known to be uniform after vectorization.
1284 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1285 if (VF.isScalar())
1286 return true;
1287
1288 // Cost model is not run in the VPlan-native path - return conservative
1289 // result until this changes.
1290 if (EnableVPlanNativePath)
1291 return false;
1292
1293 auto UniformsPerVF = Uniforms.find(VF);
1294 assert(UniformsPerVF != Uniforms.end() &&
1295 "VF not yet analyzed for uniformity");
1296 return UniformsPerVF->second.count(I);
1297 }
1298
1299 /// Returns true if \p I is known to be scalar after vectorization.
1300 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1301 if (VF.isScalar())
1302 return true;
1303
1304 // Cost model is not run in the VPlan-native path - return conservative
1305 // result until this changes.
1306 if (EnableVPlanNativePath)
1307 return false;
1308
1309 auto ScalarsPerVF = Scalars.find(VF);
1310 assert(ScalarsPerVF != Scalars.end() &&
1311 "Scalar values are not calculated for VF");
1312 return ScalarsPerVF->second.count(I);
1313 }
1314
1315 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1316 /// for vectorization factor \p VF.
1317 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1318 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1319 !isProfitableToScalarize(I, VF) &&
1320 !isScalarAfterVectorization(I, VF);
1321 }
1322
1323 /// Decision that was taken during cost calculation for memory instruction.
1324 enum InstWidening {
1325 CM_Unknown,
1326 CM_Widen, // For consecutive accesses with stride +1.
1327 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1328 CM_Interleave,
1329 CM_GatherScatter,
1330 CM_Scalarize
1331 };
1332
1333 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1334 /// instruction \p I and vector width \p VF.
1335 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1336 InstructionCost Cost) {
1337 assert(VF.isVector() && "Expected VF >=2");
1338 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1339 }
1340
1341 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1342 /// interleaving group \p Grp and vector width \p VF.
1343 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1344 ElementCount VF, InstWidening W,
1345 InstructionCost Cost) {
1346 assert(VF.isVector() && "Expected VF >=2");
1347 /// Broadcast this decision to all instructions inside the group.
1348 /// But the cost will be assigned to one instruction only.
1349 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1350 if (auto *I = Grp->getMember(i)) {
1351 if (Grp->getInsertPos() == I)
1352 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1353 else
1354 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1355 }
1356 }
1357 }
1358
1359 /// Return the cost model decision for the given instruction \p I and vector
1360 /// width \p VF. Return CM_Unknown if this instruction did not pass
1361 /// through the cost modeling.
1362 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1363 assert(VF.isVector() && "Expected VF to be a vector VF");
1364 // Cost model is not run in the VPlan-native path - return conservative
1365 // result until this changes.
1366 if (EnableVPlanNativePath)
1367 return CM_GatherScatter;
1368
1369 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1370 auto Itr = WideningDecisions.find(InstOnVF);
1371 if (Itr == WideningDecisions.end())
1372 return CM_Unknown;
1373 return Itr->second.first;
1374 }
1375
1376 /// Return the vectorization cost for the given instruction \p I and vector
1377 /// width \p VF.
1378 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1379 assert(VF.isVector() && "Expected VF >=2");
1380 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1381 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1382 "The cost is not calculated");
1383 return WideningDecisions[InstOnVF].second;
1384 }
1385
1386 /// Return True if instruction \p I is an optimizable truncate whose operand
1387 /// is an induction variable. Such a truncate will be removed by adding a new
1388 /// induction variable with the destination type.
1389 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1390 // If the instruction is not a truncate, return false.
1391 auto *Trunc = dyn_cast<TruncInst>(I);
1392 if (!Trunc)
1393 return false;
1394
1395 // Get the source and destination types of the truncate.
1396 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1397 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1398
1399 // If the truncate is free for the given types, return false. Replacing a
1400 // free truncate with an induction variable would add an induction variable
1401 // update instruction to each iteration of the loop. We exclude from this
1402 // check the primary induction variable since it will need an update
1403 // instruction regardless.
1404 Value *Op = Trunc->getOperand(0);
1405 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1406 return false;
1407
1408 // If the truncated value is not an induction variable, return false.
1409 return Legal->isInductionPhi(Op);
1410 }
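// Editor's note (illustrative only, not in the original source): a typical
// optimizable IV truncate looks like
//
//   %iv  = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
//   %idx = trunc i64 %iv to i32
//
// where %iv is an induction recognized by Legality. Instead of widening the
// i64 induction and truncating every lane, the vectorizer can introduce a
// fresh i32 induction with the destination type, making the trunc dead.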
1411
1412 /// Collects the instructions to scalarize for each predicated instruction in
1413 /// the loop.
1414 void collectInstsToScalarize(ElementCount VF);
1415
1416 /// Collect Uniform and Scalar values for the given \p VF.
1417 /// The sets depend on CM decision for Load/Store instructions
1418 /// that may be vectorized as interleave, gather-scatter or scalarized.
1419 void collectUniformsAndScalars(ElementCount VF) {
1420 // Do the analysis once.
1421 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1422 return;
1423 setCostBasedWideningDecision(VF);
1424 collectLoopUniforms(VF);
1425 collectLoopScalars(VF);
1426 }
1427
1428 /// Returns true if the target machine supports masked store operation
1429 /// for the given \p DataType and kind of access to \p Ptr.
1430 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1431 return Legal->isConsecutivePtr(DataType, Ptr) &&
1432 TTI.isLegalMaskedStore(DataType, Alignment);
1433 }
1434
1435 /// Returns true if the target machine supports masked load operation
1436 /// for the given \p DataType and kind of access to \p Ptr.
1437 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1438 return Legal->isConsecutivePtr(DataType, Ptr) &&
1439 TTI.isLegalMaskedLoad(DataType, Alignment);
1440 }
1441
1442 /// Returns true if the target machine can represent \p V as a masked gather
1443 /// or scatter operation.
1444 bool isLegalGatherOrScatter(Value *V,
1445 ElementCount VF = ElementCount::getFixed(1)) {
1446 bool LI = isa<LoadInst>(V);
1447 bool SI = isa<StoreInst>(V);
1448 if (!LI && !SI)
1449 return false;
1450 auto *Ty = getLoadStoreType(V);
1451 Align Align = getLoadStoreAlignment(V);
1452 if (VF.isVector())
1453 Ty = VectorType::get(Ty, VF);
1454 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1455 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1456 }
1457
1458 /// Returns true if the target machine supports all of the reduction
1459 /// variables found for the given VF.
1460 bool canVectorizeReductions(ElementCount VF) const {
1461 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1462 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1463 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1464 }));
1465 }
1466
1467 /// Given costs for both strategies, return true if the scalar predication
1468 /// lowering should be used for div/rem. This incorporates an override
1469 /// option so it is not simply a cost comparison.
1470 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1471 InstructionCost SafeDivisorCost) const {
1472 switch (ForceSafeDivisor) {
1473 case cl::BOU_UNSET:
1474 return ScalarCost < SafeDivisorCost;
1475 case cl::BOU_TRUE:
1476 return false;
1477 case cl::BOU_FALSE:
1478 return true;
1479 };
1480 llvm_unreachable("impossible case value");
1481 }
1482
1483 /// Returns true if \p I is an instruction which requires predication and
1484 /// for which our chosen predication strategy is scalarization (i.e. we
1485 /// don't have an alternate strategy such as masking available).
1486 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1487 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1488
1489 /// Returns true if \p I is an instruction that needs to be predicated
1490 /// at runtime. The result is independent of the predication mechanism.
1491 /// Superset of instructions that return true for isScalarWithPredication.
1492 bool isPredicatedInst(Instruction *I) const;
1493
1494 /// Return the costs for our two available strategies for lowering a
1495 /// div/rem operation which requires speculating at least one lane.
1496 /// First result is for scalarization (will be invalid for scalable
1497 /// vectors); second is for the safe-divisor strategy.
1498 std::pair<InstructionCost, InstructionCost>
1499 getDivRemSpeculationCost(Instruction *I,
1500 ElementCount VF) const;
1501
1502 /// Returns true if \p I is a memory instruction with consecutive memory
1503 /// access that can be widened.
1504 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1505
1506 /// Returns true if \p I is a memory instruction in an interleaved-group
1507 /// of memory accesses that can be vectorized with wide vector loads/stores
1508 /// and shuffles.
1509 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1510
1511 /// Check if \p Instr belongs to any interleaved access group.
1512 bool isAccessInterleaved(Instruction *Instr) {
1513 return InterleaveInfo.isInterleaved(Instr);
1514 }
1515
1516 /// Get the interleaved access group that \p Instr belongs to.
1517 const InterleaveGroup<Instruction> *
1518 getInterleavedAccessGroup(Instruction *Instr) {
1519 return InterleaveInfo.getInterleaveGroup(Instr);
1520 }
1521
1522 /// Returns true if we're required to use a scalar epilogue for at least
1523 /// the final iteration of the original loop.
1524 bool requiresScalarEpilogue(ElementCount VF) const {
1525 if (!isScalarEpilogueAllowed())
1526 return false;
1527 // If we might exit from anywhere but the latch, must run the exiting
1528 // iteration in scalar form.
1529 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1530 return true;
1531 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1532 }
1533
1534 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1535 /// loop hint annotation.
1536 bool isScalarEpilogueAllowed() const {
1537 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1538 }
1539
1540 /// Returns true if all loop blocks should be masked to fold tail loop.
1541 bool foldTailByMasking() const { return FoldTailByMasking; }
1542
1543 /// Returns true if we're tail-folding and want to use the active lane mask
1544 /// for vector loop control flow.
1545 bool useActiveLaneMaskForControlFlow() const {
1546 return FoldTailByMasking &&
1547 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1548 }
1549
1550 /// Returns true if the instructions in this block requires predication
1551 /// for any reason, e.g. because tail folding now requires a predicate
1552 /// or because the block in the original loop was predicated.
1553 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1554 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1555 }
1556
1557 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1558 /// nodes to the chain of instructions representing the reductions. Uses a
1559 /// MapVector to ensure deterministic iteration order.
1560 using ReductionChainMap =
1561 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
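// Editor's note (illustrative only, not in the original source): for an
// in-loop integer add reduction such as
//
//   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
//   %t        = add i32 %sum, %a
//   %sum.next = add i32 %t, %b
//
// the map would associate %sum with the chain [ %t, %sum.next ], in program
// order from the PHI down to the value that feeds back into it.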
1562
1563 /// Return the chain of instructions representing an inloop reduction.
1564 const ReductionChainMap &getInLoopReductionChains() const {
1565 return InLoopReductionChains;
1566 }
1567
1568 /// Returns true if the Phi is part of an inloop reduction.
1569 bool isInLoopReduction(PHINode *Phi) const {
1570 return InLoopReductionChains.count(Phi);
1571 }
1572
1573 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1574 /// with factor VF. Return the cost of the instruction, including
1575 /// scalarization overhead if it's needed.
1576 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1577
1578 /// Estimate cost of a call instruction CI if it were vectorized with factor
1579 /// VF. Return the cost of the instruction, including scalarization overhead
1580 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1581 /// scalarized -
1582 /// i.e. either a vector version isn't available, or it is too expensive.
1583 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1584 bool &NeedToScalarize) const;
1585
1586 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1587 /// that of B.
1588 bool isMoreProfitable(const VectorizationFactor &A,
1589 const VectorizationFactor &B) const;
1590
1591 /// Invalidates decisions already taken by the cost model.
1592 void invalidateCostModelingDecisions() {
1593 WideningDecisions.clear();
1594 Uniforms.clear();
1595 Scalars.clear();
1596 }
1597
1598 /// Convenience function that returns the value of vscale_range if
1599 /// vscale_range.min == vscale_range.max, and otherwise returns the value
1600 /// returned by the corresponding TTI method.
1601 std::optional<unsigned> getVScaleForTuning() const;
1602
1603private:
1604 unsigned NumPredStores = 0;
1605
1606 /// \return An upper bound for the vectorization factors for both
1607 /// fixed and scalable vectorization, where the minimum-known number of
1608 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1609 /// disabled or unsupported, then the scalable part will be equal to
1610 /// ElementCount::getScalable(0).
1611 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1612 ElementCount UserVF,
1613 bool FoldTailByMasking);
1614
1615 /// \return the maximized element count based on the targets vector
1616 /// registers and the loop trip-count, but limited to a maximum safe VF.
1617 /// This is a helper function of computeFeasibleMaxVF.
1618 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1619 unsigned SmallestType,
1620 unsigned WidestType,
1621 ElementCount MaxSafeVF,
1622 bool FoldTailByMasking);
1623
1624 /// \return the maximum legal scalable VF, based on the safe max number
1625 /// of elements.
1626 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1627
1628 /// The vectorization cost is a combination of the cost itself and a boolean
1629 /// indicating whether any of the contributing operations will actually
1630 /// operate on vector values after type legalization in the backend. If this
1631 /// latter value is false, then all operations will be scalarized (i.e. no
1632 /// vectorization has actually taken place).
1633 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1634
1635 /// Returns the expected execution cost. The unit of the cost does
1636 /// not matter because we use the 'cost' units to compare different
1637 /// vector widths. The cost that is returned is *not* normalized by
1638 /// the factor width. If \p Invalid is not nullptr, this function
1639 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1640 /// each instruction that has an Invalid cost for the given VF.
1641 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1642 VectorizationCostTy
1643 expectedCost(ElementCount VF,
1644 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1645
1646 /// Returns the execution time cost of an instruction for a given vector
1647 /// width. Vector width of one means scalar.
1648 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1649
1650 /// The cost-computation logic from getInstructionCost which provides
1651 /// the vector type as an output parameter.
1652 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1653 Type *&VectorTy);
1654
1655 /// Return the cost of instructions in an inloop reduction pattern, if I is
1656 /// part of that pattern.
1657 std::optional<InstructionCost>
1658 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1659 TTI::TargetCostKind CostKind);
1660
1661 /// Calculate vectorization cost of memory instruction \p I.
1662 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1663
1664 /// The cost computation for scalarized memory instruction.
1665 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost computation for interleaving group of memory instructions.
1668 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost computation for Gather/Scatter instruction.
1671 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for widening instruction \p I with consecutive
1674 /// memory access.
1675 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1676
1677 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1678 /// Load: scalar load + broadcast.
1679 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1680 /// element)
1681 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1682
1683 /// Estimate the overhead of scalarizing an instruction. This is a
1684 /// convenience wrapper for the type-based getScalarizationOverhead API.
1685 InstructionCost getScalarizationOverhead(Instruction *I,
1686 ElementCount VF) const;
1687
1688 /// Returns true if an artificially high cost for emulated masked memrefs
1689 /// should be used.
1690 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1691
1692 /// Map of scalar integer values to the smallest bitwidth they can be legally
1693 /// represented as. The vector equivalents of these values should be truncated
1694 /// to this type.
1695 MapVector<Instruction *, uint64_t> MinBWs;
1696
1697 /// A type representing the costs for instructions if they were to be
1698 /// scalarized rather than vectorized. The entries are Instruction-Cost
1699 /// pairs.
1700 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1701
1702 /// A set containing all BasicBlocks that are known to be present after
1703 /// vectorization as a predicated block.
1704 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1705 PredicatedBBsAfterVectorization;
1706
1707 /// Records whether it is allowed to have the original scalar loop execute at
1708 /// least once. This may be needed as a fallback loop in case runtime
1709 /// aliasing/dependence checks fail, or to handle the tail/remainder
1710 /// iterations when the trip count is unknown or doesn't divide by the VF,
1711 /// or as a peel-loop to handle gaps in interleave-groups.
1712 /// Under optsize and when the trip count is very small we don't allow any
1713 /// iterations to execute in the scalar loop.
1714 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1715
1716 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1717 bool FoldTailByMasking = false;
1718
1719 /// A map holding scalar costs for different vectorization factors. The
1720 /// presence of a cost for an instruction in the mapping indicates that the
1721 /// instruction will be scalarized when vectorizing with the associated
1722 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1723 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1724
1725 /// Holds the instructions known to be uniform after vectorization.
1726 /// The data is collected per VF.
1727 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1728
1729 /// Holds the instructions known to be scalar after vectorization.
1730 /// The data is collected per VF.
1731 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1732
1733 /// Holds the instructions (address computations) that are forced to be
1734 /// scalarized.
1735 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1736
1737 /// PHINodes of the reductions that should be expanded in-loop along with
1738 /// their associated chains of reduction operations, in program order from top
1739 /// (PHI) to bottom
1740 ReductionChainMap InLoopReductionChains;
1741
1742 /// A Map of inloop reduction operations and their immediate chain operand.
1743 /// FIXME: This can be removed once reductions can be costed correctly in
1744 /// vplan. This was added to allow quick lookup to the inloop operations,
1745 /// without having to loop through InLoopReductionChains.
1746 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1747
1748 /// Returns the expected difference in cost from scalarizing the expression
1749 /// feeding a predicated instruction \p PredInst. The instructions to
1750 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1751 /// non-negative return value implies the expression will be scalarized.
1752 /// Currently, only single-use chains are considered for scalarization.
1753 InstructionCost computePredInstDiscount(Instruction *PredInst,
1754 ScalarCostsTy &ScalarCosts,
1755 ElementCount VF);
1756
1757 /// Collect the instructions that are uniform after vectorization. An
1758 /// instruction is uniform if we represent it with a single scalar value in
1759 /// the vectorized loop corresponding to each vector iteration. Examples of
1760 /// uniform instructions include pointer operands of consecutive or
1761 /// interleaved memory accesses. Note that although uniformity implies an
1762 /// instruction will be scalar, the reverse is not true. In general, a
1763 /// scalarized instruction will be represented by VF scalar values in the
1764 /// vectorized loop, each corresponding to an iteration of the original
1765 /// scalar loop.
1766 void collectLoopUniforms(ElementCount VF);
1767
1768 /// Collect the instructions that are scalar after vectorization. An
1769 /// instruction is scalar if it is known to be uniform or will be scalarized
1770 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1771 /// to the list if they are used by a load/store instruction that is marked as
1772 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1773 /// VF values in the vectorized loop, each corresponding to an iteration of
1774 /// the original scalar loop.
1775 void collectLoopScalars(ElementCount VF);
1776
1777 /// Keeps cost model vectorization decision and cost for instructions.
1778 /// Right now it is used for memory instructions only.
1779 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1780 std::pair<InstWidening, InstructionCost>>;
1781
1782 DecisionList WideningDecisions;
1783
1784 /// Returns true if \p V is expected to be vectorized and it needs to be
1785 /// extracted.
1786 bool needsExtract(Value *V, ElementCount VF) const {
1787 Instruction *I = dyn_cast<Instruction>(V);
1788 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1789 TheLoop->isLoopInvariant(I))
1790 return false;
1791
1792 // Assume we can vectorize V (and hence we need extraction) if the
1793 // scalars are not computed yet. This can happen, because it is called
1794 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1795 // the scalars are collected. That should be a safe assumption in most
1796 // cases, because we check if the operands have vectorizable types
1797 // beforehand in LoopVectorizationLegality.
1798 return Scalars.find(VF) == Scalars.end() ||
1799 !isScalarAfterVectorization(I, VF);
1800 };
1801
1802 /// Returns a range containing only operands needing to be extracted.
1803 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1804 ElementCount VF) const {
1805 return SmallVector<Value *, 4>(make_filter_range(
1806 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1807 }
1808
1809 /// Determines if we have the infrastructure to vectorize loop \p L and its
1810 /// epilogue, assuming the main loop is vectorized by \p VF.
1811 bool isCandidateForEpilogueVectorization(const Loop &L,
1812 const ElementCount VF) const;
1813
1814 /// Returns true if epilogue vectorization is considered profitable, and
1815 /// false otherwise.
1816 /// \p VF is the vectorization factor chosen for the original loop.
1817 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1818
1819public:
1820 /// The loop that we evaluate.
1821 Loop *TheLoop;
1822
1823 /// Predicated scalar evolution analysis.
1824 PredicatedScalarEvolution &PSE;
1825
1826 /// Loop Info analysis.
1827 LoopInfo *LI;
1828
1829 /// Vectorization legality.
1830 LoopVectorizationLegality *Legal;
1831
1832 /// Vector target information.
1833 const TargetTransformInfo &TTI;
1834
1835 /// Target Library Info.
1836 const TargetLibraryInfo *TLI;
1837
1838 /// Demanded bits analysis.
1839 DemandedBits *DB;
1840
1841 /// Assumption cache.
1842 AssumptionCache *AC;
1843
1844 /// Interface to emit optimization remarks.
1845 OptimizationRemarkEmitter *ORE;
1846
1847 const Function *TheFunction;
1848
1849 /// Loop Vectorize Hint.
1850 const LoopVectorizeHints *Hints;
1851
1852 /// The interleave access information contains groups of interleaved accesses
1853 /// with the same stride and close to each other.
1854 InterleavedAccessInfo &InterleaveInfo;
1855
1856 /// Values to ignore in the cost model.
1857 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1858
1859 /// Values to ignore in the cost model when VF > 1.
1860 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1861
1862 /// All element types found in the loop.
1863 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1864
1865 /// Profitable vector factors.
1866 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1867};
1868} // end namespace llvm
1869
1870/// Helper struct to manage generating runtime checks for vectorization.
1871///
1872 /// The runtime checks are created up-front in temporary blocks, un-linked from
1873 /// the existing IR, to allow better estimation of their cost. After deciding to
1874/// vectorize, the checks are moved back. If deciding not to vectorize, the
1875/// temporary blocks are completely removed.
1876class GeneratedRTChecks {
1877 /// Basic block which contains the generated SCEV checks, if any.
1878 BasicBlock *SCEVCheckBlock = nullptr;
1879
1880 /// The value representing the result of the generated SCEV checks. If it is
1881 /// nullptr, either no SCEV checks have been generated or they have been used.
1882 Value *SCEVCheckCond = nullptr;
1883
1884 /// Basic block which contains the generated memory runtime checks, if any.
1885 BasicBlock *MemCheckBlock = nullptr;
1886
1887 /// The value representing the result of the generated memory runtime checks.
1888 /// If it is nullptr, either no memory runtime checks have been generated or
1889 /// they have been used.
1890 Value *MemRuntimeCheckCond = nullptr;
1891
1892 DominatorTree *DT;
1893 LoopInfo *LI;
1894 TargetTransformInfo *TTI;
1895
1896 SCEVExpander SCEVExp;
1897 SCEVExpander MemCheckExp;
1898
1899 bool CostTooHigh = false;
1900
1901public:
1902 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1903 TargetTransformInfo *TTI, const DataLayout &DL)
1904 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1905 MemCheckExp(SE, DL, "scev.check") {}
1906
1907 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1908 /// accurately estimate the cost of the runtime checks. The blocks are
1909 /// un-linked from the IR and are added back during vector code generation. If
1910 /// there is no vector code generation, the check blocks are removed
1911 /// completely.
1912 void Create(Loop *L, const LoopAccessInfo &LAI,
1913 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1914
1915 // Hard cutoff to limit compile-time increase in case a very large number of
1916 // runtime checks needs to be generated.
1917 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1918 // profile info.
1919 CostTooHigh =
1920 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1921 if (CostTooHigh)
1922 return;
1923
1924 BasicBlock *LoopHeader = L->getHeader();
1925 BasicBlock *Preheader = L->getLoopPreheader();
1926
1927 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1928 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1929 // may be used by SCEVExpander. The blocks will be un-linked from their
1930 // predecessors and removed from LI & DT at the end of the function.
1931 if (!UnionPred.isAlwaysTrue()) {
1932 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1933 nullptr, "vector.scevcheck");
1934
1935 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1936 &UnionPred, SCEVCheckBlock->getTerminator());
1937 }
1938
1939 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1940 if (RtPtrChecking.Need) {
1941 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1942 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1943 "vector.memcheck");
1944
1945 auto DiffChecks = RtPtrChecking.getDiffChecks();
1946 if (DiffChecks) {
1947 Value *RuntimeVF = nullptr;
1948 MemRuntimeCheckCond = addDiffRuntimeChecks(
1949 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1950 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1951 if (!RuntimeVF)
1952 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1953 return RuntimeVF;
1954 },
1955 IC);
1956 } else {
1957 MemRuntimeCheckCond =
1958 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1959 RtPtrChecking.getChecks(), MemCheckExp);
1960 }
1961 assert(MemRuntimeCheckCond &&
1962 "no RT checks generated although RtPtrChecking "
1963 "claimed checks are required");
1964 }
1965
1966 if (!MemCheckBlock && !SCEVCheckBlock)
1967 return;
1968
1969 // Unhook the temporary block with the checks, update various places
1970 // accordingly.
1971 if (SCEVCheckBlock)
1972 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1973 if (MemCheckBlock)
1974 MemCheckBlock->replaceAllUsesWith(Preheader);
1975
1976 if (SCEVCheckBlock) {
1977 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1978 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1979 Preheader->getTerminator()->eraseFromParent();
1980 }
1981 if (MemCheckBlock) {
1982 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1983 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1984 Preheader->getTerminator()->eraseFromParent();
1985 }
1986
1987 DT->changeImmediateDominator(LoopHeader, Preheader);
1988 if (MemCheckBlock) {
1989 DT->eraseNode(MemCheckBlock);
1990 LI->removeBlock(MemCheckBlock);
1991 }
1992 if (SCEVCheckBlock) {
1993 DT->eraseNode(SCEVCheckBlock);
1994 LI->removeBlock(SCEVCheckBlock);
1995 }
1996 }
1997
1998 InstructionCost getCost() {
1999 if (SCEVCheckBlock || MemCheckBlock)
2000 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2001
2002 if (CostTooHigh) {
2003 InstructionCost Cost;
2004 Cost.setInvalid();
2005 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2006 return Cost;
2007 }
2008
2009 InstructionCost RTCheckCost = 0;
2010 if (SCEVCheckBlock)
2011 for (Instruction &I : *SCEVCheckBlock) {
2012 if (SCEVCheckBlock->getTerminator() == &I)
2013 continue;
2014 InstructionCost C =
2015 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2016 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2017 RTCheckCost += C;
2018 }
2019 if (MemCheckBlock)
2020 for (Instruction &I : *MemCheckBlock) {
2021 if (MemCheckBlock->getTerminator() == &I)
2022 continue;
2023 InstructionCost C =
2024 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2025 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2026 RTCheckCost += C;
2027 }
2028
2029 if (SCEVCheckBlock || MemCheckBlock)
2030 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2031 << "\n");
2032
2033 return RTCheckCost;
2034 }
2035
2036 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2037 /// unused.
2038 ~GeneratedRTChecks() {
2039 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2040 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2041 if (!SCEVCheckCond)
2042 SCEVCleaner.markResultUsed();
2043
2044 if (!MemRuntimeCheckCond)
2045 MemCheckCleaner.markResultUsed();
2046
2047 if (MemRuntimeCheckCond) {
2048 auto &SE = *MemCheckExp.getSE();
2049 // Memory runtime check generation creates compares that use expanded
2050 // values. Remove them before running the SCEVExpanderCleaners.
2051 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2052 if (MemCheckExp.isInsertedInstruction(&I))
2053 continue;
2054 SE.forgetValue(&I);
2055 I.eraseFromParent();
2056 }
2057 }
2058 MemCheckCleaner.cleanup();
2059 SCEVCleaner.cleanup();
2060
2061 if (SCEVCheckCond)
2062 SCEVCheckBlock->eraseFromParent();
2063 if (MemRuntimeCheckCond)
2064 MemCheckBlock->eraseFromParent();
2065 }
2066
2067 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2068 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2069 /// depending on the generated condition.
2070 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2071 BasicBlock *LoopVectorPreHeader,
2072 BasicBlock *LoopExitBlock) {
2073 if (!SCEVCheckCond)
2074 return nullptr;
2075
2076 Value *Cond = SCEVCheckCond;
2077 // Mark the check as used, to prevent it from being removed during cleanup.
2078 SCEVCheckCond = nullptr;
2079 if (auto *C = dyn_cast<ConstantInt>(Cond))
2080 if (C->isZero())
2081 return nullptr;
2082
2083 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2084
2085 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2086 // Create new preheader for vector loop.
2087 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2088 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2089
2090 SCEVCheckBlock->getTerminator()->eraseFromParent();
2091 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2092 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2093 SCEVCheckBlock);
2094
2095 DT->addNewBlock(SCEVCheckBlock, Pred);
2096 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2097
2098 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2099 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2100 return SCEVCheckBlock;
2101 }
2102
2103 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2104 /// the branches to branch to the vector preheader or \p Bypass, depending on
2105 /// the generated condition.
2106 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2107 BasicBlock *LoopVectorPreHeader) {
2108 // Check if we generated code that checks in runtime if arrays overlap.
2109 if (!MemRuntimeCheckCond)
2110 return nullptr;
2111
2112 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2113 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2114 MemCheckBlock);
2115
2116 DT->addNewBlock(MemCheckBlock, Pred);
2117 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2118 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2119
2120 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2121 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2122
2123 ReplaceInstWithInst(
2124 MemCheckBlock->getTerminator(),
2125 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2126 MemCheckBlock->getTerminator()->setDebugLoc(
2127 Pred->getTerminator()->getDebugLoc());
2128
2129 // Mark the check as used, to prevent it from being removed during cleanup.
2130 MemRuntimeCheckCond = nullptr;
2131 return MemCheckBlock;
2132 }
2133};
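// Editor's note: an illustrative sketch of the intended lifecycle, not part
// of the original source. The checks are created eagerly so their cost can
// feed the cost model, and are spliced back into the CFG only if
// vectorization actually happens; otherwise the destructor discards the
// temporary blocks:
//
//   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
//   Checks.Create(L, LAI, UnionPred, VF, IC);
//   InstructionCost RTCost = Checks.getCost();
//   // ... if the cost model decides to vectorize ...
//   Checks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
//   Checks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);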
2134
2135// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2136// vectorization. The loop needs to be annotated with #pragma omp simd
2137// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2138// vector length information is not provided, vectorization is not considered
2139// explicit. Interleave hints are not allowed either. These limitations will be
2140// relaxed in the future.
2141 // Please note that we are currently forced to abuse the pragma 'clang
2142// vectorize' semantics. This pragma provides *auto-vectorization hints*
2143// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2144// provides *explicit vectorization hints* (LV can bypass legal checks and
2145// assume that vectorization is legal). However, both hints are implemented
2146// using the same metadata (llvm.loop.vectorize, processed by
2147// LoopVectorizeHints). This will be fixed in the future when the native IR
2148// representation for pragma 'omp simd' is introduced.
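// For example (editor's illustration, not in the original source), an outer
// loop annotated as
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// carries explicit vector-length information and is considered here, whereas
// the same nest without vectorize_width(#) (or with an interleave hint) is
// rejected by the checks below.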
2149static bool isExplicitVecOuterLoop(Loop *OuterLp,
2150 OptimizationRemarkEmitter *ORE) {
2151 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2152 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2153
2154 // Only outer loops with an explicit vectorization hint are supported.
2155 // Unannotated outer loops are ignored.
2156 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2157 return false;
2158
2159 Function *Fn = OuterLp->getHeader()->getParent();
2160 if (!Hints.allowVectorization(Fn, OuterLp,
2161 true /*VectorizeOnlyWhenForced*/)) {
2162 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2163 return false;
2164 }
2165
2166 if (Hints.getInterleave() > 1) {
2167 // TODO: Interleave support is future work.
2168 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2169 "outer loops.\n");
2170 Hints.emitRemarkWithHints();
2171 return false;
2172 }
2173
2174 return true;
2175}
2176
2177static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2178 OptimizationRemarkEmitter *ORE,
2179 SmallVectorImpl<Loop *> &V) {
2180 // Collect inner loops and outer loops without irreducible control flow. For
2181 // now, only collect outer loops that have explicit vectorization hints. If we
2182 // are stress testing the VPlan H-CFG construction, we collect the outermost
2183 // loop of every loop nest.
2184 if (L.isInnermost() || VPlanBuildStressTest ||
2185 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2186 LoopBlocksRPO RPOT(&L);
2187 RPOT.perform(LI);
2188 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2189 V.push_back(&L);
2190 // TODO: Collect inner loops inside marked outer loops in case
2191 // vectorization fails for the outer loop. Do not invoke
2192 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2193 // already known to be reducible. We can use an inherited attribute for
2194 // that.
2195 return;
2196 }
2197 }
2198 for (Loop *InnerL : L)
2199 collectSupportedLoops(*InnerL, LI, ORE, V);
2200}
2201
2202namespace {
2203
2204/// The LoopVectorize Pass.
2205struct LoopVectorize : public FunctionPass {
2206 /// Pass identification, replacement for typeid
2207 static char ID;
2208
2209 LoopVectorizePass Impl;
2210
2211 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2212 bool VectorizeOnlyWhenForced = false)
2213 : FunctionPass(ID),
2214 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2215 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2216 }
2217
2218 bool runOnFunction(Function &F) override {
2219 if (skipFunction(F))
2220 return false;
2221
2222 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2223 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2224 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2225 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2226 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2227 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2228 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2229 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2230 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2231 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2232 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2233 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2234
2235 return Impl
2236 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2237 .MadeAnyChange;
2238 }
2239
2240 void getAnalysisUsage(AnalysisUsage &AU) const override {
2241 AU.addRequired<AssumptionCacheTracker>();
2242 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2243 AU.addRequired<DominatorTreeWrapperPass>();
2244 AU.addRequired<LoopInfoWrapperPass>();
2245 AU.addRequired<ScalarEvolutionWrapperPass>();
2246 AU.addRequired<TargetTransformInfoWrapperPass>();
2247 AU.addRequired<LoopAccessLegacyAnalysis>();
2248 AU.addRequired<DemandedBitsWrapperPass>();
2249 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2250 AU.addRequired<InjectTLIMappingsLegacy>();
2251
2252 // We currently do not preserve loopinfo/dominator analyses with outer loop
2253 // vectorization. Until this is addressed, mark these analyses as preserved
2254 // only for non-VPlan-native path.
2255 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2256 if (!EnableVPlanNativePath) {
2257 AU.addPreserved<LoopInfoWrapperPass>();
2258 AU.addPreserved<DominatorTreeWrapperPass>();
2259 }
2260
2261 AU.addPreserved<BasicAAWrapperPass>();
2262 AU.addPreserved<GlobalsAAWrapperPass>();
2263 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2264 }
2265};
2266
2267} // end anonymous namespace
2268
2269//===----------------------------------------------------------------------===//
2270// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2271// LoopVectorizationCostModel and LoopVectorizationPlanner.
2272//===----------------------------------------------------------------------===//
2273
2274Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2275 // We need to place the broadcast of invariant variables outside the loop,
2276 // but only if it's proven safe to do so. Else, broadcast will be inside
2277 // vector loop body.
2278 Instruction *Instr = dyn_cast<Instruction>(V);
2279 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2280 (!Instr ||
2281 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2282 // Place the code for broadcasting invariant variables in the new preheader.
2283 IRBuilder<>::InsertPointGuard Guard(Builder);
2284 if (SafeToHoist)
2285 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2286
2287 // Broadcast the scalar into all locations in the vector.
2288 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2289
2290 return Shuf;
2291}
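The helper above emits a vector splat of a loop-invariant value and hoists it into the vector preheader when that is provably safe. As a rough standalone illustration (plain C++ with an assumed VF of 4, not the LLVM builder API), a splat simply copies the scalar into every lane:

#include <array>
#include <cstdio>

int main() {
  constexpr int VF = 4;          // assumed fixed vectorization factor
  int Invariant = 42;            // stands in for the loop-invariant scalar V
  std::array<int, VF> Broadcast;
  Broadcast.fill(Invariant);     // every lane holds the same value
  for (int Lane : Broadcast)
    std::printf("%d ", Lane);    // prints: 42 42 42 42
  std::printf("\n");
  return 0;
}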
2292
2293/// This function adds
2294/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2295/// to each vector element of Val. The sequence starts at StartIdx.
2296/// \p BinOp is relevant for an FP induction variable.
2297static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2298 Instruction::BinaryOps BinOp, ElementCount VF,
2299 IRBuilderBase &Builder) {
2300 assert(VF.isVector() && "only vector VFs are supported");
2301
2302 // Create and check the types.
2303 auto *ValVTy = cast<VectorType>(Val->getType());
2304 ElementCount VLen = ValVTy->getElementCount();
2305
2306 Type *STy = Val->getType()->getScalarType();
2307 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2308 "Induction Step must be an integer or FP");
2309 assert(Step->getType() == STy && "Step has wrong type");
2310
2311 SmallVector<Constant *, 8> Indices;
2312
2313 // Create a vector of consecutive numbers from zero to VF.
2314 VectorType *InitVecValVTy = ValVTy;
2315 if (STy->isFloatingPointTy()) {
2316 Type *InitVecValSTy =
2317 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2318 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2319 }
2320 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2321
2322 // Splat the StartIdx
2323 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2324
2325 if (STy->isIntegerTy()) {
2326 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2327 Step = Builder.CreateVectorSplat(VLen, Step);
2328 assert(Step->getType() == Val->getType() && "Invalid step vec");
2329 // FIXME: The newly created binary instructions should contain nsw/nuw
2330 // flags, which can be found from the original scalar operations.
2331 Step = Builder.CreateMul(InitVec, Step);
2332 return Builder.CreateAdd(Val, Step, "induction");
2333 }
2334
2335 // Floating point induction.
2336 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2337 "Binary Opcode should be specified for FP induction");
2338 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2339 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2340
2341 Step = Builder.CreateVectorSplat(VLen, Step);
2342 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2343 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2344}
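For the integer case, the function above computes, lane by lane, Val + (StartIdx + lane) * Step. A minimal standalone sketch of that arithmetic (plain C++ with assumed VF, StartIdx and Step values, no LLVM types), purely for illustration:

#include <array>
#include <cstdio>

int main() {
  constexpr int VF = 4;
  std::array<int, VF> Val = {0, 0, 0, 0}; // broadcast scalar IV (here 0)
  int StartIdx = 8;                       // first index covered by this part
  int Step = 2;                           // loop-invariant induction step
  for (int Lane = 0; Lane < VF; ++Lane)
    Val[Lane] += (StartIdx + Lane) * Step;
  for (int V : Val)
    std::printf("%d ", V);                // prints: 16 18 20 22
  std::printf("\n");
  return 0;
}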
2345
2346/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2347/// variable on which to base the steps, and \p Step is the size of the step.
2348static void buildScalarSteps(Value *ScalarIV, Value *Step,
2349 const InductionDescriptor &ID, VPValue *Def,
2350 VPTransformState &State) {
2351 IRBuilderBase &Builder = State.Builder;
2352
2353 // Ensure step has the same type as that of scalar IV.
2354 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2355 if (ScalarIVTy != Step->getType()) {
2356 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2357 // avoid separate truncate here.
2358 assert(Step->getType()->isIntegerTy() &&
2359 "Truncation requires an integer step");
2360 Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2361 }
2362
2363 // We build scalar steps for both integer and floating-point induction
2364 // variables. Here, we determine the kind of arithmetic we will perform.
2365 Instruction::BinaryOps AddOp;
2366 Instruction::BinaryOps MulOp;
2367 if (ScalarIVTy->isIntegerTy()) {
2368 AddOp = Instruction::Add;
2369 MulOp = Instruction::Mul;
2370 } else {
2371 AddOp = ID.getInductionOpcode();
2372 MulOp = Instruction::FMul;
2373 }
2374
2375 // Determine the number of scalars we need to generate for each unroll
2376 // iteration.
2377 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2378 // Compute the scalar steps and save the results in State.
2379 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2380 ScalarIVTy->getScalarSizeInBits());
2381 Type *VecIVTy = nullptr;
2382 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2383 if (!FirstLaneOnly && State.VF.isScalable()) {
2384 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2385 UnitStepVec =
2386 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2387 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2388 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2389 }
2390
2391 unsigned StartPart = 0;
2392 unsigned EndPart = State.UF;
2393 unsigned StartLane = 0;
2394 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2395 if (State.Instance) {
2396 StartPart = State.Instance->Part;
2397 EndPart = StartPart + 1;
2398 StartLane = State.Instance->Lane.getKnownLane();
2399 EndLane = StartLane + 1;
2400 }
2401 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2402 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2403
2404 if (!FirstLaneOnly && State.VF.isScalable()) {
2405 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2406 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2407 if (ScalarIVTy->isFloatingPointTy())
2408 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2409 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2410 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2411 State.set(Def, Add, Part);
2412 // It's useful to record the lane values too for the known minimum number
2413 // of elements so we do those below. This improves the code quality when
2414 // trying to extract the first element, for example.
2415 }
2416
2417 if (ScalarIVTy->isFloatingPointTy())
2418 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2419
2420 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2421 Value *StartIdx = Builder.CreateBinOp(
2422 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2423 // The step returned by `createStepForVF` is a runtime-evaluated value
2424 // when VF is scalable. Otherwise, it should be folded into a Constant.
2425 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2426 "Expected StartIdx to be folded to a constant when VF is not "
2427 "scalable");
2428 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2429 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2430 State.set(Def, Add, VPIteration(Part, Lane));
2431 }
2432 }
2433}
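Ignoring scalable vectors and the State bookkeeping, the per-lane values produced above follow ScalarIV + (Part * VF + Lane) * Step. A hedged sketch of just that arithmetic, with made-up VF, UF and Step values:

#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2; // assumed fixed vectorization/unroll factors
  const int ScalarIV = 100;      // scalar induction value entering the loop
  const int Step = 3;            // loop-invariant step
  for (unsigned Part = 0; Part < UF; ++Part) {
    int StartIdx0 = static_cast<int>(Part * VF); // fixed-VF step for this part
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      int StartIdx = StartIdx0 + static_cast<int>(Lane);
      std::printf("part %u lane %u -> %d\n", Part, Lane,
                  ScalarIV + StartIdx * Step);
    }
  }
  return 0;
}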
2434
2435// Generate code for the induction step. Note that induction steps are
2436// required to be loop-invariant.
2437static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2438 Instruction *InsertBefore,
2439 Loop *OrigLoop = nullptr) {
2440 const DataLayout &DL = SE.getDataLayout();
2441 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2442 "Induction step should be loop invariant");
2443 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2444 return E->getValue();
2445
2446 SCEVExpander Exp(SE, DL, "induction");
2447 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2448}
2449
2450/// Compute the transformed value of Index at offset StartValue using step
2451/// StepValue.
2452/// For integer induction, returns StartValue + Index * StepValue.
2453/// For pointer induction, returns StartValue[Index * StepValue].
2454/// FIXME: The newly created binary instructions should contain nsw/nuw
2455/// flags, which can be found from the original scalar operations.
2456static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2457 Value *StartValue, Value *Step,
2458 const InductionDescriptor &ID) {
2459 Type *StepTy = Step->getType();
2460 Value *CastedIndex = StepTy->isIntegerTy()
2461 ? B.CreateSExtOrTrunc(Index, StepTy)
2462 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2463 if (CastedIndex != Index) {
2464 CastedIndex->setName(CastedIndex->getName() + ".cast");
2465 Index = CastedIndex;
2466 }
2467
2468 // Note: the IR at this point is broken. We cannot use SE to create any new
2469 // SCEV and then expand it, hoping that SCEV's simplification will give us
2470 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2471 // lead to various SCEV crashes. So all we can do is to use builder and rely
2472 // on InstCombine for future simplifications. Here we handle some trivial
2473 // cases only.
2474 auto CreateAdd = [&B](Value *X, Value *Y) {
2475 assert(X->getType() == Y->getType() && "Types don't match!");
2476 if (auto *CX = dyn_cast<ConstantInt>(X))
2477 if (CX->isZero())
2478 return Y;
2479 if (auto *CY = dyn_cast<ConstantInt>(Y))
2480 if (CY->isZero())
2481 return X;
2482 return B.CreateAdd(X, Y);
2483 };
2484
2485 // We allow X to be a vector type, in which case Y will potentially be
2486 // splatted into a vector with the same element count.
2487 auto CreateMul = [&B](Value *X, Value *Y) {
2488 assert(X->getType()->getScalarType() == Y->getType() &&
2489 "Types don't match!");
2490 if (auto *CX = dyn_cast<ConstantInt>(X))
2491 if (CX->isOne())
2492 return Y;
2493 if (auto *CY = dyn_cast<ConstantInt>(Y))
2494 if (CY->isOne())
2495 return X;
2496 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2497 if (XVTy && !isa<VectorType>(Y->getType()))
2498 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2499 return B.CreateMul(X, Y);
2500 };
2501
2502 switch (ID.getKind()) {
2503 case InductionDescriptor::IK_IntInduction: {
2504 assert(!isa<VectorType>(Index->getType()) &&
2505 "Vector indices not supported for integer inductions yet");
2506 assert(Index->getType() == StartValue->getType() &&
2507 "Index type does not match StartValue type");
2508 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2509 return B.CreateSub(StartValue, Index);
2510 auto *Offset = CreateMul(Index, Step);
2511 return CreateAdd(StartValue, Offset);
2512 }
2513 case InductionDescriptor::IK_PtrInduction: {
2514 assert(isa<Constant>(Step) &&
2515 "Expected constant step for pointer induction");
2516 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2517 }
2518 case InductionDescriptor::IK_FpInduction: {
2519 assert(!isa<VectorType>(Index->getType()) &&
2520 "Vector indices not supported for FP inductions yet");
2521 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2522 auto InductionBinOp = ID.getInductionBinOp();
2523 assert(InductionBinOp &&
2524 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2525 InductionBinOp->getOpcode() == Instruction::FSub) &&
2526 "Original bin op should be defined for FP induction");
2527
2528 Value *MulExp = B.CreateFMul(Step, Index);
2529 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2530 "induction");
2531 }
2532 case InductionDescriptor::IK_NoInduction:
2533 return nullptr;
2534 }
2535 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2535)
;
2536}
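In plain terms, the integer case above yields StartValue + Index * StepValue, and the pointer case indexes StartValue by Index * StepValue elements. A small standalone model of those two formulas, with illustrative values only:

#include <cstdio>

int main() {
  // Integer induction: Start + Index * Step.
  long Start = 10, Step = 4, Index = 7;
  long Transformed = Start + Index * Step;       // 38

  // Pointer induction: &Start[Index * Step], i.e. a GEP over the element type.
  int Data[64] = {};
  int *PtrStart = Data;
  int *PtrTransformed = PtrStart + Index * Step; // element offset 28

  std::printf("int: %ld, pointer element offset: %td\n", Transformed,
              PtrTransformed - Data);
  return 0;
}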
2537
2538void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2539 const VPIteration &Instance,
2540 VPTransformState &State) {
2541 Value *ScalarInst = State.get(Def, Instance);
2542 Value *VectorValue = State.get(Def, Instance.Part);
2543 VectorValue = Builder.CreateInsertElement(
2544 VectorValue, ScalarInst,
2545 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2546 State.set(Def, VectorValue, Instance.Part);
2547}
2548
2549// Return whether we allow using masked interleave-groups (for dealing with
2550// strided loads/stores that reside in predicated blocks, or for dealing
2551// with gaps).
2552static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2553 // If an override option has been passed in for interleaved accesses, use it.
2554 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2555 return EnableMaskedInterleavedMemAccesses;
2556
2557 return TTI.enableMaskedInterleavedAccessVectorization();
2558}
2559
2560// Try to vectorize the interleave group that \p Instr belongs to.
2561//
2562// E.g. Translate following interleaved load group (factor = 3):
2563// for (i = 0; i < N; i+=3) {
2564// R = Pic[i]; // Member of index 0
2565// G = Pic[i+1]; // Member of index 1
2566// B = Pic[i+2]; // Member of index 2
2567// ... // do something to R, G, B
2568// }
2569// To:
2570// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2571// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2572// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2573// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2574//
2575// Or translate following interleaved store group (factor = 3):
2576// for (i = 0; i < N; i+=3) {
2577// ... do something to R, G, B
2578// Pic[i] = R; // Member of index 0
2579// Pic[i+1] = G; // Member of index 1
2580// Pic[i+2] = B; // Member of index 2
2581// }
2582// To:
2583// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2584// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2585// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2586// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2587// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2588void InnerLoopVectorizer::vectorizeInterleaveGroup(
2589 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2590 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2591 VPValue *BlockInMask) {
2592 Instruction *Instr = Group->getInsertPos();
2593 const DataLayout &DL = Instr->getModule()->getDataLayout();
2594
2595 // Prepare for the vector type of the interleaved load/store.
2596 Type *ScalarTy = getLoadStoreType(Instr);
2597 unsigned InterleaveFactor = Group->getFactor();
2598 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2599 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2600
2601 // Prepare for the new pointers.
2602 SmallVector<Value *, 2> AddrParts;
2603 unsigned Index = Group->getIndex(Instr);
2604
2605 // TODO: extend the masked interleaved-group support to reversed access.
2606 assert((!BlockInMask || !Group->isReverse()) &&
2607 "Reversed masked interleave-group not supported.");
2608
2609 // If the group is reverse, adjust the index to refer to the last vector lane
2610 // instead of the first. We adjust the index from the first vector lane,
2611 // rather than directly getting the pointer for lane VF - 1, because the
2612 // pointer operand of the interleaved access is supposed to be uniform. For
2613 // uniform instructions, we're only required to generate a value for the
2614 // first vector lane in each unroll iteration.
2615 if (Group->isReverse())
2616 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2617
2618 for (unsigned Part = 0; Part < UF; Part++) {
2619 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2620 State.setDebugLocFromInst(AddrPart);
2621
2622 // Note that the current instruction could be at any member index. We need
2623 // to adjust the address to the member of index 0.
2624 //
2625 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2626 // b = A[i]; // Member of index 0
2627 // Current pointer is pointed to A[i+1], adjust it to A[i].
2628 //
2629 // E.g. A[i+1] = a; // Member of index 1
2630 // A[i] = b; // Member of index 0
2631 // A[i+2] = c; // Member of index 2 (Current instruction)
2632 // Current pointer is pointed to A[i+2], adjust it to A[i].
2633
2634 bool InBounds = false;
2635 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2636 InBounds = gep->isInBounds();
2637 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2638 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2639
2640 // Cast to the vector pointer type.
2641 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2642 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2643 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2644 }
2645
2646 State.setDebugLocFromInst(Instr);
2647 Value *PoisonVec = PoisonValue::get(VecTy);
2648
2649 Value *MaskForGaps = nullptr;
2650 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2651 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2652 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2653 }
2654
2655 // Vectorize the interleaved load group.
2656 if (isa<LoadInst>(Instr)) {
2657 // For each unroll part, create a wide load for the group.
2658 SmallVector<Value *, 2> NewLoads;
2659 for (unsigned Part = 0; Part < UF; Part++) {
2660 Instruction *NewLoad;
2661 if (BlockInMask || MaskForGaps) {
2662 assert(useMaskedInterleavedAccesses(*TTI) &&
2663 "masked interleaved groups are not allowed.");
2664 Value *GroupMask = MaskForGaps;
2665 if (BlockInMask) {
2666 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2667 Value *ShuffledMask = Builder.CreateShuffleVector(
2668 BlockInMaskPart,
2669 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2670 "interleaved.mask");
2671 GroupMask = MaskForGaps
2672 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2673 MaskForGaps)
2674 : ShuffledMask;
2675 }
2676 NewLoad =
2677 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2678 GroupMask, PoisonVec, "wide.masked.vec");
2679 }
2680 else
2681 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2682 Group->getAlign(), "wide.vec");
2683 Group->addMetadata(NewLoad);
2684 NewLoads.push_back(NewLoad);
2685 }
2686
2687 // For each member in the group, shuffle out the appropriate data from the
2688 // wide loads.
2689 unsigned J = 0;
2690 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2691 Instruction *Member = Group->getMember(I);
2692
2693 // Skip the gaps in the group.
2694 if (!Member)
2695 continue;
2696
2697 auto StrideMask =
2698 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2699 for (unsigned Part = 0; Part < UF; Part++) {
2700 Value *StridedVec = Builder.CreateShuffleVector(
2701 NewLoads[Part], StrideMask, "strided.vec");
2702
2703 // If this member has a different type, cast the result type.
2704 if (Member->getType() != ScalarTy) {
2705 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2706 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2707 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2708 }
2709
2710 if (Group->isReverse())
2711 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2712
2713 State.set(VPDefs[J], StridedVec, Part);
2714 }
2715 ++J;
2716 }
2717 return;
2718 }
2719
2720 // The sub vector type for current instruction.
2721 auto *SubVT = VectorType::get(ScalarTy, VF);
2722
2723 // Vectorize the interleaved store group.
2724 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2725 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2726 "masked interleaved groups are not allowed.");
2727 assert((!MaskForGaps || !VF.isScalable()) &&
2728 "masking gaps for scalable vectors is not yet supported.");
2729 for (unsigned Part = 0; Part < UF; Part++) {
2730 // Collect the stored vector from each member.
2731 SmallVector<Value *, 4> StoredVecs;
2732 unsigned StoredIdx = 0;
2733 for (unsigned i = 0; i < InterleaveFactor; i++) {
2734 assert((Group->getMember(i) || MaskForGaps) &&
2735 "Fail to get a member from an interleaved store group");
2736 Instruction *Member = Group->getMember(i);
2737
2738 // Skip the gaps in the group.
2739 if (!Member) {
2740 Value *Undef = PoisonValue::get(SubVT);
2741 StoredVecs.push_back(Undef);
2742 continue;
2743 }
2744
2745 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2746 ++StoredIdx;
2747
2748 if (Group->isReverse())
2749 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2750
2751 // If this member has a different type, cast it to a unified type.
2752
2753 if (StoredVec->getType() != SubVT)
2754 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2755
2756 StoredVecs.push_back(StoredVec);
2757 }
2758
2759 // Concatenate all vectors into a wide vector.
2760 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2761
2762 // Interleave the elements in the wide vector.
2763 Value *IVec = Builder.CreateShuffleVector(
2764 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2765 "interleaved.vec");
2766
2767 Instruction *NewStoreInstr;
2768 if (BlockInMask || MaskForGaps) {
2769 Value *GroupMask = MaskForGaps;
2770 if (BlockInMask) {
2771 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2772 Value *ShuffledMask = Builder.CreateShuffleVector(
2773 BlockInMaskPart,
2774 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2775 "interleaved.mask");
2776 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2777 ShuffledMask, MaskForGaps)
2778 : ShuffledMask;
2779 }
2780 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2781 Group->getAlign(), GroupMask);
2782 } else
2783 NewStoreInstr =
2784 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2785
2786 Group->addMetadata(NewStoreInstr);
2787 }
2788}
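The strided shuffle masks used above (for example <0, 3, 6, 9> with factor 3 and VF 4) pick every Factor-th element for one member of the group. A plain-array model of that de-interleaving step, with an assumed factor and VF, just to show what the masks select:

#include <cstdio>

int main() {
  const unsigned Factor = 3, VF = 4;        // assumed group factor and VF
  int Wide[Factor * VF];                    // stands in for %wide.vec
  for (unsigned I = 0; I < Factor * VF; ++I)
    Wide[I] = static_cast<int>(I);

  // Member M is gathered from indices M, M + Factor, M + 2*Factor, ...
  for (unsigned Member = 0; Member < Factor; ++Member) {
    std::printf("member %u:", Member);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %d", Wide[Member + Lane * Factor]);
    std::printf("\n");                      // e.g. member 0: 0 3 6 9
  }
  return 0;
}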
2789
2790void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2791 VPReplicateRecipe *RepRecipe,
2792 const VPIteration &Instance,
2793 bool IfPredicateInstr,
2794 VPTransformState &State) {
2795 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2796
2797 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2798 // the first lane and part.
2799 if (isa<NoAliasScopeDeclInst>(Instr))
2800 if (!Instance.isFirstIteration())
2801 return;
2802
2803 // Does this instruction return a value ?
2804 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2805
2806 Instruction *Cloned = Instr->clone();
2807 if (!IsVoidRetTy)
2808 Cloned->setName(Instr->getName() + ".cloned");
2809
2810 // If the scalarized instruction contributes to the address computation of a
2811 // widen masked load/store which was in a basic block that needed predication
2812 // and is not predicated after vectorization, we can't propagate
2813 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2814 // instruction could feed a poison value to the base address of the widen
2815 // load/store.
2816 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2817 Cloned->dropPoisonGeneratingFlags();
2818
2819 if (Instr->getDebugLoc())
2820 State.setDebugLocFromInst(Instr);
2821
2822 // Replace the operands of the cloned instructions with their scalar
2823 // equivalents in the new loop.
2824 for (const auto &I : enumerate(RepRecipe->operands())) {
2825 auto InputInstance = Instance;
2826 VPValue *Operand = I.value();
2827 if (vputils::isUniformAfterVectorization(Operand))
2828 InputInstance.Lane = VPLane::getFirstLane();
2829 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2830 }
2831 State.addNewMetadata(Cloned, Instr);
2832
2833 // Place the cloned scalar in the new loop.
2834 State.Builder.Insert(Cloned);
2835
2836 State.set(RepRecipe, Cloned, Instance);
2837
2838 // If we just cloned a new assumption, add it the assumption cache.
2839 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2840 AC->registerAssumption(II);
2841
2842 // End if-block.
2843 if (IfPredicateInstr)
2844 PredicatedInstructions.push_back(Cloned);
2845}
2846
2847Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2848 if (TripCount)
2849 return TripCount;
2850
2851 assert(InsertBlock);
2852 IRBuilder<> Builder(InsertBlock->getTerminator());
2853 // Find the loop boundaries.
2854 Type *IdxTy = Legal->getWidestInductionType();
2855 assert(IdxTy && "No type for induction");
2856 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2857
2858 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2859
2860 // Expand the trip count and place the new instructions in the preheader.
2861 // Notice that the pre-header does not change, only the loop body.
2862 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2863
2864 // Count holds the overall loop count (N).
2865 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2866 InsertBlock->getTerminator());
2867
2868 if (TripCount->getType()->isPointerTy())
2869 TripCount =
2870 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2871 InsertBlock->getTerminator());
2872
2873 return TripCount;
2874}
2875
2876Value *
2877InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2878 if (VectorTripCount)
2879 return VectorTripCount;
2880
2881 Value *TC = getOrCreateTripCount(InsertBlock);
2882 IRBuilder<> Builder(InsertBlock->getTerminator());
2883
2884 Type *Ty = TC->getType();
2885 // This is where we can make the step a runtime constant.
2886 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2887
2888 // If the tail is to be folded by masking, round the number of iterations N
2889 // up to a multiple of Step instead of rounding down. This is done by first
2890 // adding Step-1 and then rounding down. Note that it's ok if this addition
2891 // overflows: the vector induction variable will eventually wrap to zero given
2892 // that it starts at zero and its Step is a power of two; the loop will then
2893 // exit, with the last early-exit vector comparison also producing all-true.
2894 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2895 // is accounted for in emitIterationCountCheck that adds an overflow check.
2896 if (Cost->foldTailByMasking()) {
2897 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2898 "VF*UF must be a power of 2 when folding tail by masking");
2899 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2900 TC = Builder.CreateAdd(
2901 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2902 }
2903
2904 // Now we need to generate the expression for the part of the loop that the
2905 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2906 // iterations are not required for correctness, or N - Step, otherwise. Step
2907 // is equal to the vectorization factor (number of SIMD elements) times the
2908 // unroll factor (number of SIMD instructions).
2909 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2910
2911 // There are cases where we *must* run at least one iteration in the remainder
2912 // loop. See the cost model for when this can happen. If the step evenly
2913 // divides the trip count, we set the remainder to be equal to the step. If
2914 // the step does not evenly divide the trip count, no adjustment is necessary
2915 // since there will already be scalar iterations. Note that the minimum
2916 // iterations check ensures that N >= Step.
2917 if (Cost->requiresScalarEpilogue(VF)) {
2918 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2919 R = Builder.CreateSelect(IsZero, Step, R);
2920 }
2921
2922 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2923
2924 return VectorTripCount;
2925}
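For a fixed VF the computation above reduces to N - (N % Step) with Step = VF * UF, rounding N up first when the tail is folded and forcing a non-zero remainder when a scalar epilogue must run. A hedged standalone sketch of that arithmetic, with illustrative trip counts:

#include <cstdio>

unsigned vectorTripCount(unsigned N, unsigned VF, unsigned UF, bool FoldTail,
                         bool RequiresScalarEpilogue) {
  unsigned Step = VF * UF;
  if (FoldTail)
    N += Step - 1;            // round the iteration count up to a multiple
  unsigned R = N % Step;      // iterations left for the scalar loop
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                 // keep at least one full step for the epilogue
  return N - R;
}

int main() {
  std::printf("%u\n", vectorTripCount(100, 4, 2, false, false)); // 96
  std::printf("%u\n", vectorTripCount(96, 4, 2, false, true));   // 88
  std::printf("%u\n", vectorTripCount(100, 4, 2, true, false));  // 104
  return 0;
}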
2926
2927Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2928 const DataLayout &DL) {
2929 // Verify that V is a vector type with same number of elements as DstVTy.
2930 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2931 unsigned VF = DstFVTy->getNumElements();
2932 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2933 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2934 Type *SrcElemTy = SrcVecTy->getElementType();
2935 Type *DstElemTy = DstFVTy->getElementType();
2936 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2937 "Vector elements must have same size");
2938
2939 // Do a direct cast if element types are castable.
2940 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2941 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2942 }
2943 // V cannot be directly casted to desired vector type.
2944 // May happen when V is a floating point vector but DstVTy is a vector of
2945 // pointers or vice-versa. Handle this using a two-step bitcast using an
2946 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2947 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2948 "Only one type should be a pointer type");
2949 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2950 "Only one type should be a floating point type");
2951 Type *IntTy =
2952 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2953 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2954 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2955 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2956}
2957
2958void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2959 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2960 // Reuse existing vector loop preheader for TC checks.
2961 // Note that a new preheader block is generated for the vector loop.
2962 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2963 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2964
2965 // Generate code to check if the loop's trip count is less than VF * UF, or
2966 // equal to it in case a scalar epilogue is required; this implies that the
2967 // vector trip count is zero. This check also covers the case where adding one
2968 // to the backedge-taken count overflowed leading to an incorrect trip count
2969 // of zero. In this case we will also jump to the scalar loop.
2970 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2971 : ICmpInst::ICMP_ULT;
2972
2973 // If tail is to be folded, vector loop takes care of all iterations.
2974 Type *CountTy = Count->getType();
2975 Value *CheckMinIters = Builder.getFalse();
2976 auto CreateStep = [&]() -> Value * {
2977 // Create step with max(MinProTripCount, UF * VF).
2978 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2979 return createStepForVF(Builder, CountTy, VF, UF);
2980
2981 Value *MinProfTC =
2982 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2983 if (!VF.isScalable())
2984 return MinProfTC;
2985 return Builder.CreateBinaryIntrinsic(
2986 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2987 };
2988
2989 if (!Cost->foldTailByMasking())
2990 CheckMinIters =
2991 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2992 else if (VF.isScalable()) {
2993 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2994 // an overflow to zero when updating induction variables and so an
2995 // additional overflow check is required before entering the vector loop.
2996
2997 // Get the maximum unsigned value for the type.
2998 Value *MaxUIntTripCount =
2999 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3000 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3001
3002 // Don't execute the vector loop if (UMax - n) < (VF * UF).
3003 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3004 }
3005
3006 // Create new preheader for vector loop.
3007 LoopVectorPreHeader =
3008 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3009 "vector.ph");
3010
3011 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3012 DT->getNode(Bypass)->getIDom()) &&
3013 "TC check is expected to dominate Bypass");
3014
3015 // Update dominator for Bypass & LoopExit (if needed).
3016 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3017 if (!Cost->requiresScalarEpilogue(VF))
3018 // If there is an epilogue which must run, there's no edge from the
3019 // middle block to exit blocks and thus no need to update the immediate
3020 // dominator of the exit blocks.
3021 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3022
3023 ReplaceInstWithInst(
3024 TCCheckBlock->getTerminator(),
3025 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3026 LoopBypassBlocks.push_back(TCCheckBlock);
3027}
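Boiled down, the branch inserted above sends execution to the scalar loop when the trip count is below VF * UF, or below-or-equal when a scalar epilogue is mandatory. A tiny model of that predicate, with made-up counts and a fixed step:

#include <cstdint>
#include <cstdio>

bool takeScalarBypass(uint64_t Count, uint64_t Step,
                      bool RequiresScalarEpilogue) {
  // ICMP_ULE when an epilogue must run, ICMP_ULT otherwise.
  return RequiresScalarEpilogue ? Count <= Step : Count < Step;
}

int main() {
  std::printf("%d\n", takeScalarBypass(7, 8, false)); // 1: too few iterations
  std::printf("%d\n", takeScalarBypass(8, 8, false)); // 0: one full vector trip
  std::printf("%d\n", takeScalarBypass(8, 8, true));  // 1: epilogue must run
  return 0;
}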
3028
3029BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3030 BasicBlock *const SCEVCheckBlock =
3031 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3032 if (!SCEVCheckBlock)
3033 return nullptr;
3034
3035 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3036 (OptForSizeBasedOnProfile &&
3037 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3038 "Cannot SCEV check stride or overflow when optimizing for size");
3039
3040
3041 // Update dominator only if this is first RT check.
3042 if (LoopBypassBlocks.empty()) {
3043 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3044 if (!Cost->requiresScalarEpilogue(VF))
3045 // If there is an epilogue which must run, there's no edge from the
3046 // middle block to exit blocks and thus no need to update the immediate
3047 // dominator of the exit blocks.
3048 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3049 }
3050
3051 LoopBypassBlocks.push_back(SCEVCheckBlock);
3052 AddedSafetyChecks = true;
3053 return SCEVCheckBlock;
3054}
3055
3056BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3057 // VPlan-native path does not do any analysis for runtime checks currently.
3058 if (EnableVPlanNativePath)
3059 return nullptr;
3060
3061 BasicBlock *const MemCheckBlock =
3062 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3063
3064 // Check if we generated code that checks at runtime whether arrays overlap. We put
3065 // the checks into a separate block to make the more common case of few
3066 // elements faster.
3067 if (!MemCheckBlock)
3068 return nullptr;
3069
3070 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3071 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3072 "Cannot emit memory checks when optimizing for size, unless forced "
3073 "to vectorize.");
3074 ORE->emit([&]() {
3075 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3076 OrigLoop->getStartLoc(),
3077 OrigLoop->getHeader())
3078 << "Code-size may be reduced by not forcing "
3079 "vectorization, or by source-code modifications "
3080 "eliminating the need for runtime checks "
3081 "(e.g., adding 'restrict').";
3082 });
3083 }
3084
3085 LoopBypassBlocks.push_back(MemCheckBlock);
3086
3087 AddedSafetyChecks = true;
3088
3089 return MemCheckBlock;
3090}
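// Illustrative note (not part of the original source): the remark emitted
// above points at 'restrict' as the source-level fix. A caller-side loop such
// as
//
//   void axpy(float *__restrict a, const float *__restrict b, int n) {
//     for (int i = 0; i < n; ++i)
//       a[i] += b[i];
//   }
//
// needs no MemCheckBlock at all, because the restrict qualifiers already
// promise that the two accesses cannot overlap.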
3091
3092void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3093 LoopScalarBody = OrigLoop->getHeader();
3094 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3095 assert(LoopVectorPreHeader && "Invalid loop structure");
3096 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3097 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3098 "multiple exit loop without required epilogue?");
3099
3100 LoopMiddleBlock =
3101 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3102 LI, nullptr, Twine(Prefix) + "middle.block");
3103 LoopScalarPreHeader =
3104 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3105 nullptr, Twine(Prefix) + "scalar.ph");
3106
3107 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3108
3109 // Set up the middle block terminator. Two cases:
3110 // 1) If we know that we must execute the scalar epilogue, emit an
3111 // unconditional branch.
3112 // 2) Otherwise, we must have a single unique exit block (due to how we
3113 // implement the multiple exit case). In this case, set up a conditional
3114 // branch from the middle block to the loop scalar preheader, and the
3115 // exit block. completeLoopSkeleton will update the condition to use an
3116 // iteration check, if required to decide whether to execute the remainder.
3117 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3118 BranchInst::Create(LoopScalarPreHeader) :
3119 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3120 Builder.getTrue());
3121 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3122 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3123
3124 // Update dominator for loop exit. During skeleton creation, only the vector
3125 // pre-header and the middle block are created. The vector loop is entirely
3126 // created during VPlan execution.
3127 if (!Cost->requiresScalarEpilogue(VF))
3128 // If there is an epilogue which must run, there's no edge from the
3129 // middle block to exit blocks and thus no need to update the immediate
3130 // dominator of the exit blocks.
3131 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3132}
3133
3134PHINode *InnerLoopVectorizer::createInductionResumeValue(
3135 PHINode *OrigPhi, const InductionDescriptor &II,
3136 ArrayRef<BasicBlock *> BypassBlocks,
3137 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3138 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3139 assert(VectorTripCount && "Expected valid arguments");
3140
3141 Instruction *OldInduction = Legal->getPrimaryInduction();
3142 Value *&EndValue = IVEndValues[OrigPhi];
3143 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3144 if (OrigPhi == OldInduction) {
3145 // We know what the end value is.
3146 EndValue = VectorTripCount;
3147 } else {
3148 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3149
3150 // Fast-math-flags propagate from the original induction instruction.
3151 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3152 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3153
3154 Value *Step =
3155 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3156 EndValue =
3157 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3158 EndValue->setName("ind.end");
3159
3160 // Compute the end value for the additional bypass (if applicable).
3161 if (AdditionalBypass.first) {
3162 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3163 Value *Step =
3164 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3165 EndValueFromAdditionalBypass = emitTransformedIndex(
3166 B, AdditionalBypass.second, II.getStartValue(), Step, II);
3167 EndValueFromAdditionalBypass->setName("ind.end");
3168 }
3169 }
3170
3171 // Create phi nodes to merge from the backedge-taken check block.
3172 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3173 LoopScalarPreHeader->getTerminator());
3174 // Copy original phi DL over to the new one.
3175 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3176
3177 // The new PHI merges the original incoming value, in case of a bypass,
3178 // or the value at the end of the vectorized loop.
3179 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3180
3181 // Fix the scalar body counter (PHI node).
3182 // The old induction's phi node in the scalar body needs the truncated
3183 // value.
3184 for (BasicBlock *BB : BypassBlocks)
3185 BCResumeVal->addIncoming(II.getStartValue(), BB);
3186
3187 if (AdditionalBypass.first)
3188 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3189 EndValueFromAdditionalBypass);
3190 return BCResumeVal;
3191}
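// Shorthand sketch of the phi created above, assuming a canonical induction
// starting at 0, the usual block names and the vector trip count as EndValue
// (illustrative only, names are not taken from a specific test):
//
//   scalar.ph:
//     %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                              [ 0, %vector.scevcheck ],
//                              [ 0, %vector.memcheck ]
//
// Only the middle block feeds the value reached at the end of the vector
// loop; every bypass block feeds the original start value.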
3192
3193void InnerLoopVectorizer::createInductionResumeValues(
3194 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3195 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3196 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3197 "Inconsistent information about additional bypass.");
3198 // We are going to resume the execution of the scalar loop.
3199 // Go over all of the induction variables that we found and fix the
3200 // PHIs that are left in the scalar version of the loop.
3201 // The starting values of PHI nodes depend on the counter of the last
3202 // iteration in the vectorized loop.
3203 // If we come from a bypass edge then we need to start from the original
3204 // start value.
3205 for (const auto &InductionEntry : Legal->getInductionVars()) {
3206 PHINode *OrigPhi = InductionEntry.first;
3207 const InductionDescriptor &II = InductionEntry.second;
3208 PHINode *BCResumeVal = createInductionResumeValue(
3209 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3210 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3211 }
3212}
3213
3214BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3215 // The trip counts should be cached by now.
3216 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3217 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3218
3219 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3220
3221 // Add a check in the middle block to see if we have completed
3222 // all of the iterations in the first vector loop. Three cases:
3223 // 1) If we require a scalar epilogue, there is no conditional branch as
3224 // we unconditionally branch to the scalar preheader. Do nothing.
3225 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3226 // Thus if tail is to be folded, we know we don't need to run the
3227 // remainder and we can use the previous value for the condition (true).
3228 // 3) Otherwise, construct a runtime check.
3229 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3230 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3231 Count, VectorTripCount, "cmp.n",
3232 LoopMiddleBlock->getTerminator());
3233
3234 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3235 // of the corresponding compare because they may have ended up with
3236 // different line numbers and we want to avoid awkward line stepping while
3238 // debugging. E.g., if the compare has a line number inside the loop.
3238 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3239 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3240 }
3241
3242#ifdef EXPENSIVE_CHECKS
3243 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3244#endif
3245
3246 return LoopVectorPreHeader;
3247}
3248
3249std::pair<BasicBlock *, Value *>
3250InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3251 /*
3252 In this function we generate a new loop. The new loop will contain
3253 the vectorized instructions while the old loop will continue to run the
3254 scalar remainder.
3255
3256 [ ] <-- loop iteration number check.
3257 / |
3258 / v
3259 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3260 | / |
3261 | / v
3262 || [ ] <-- vector pre header.
3263 |/ |
3264 | v
3265 | [ ] \
3266 | [ ]_| <-- vector loop (created during VPlan execution).
3267 | |
3268 | v
3269 \ -[ ] <--- middle-block.
3270 \/ |
3271 /\ v
3272 | ->[ ] <--- new preheader.
3273 | |
3274 (opt) v <-- edge from middle to exit iff epilogue is not required.
3275 | [ ] \
3276 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3277 \ |
3278 \ v
3279 >[ ] <-- exit block(s).
3280 ...
3281 */
3282
3283 // Create an empty vector loop, and prepare basic blocks for the runtime
3284 // checks.
3285 createVectorLoopSkeleton("");
3286
3287 // Now, compare the new count to zero. If it is zero skip the vector loop and
3288 // jump to the scalar loop. This check also covers the case where the
3289 // backedge-taken count is uint##_max: adding one to it will overflow leading
3290 // to an incorrect trip count of zero. In this (rare) case we will also jump
3291 // to the scalar loop.
3292 emitIterationCountCheck(LoopScalarPreHeader);
3293
3294 // Generate the code to check any assumptions that we've made for SCEV
3295 // expressions.
3296 emitSCEVChecks(LoopScalarPreHeader);
3297
3298 // Generate the code that checks at runtime if arrays overlap. We put the
3299 // checks into a separate block to make the more common case of few elements
3300 // faster.
3301 emitMemRuntimeChecks(LoopScalarPreHeader);
3302
3303 // Emit phis for the new starting index of the scalar loop.
3304 createInductionResumeValues();
3305
3306 return {completeLoopSkeleton(), nullptr};
3307}
3308
3309// Fix up external users of the induction variable. At this point, we are
3310// in LCSSA form, with all external PHIs that use the IV having one input value,
3311// coming from the remainder loop. We need those PHIs to also have a correct
3312// value for the IV when arriving directly from the middle block.
3313void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3314 const InductionDescriptor &II,
3315 Value *VectorTripCount, Value *EndValue,
3316 BasicBlock *MiddleBlock,
3317 BasicBlock *VectorHeader, VPlan &Plan) {
3318 // There are two kinds of external IV usages - those that use the value
3319 // computed in the last iteration (the PHI) and those that use the penultimate
3320 // value (the value that feeds into the phi from the loop latch).
3321 // We allow both, but they, obviously, have different values.
3322
3323 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3324
3325 DenseMap<Value *, Value *> MissingVals;
3326
3327 // An external user of the last iteration's value should see the value that
3328 // the remainder loop uses to initialize its own IV.
3329 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3330 for (User *U : PostInc->users()) {
3331 Instruction *UI = cast<Instruction>(U);
3332 if (!OrigLoop->contains(UI)) {
3333 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3334 MissingVals[UI] = EndValue;
3335 }
3336 }
3337
3338 // An external user of the penultimate value needs to see EndValue - Step.
3339 // The simplest way to get this is to recompute it from the constituent SCEVs,
3340 // that is Start + (Step * (CRD - 1)).
3341 for (User *U : OrigPhi->users()) {
3342 auto *UI = cast<Instruction>(U);
3343 if (!OrigLoop->contains(UI)) {
3344 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3345
3346 IRBuilder<> B(MiddleBlock->getTerminator());
3347
3348 // Fast-math-flags propagate from the original induction instruction.
3349 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3350 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3351
3352 Value *CountMinusOne = B.CreateSub(
3353 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3354 CountMinusOne->setName("cmo");
3355 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3356 VectorHeader->getTerminator());
3357 Value *Escape =
3358 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3359 Escape->setName("ind.escape");
3360 MissingVals[UI] = Escape;
3361 }
3362 }
3363
3364 for (auto &I : MissingVals) {
3365 PHINode *PHI = cast<PHINode>(I.first);
3366 // One corner case we have to handle is two IVs "chasing" each other,
3367 // that is %IV2 = phi [...], [ %IV1, %latch ]
3368 // In this case, if IV1 has an external use, we need to avoid adding both
3369 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3370 // don't already have an incoming value for the middle block.
3371 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3372 PHI->addIncoming(I.second, MiddleBlock);
3373 Plan.removeLiveOut(PHI);
3374 }
3375 }
3376}
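// Illustrative shorthand of the two external-use kinds handled above,
// assuming a canonical IV and a single exit block:
//
//   loop:
//     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
//     %iv.next = add i64 %iv, 1
//   exit:
//     %use.last = phi i64 [ %iv.next, %loop ]  ; receives EndValue
//     %use.prev = phi i64 [ %iv, %loop ]       ; receives "ind.escape"
//
// Both exit phis get an extra incoming value for the middle block so they are
// also correct when the scalar remainder loop is skipped.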
3377
3378namespace {
3379
3380struct CSEDenseMapInfo {
3381 static bool canHandle(const Instruction *I) {
3382 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3383 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3384 }
3385
3386 static inline Instruction *getEmptyKey() {
3387 return DenseMapInfo<Instruction *>::getEmptyKey();
3388 }
3389
3390 static inline Instruction *getTombstoneKey() {
3391 return DenseMapInfo<Instruction *>::getTombstoneKey();
3392 }
3393
3394 static unsigned getHashValue(const Instruction *I) {
3395 assert(canHandle(I) && "Unknown instruction!");
3396 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3397 I->value_op_end()));
3398 }
3399
3400 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3401 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3402 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3403 return LHS == RHS;
3404 return LHS->isIdenticalTo(RHS);
3405 }
3406};
3407
3408} // end anonymous namespace
3409
3410 /// Perform CSE of induction variable instructions.
3411static void cse(BasicBlock *BB) {
3412 // Perform simple cse.
3413 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3414 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3415 if (!CSEDenseMapInfo::canHandle(&In))
3416 continue;
3417
3418 // Check if we can replace this instruction with any of the
3419 // visited instructions.
3420 if (Instruction *V = CSEMap.lookup(&In)) {
3421 In.replaceAllUsesWith(V);
3422 In.eraseFromParent();
3423 continue;
3424 }
3425
3426 CSEMap[&In] = &In;
3427 }
3428}
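// For example (hypothetical IR): unrolling can leave two identical address
// computations in the vector header,
//
//   %g0 = getelementptr inbounds float, ptr %base, i64 %offset
//   %g1 = getelementptr inbounds float, ptr %base, i64 %offset
//
// Both hash to the same key, so cse() replaces all uses of %g1 with %g0 and
// erases %g1.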
3429
3430InstructionCost
3431LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3432 bool &NeedToScalarize) const {
3433 Function *F = CI->getCalledFunction();
3434 Type *ScalarRetTy = CI->getType();
3435 SmallVector<Type *, 4> Tys, ScalarTys;
3436 for (auto &ArgOp : CI->args())
3437 ScalarTys.push_back(ArgOp->getType());
3438
3439 // Estimate cost of scalarized vector call. The source operands are assumed
3440 // to be vectors, so we need to extract individual elements from there,
3441 // execute VF scalar calls, and then gather the result into the vector return
3442 // value.
3443 InstructionCost ScalarCallCost =
3444 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3445 if (VF.isScalar())
3446 return ScalarCallCost;
3447
3448 // Compute corresponding vector type for return value and arguments.
3449 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3450 for (Type *ScalarTy : ScalarTys)
3451 Tys.push_back(ToVectorTy(ScalarTy, VF));
3452
3453 // Compute costs of unpacking argument values for the scalar calls and
3454 // packing the return values to a vector.
3455 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3456
3457 InstructionCost Cost =
3458 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3459
3460 // If we can't emit a vector call for this function, then the currently found
3461 // cost is the cost we need to return.
3462 NeedToScalarize = true;
3463 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3464 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3465
3466 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3467 return Cost;
3468
3469 // If the corresponding vector cost is cheaper, return its cost.
3470 InstructionCost VectorCallCost =
3471 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3472 if (VectorCallCost < Cost) {
3473 NeedToScalarize = false;
3474 Cost = VectorCallCost;
3475 }
3476 return Cost;
3477}
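// Worked example with made-up numbers: for VF = 4, a scalar call cost of 10
// and a scalarization overhead of 6, the scalarized estimate is
// 10 * 4 + 6 = 46. If the VFDatabase offers a vector variant whose call cost
// is 20, that is cheaper, so NeedToScalarize is cleared and 20 is returned.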
3478
3479static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3480 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3481 return Elt;
3482 return VectorType::get(Elt, VF);
3483}
3484
3485InstructionCost
3486LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3487 ElementCount VF) const {
3488 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3489 assert(ID && "Expected intrinsic call!");
3490 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3491 FastMathFlags FMF;
3492 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3493 FMF = FPMO->getFastMathFlags();
3494
3495 SmallVector<const Value *> Arguments(CI->args());
3496 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3497 SmallVector<Type *> ParamTys;
3498 std::transform(FTy->param_begin(), FTy->param_end(),
3499 std::back_inserter(ParamTys),
3500 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3501
3502 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3503 dyn_cast<IntrinsicInst>(CI));
3504 return TTI.getIntrinsicInstrCost(CostAttrs,
3505 TargetTransformInfo::TCK_RecipThroughput);
3506}
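// E.g. (illustrative): for a call to llvm.fmuladd.f32 with VF = 4, RetTy and
// all ParamTys are widened to <4 x float> and the call's fast-math flags are
// forwarded, so TTI prices a single <4 x float> fmuladd rather than four
// scalar ones.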
3507
3508static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3509 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3510 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3511 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3512}
3513
3514static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3515 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3516 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3517 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3518}
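// E.g. smallestIntegerVectorType(<4 x i32>, <4 x i16>) returns <4 x i16>,
// while largestIntegerVectorType(<4 x i32>, <4 x i16>) returns <4 x i32>.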
3519
3520void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3521 // For every instruction `I` in MinBWs, truncate the operands, create a
3522 // truncated version of `I` and reextend its result. InstCombine runs
3523 // later and will remove any ext/trunc pairs.
3524 SmallPtrSet<Value *, 4> Erased;
3525 for (const auto &KV : Cost->getMinimalBitwidths()) {
3526 // If the value wasn't vectorized, we must maintain the original scalar
3527 // type. The absence of the value from State indicates that it
3528 // wasn't vectorized.
3529 // FIXME: Should not rely on getVPValue at this point.
3530 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3531 if (!State.hasAnyVectorValue(Def))
3532 continue;
3533 for (unsigned Part = 0; Part < UF; ++Part) {
3534 Value *I = State.get(Def, Part);
3535 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3536 continue;
3537 Type *OriginalTy = I->getType();
3538 Type *ScalarTruncatedTy =
3539 IntegerType::get(OriginalTy->getContext(), KV.second);
3540 auto *TruncatedTy = VectorType::get(
3541 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3542 if (TruncatedTy == OriginalTy)
3543 continue;
3544
3545 IRBuilder<> B(cast<Instruction>(I));
3546 auto ShrinkOperand = [&](Value *V) -> Value * {
3547 if (auto *ZI = dyn_cast<ZExtInst>(V))
3548 if (ZI->getSrcTy() == TruncatedTy)
3549 return ZI->getOperand(0);
3550 return B.CreateZExtOrTrunc(V, TruncatedTy);
3551 };
3552
3553 // The actual instruction modification depends on the instruction type,
3554 // unfortunately.
3555 Value *NewI = nullptr;
3556 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3557 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3558 ShrinkOperand(BO->getOperand(1)));
3559
3560 // Any wrapping introduced by shrinking this operation shouldn't be
3561 // considered undefined behavior. So, we can't unconditionally copy
3562 // arithmetic wrapping flags to NewI.
3563 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3564 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3565 NewI =
3566 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3567 ShrinkOperand(CI->getOperand(1)));
3568 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3569 NewI = B.CreateSelect(SI->getCondition(),
3570 ShrinkOperand(SI->getTrueValue()),
3571 ShrinkOperand(SI->getFalseValue()));
3572 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3573 switch (CI->getOpcode()) {
3574 default:
3575 llvm_unreachable("Unhandled cast!");
3576 case Instruction::Trunc:
3577 NewI = ShrinkOperand(CI->getOperand(0));
3578 break;
3579 case Instruction::SExt:
3580 NewI = B.CreateSExtOrTrunc(
3581 CI->getOperand(0),
3582 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3583 break;
3584 case Instruction::ZExt:
3585 NewI = B.CreateZExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 }
3590 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3591 auto Elements0 =
3592 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3593 auto *O0 = B.CreateZExtOrTrunc(
3594 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3595 auto Elements1 =
3596 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3597 auto *O1 = B.CreateZExtOrTrunc(
3598 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3599
3600 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3601 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3602 // Don't do anything with the operands, just extend the result.
3603 continue;
3604 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3605 auto Elements =
3606 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3607 auto *O0 = B.CreateZExtOrTrunc(
3608 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3609 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3610 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3611 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3612 auto Elements =
3613 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3614 auto *O0 = B.CreateZExtOrTrunc(
3615 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3616 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3617 } else {
3618 // If we don't know what to do, be conservative and don't do anything.
3619 continue;
3620 }
3621
3622 // Lastly, extend the result.
3623 NewI->takeName(cast<Instruction>(I));
3624 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3625 I->replaceAllUsesWith(Res);
3626 cast<Instruction>(I)->eraseFromParent();
3627 Erased.insert(I);
3628 State.reset(Def, Res, Part);
3629 }
3630 }
3631
3632 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3633 for (const auto &KV : Cost->getMinimalBitwidths()) {
3634 // If the value wasn't vectorized, we must maintain the original scalar
3635 // type. The absence of the value from State indicates that it
3636 // wasn't vectorized.
3637 // FIXME: Should not rely on getVPValue at this point.
3638 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3639 if (!State.hasAnyVectorValue(Def))
3640 continue;
3641 for (unsigned Part = 0; Part < UF; ++Part) {
3642 Value *I = State.get(Def, Part);
3643 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3644 if (Inst && Inst->use_empty()) {
3645 Value *NewI = Inst->getOperand(0);
3646 Inst->eraseFromParent();
3647 State.reset(Def, NewI, Part);
3648 }
3649 }
3650 }
3651}
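// Sketch of the rewrite performed above (illustrative): if MinBWs records
// that an i32 add only needs 8 bits, each unrolled part
//
//   %a = add <4 x i32> %x, %y
//
// becomes
//
//   %x.t = trunc <4 x i32> %x to <4 x i8>
//   %y.t = trunc <4 x i32> %y to <4 x i8>
//   %a.t = add <4 x i8> %x.t, %y.t
//   %a   = zext <4 x i8> %a.t to <4 x i32>
//
// and the later InstCombine run removes the ext/trunc pairs that turn out to
// be redundant.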
3652
3653void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3654 VPlan &Plan) {
3655 // Insert truncates and extends for any truncated instructions as hints to
3656 // InstCombine.
3657 if (VF.isVector())
3658 truncateToMinimalBitwidths(State);
3659
3660 // Fix widened non-induction PHIs by setting up the PHI operands.
3661 if (EnableVPlanNativePath)
3662 fixNonInductionPHIs(Plan, State);
3663
3664 // At this point every instruction in the original loop is widened to a
3665 // vector form. Now we need to fix the recurrences in the loop. These PHI
3666 // nodes are currently empty because we did not want to introduce cycles.
3667 // This is the second stage of vectorizing recurrences.
3668 fixCrossIterationPHIs(State);
3669
3670 // Forget the original basic block.
3671 PSE.getSE()->forgetLoop(OrigLoop);
3672
3673 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3674 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3675 if (Cost->requiresScalarEpilogue(VF)) {
3676 // No edge from the middle block to the unique exit block has been inserted
3677 // and there is nothing to fix from vector loop; phis should have incoming
3678 // from scalar loop only.
3679 Plan.clearLiveOuts();
3680 } else {
3681 // If we inserted an edge from the middle block to the unique exit block,
3682 // update uses outside the loop (phis) to account for the newly inserted
3683 // edge.
3684
3685 // Fix-up external users of the induction variables.
3686 for (const auto &Entry : Legal->getInductionVars())
3687 fixupIVUsers(Entry.first, Entry.second,
3688 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3689 IVEndValues[Entry.first], LoopMiddleBlock,
3690 VectorLoop->getHeader(), Plan);
3691 }
3692
3693 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3694 // in the exit block, so update the builder.
3695 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3696 for (const auto &KV : Plan.getLiveOuts())
3697 KV.second->fixPhi(Plan, State);
3698
3699 for (Instruction *PI : PredicatedInstructions)
3700 sinkScalarOperands(&*PI);
3701
3702 // Remove redundant induction instructions.
3703 cse(VectorLoop->getHeader());
3704
3705 // Set/update profile weights for the vector and remainder loops as original
3706 // loop iterations are now distributed among them. Note that original loop
3707 // represented by LoopScalarBody becomes remainder loop after vectorization.
3708 //
3709 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3710 // end up getting a slightly less accurate result, but that should be OK since
3711 // profile is not inherently precise anyway. Note also possible bypass of
3712 // vector code caused by legality checks is ignored, assigning all the weight
3713 // to the vector loop, optimistically.
3714 //
3715 // For scalable vectorization we can't know at compile time how many iterations
3716 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3717 // vscale of '1'.
3718 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3719 LI->getLoopFor(LoopScalarBody),
3720 VF.getKnownMinValue() * UF);
3721}
3722
3723void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3724 // In order to support recurrences we need to be able to vectorize Phi nodes.
3725 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3726 // stage #2: We now need to fix the recurrences by adding incoming edges to
3727 // the currently empty PHI nodes. At this point every instruction in the
3728 // original loop is widened to a vector form so we can use them to construct
3729 // the incoming edges.
3730 VPBasicBlock *Header =
3731 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3732 for (VPRecipeBase &R : Header->phis()) {
3733 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3734 fixReduction(ReductionPhi, State);
3735 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3736 fixFixedOrderRecurrence(FOR, State);
3737 }
3738}
3739
3740void InnerLoopVectorizer::fixFixedOrderRecurrence(
3741 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3742 // This is the second phase of vectorizing first-order recurrences. An
3743 // overview of the transformation is described below. Suppose we have the
3744 // following loop.
3745 //
3746 // for (int i = 0; i < n; ++i)
3747 // b[i] = a[i] - a[i - 1];
3748 //
3749 // There is a first-order recurrence on "a". For this loop, the shorthand
3750 // scalar IR looks like:
3751 //
3752 // scalar.ph:
3753 // s_init = a[-1]
3754 // br scalar.body
3755 //
3756 // scalar.body:
3757 // i = phi [0, scalar.ph], [i+1, scalar.body]
3758 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3759 // s2 = a[i]
3760 // b[i] = s2 - s1
3761 // br cond, scalar.body, ...
3762 //
3763 // In this example, s1 is a recurrence because its value depends on the
3764 // previous iteration. In the first phase of vectorization, we created a
3765 // vector phi v1 for s1. We now complete the vectorization and produce the
3766 // shorthand vector IR shown below (for VF = 4, UF = 1).
3767 //
3768 // vector.ph:
3769 // v_init = vector(..., ..., ..., a[-1])
3770 // br vector.body
3771 //
3772 // vector.body
3773 // i = phi [0, vector.ph], [i+4, vector.body]
3774 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3775 // v2 = a[i, i+1, i+2, i+3];
3776 // v3 = vector(v1(3), v2(0, 1, 2))
3777 // b[i, i+1, i+2, i+3] = v2 - v3
3778 // br cond, vector.body, middle.block
3779 //
3780 // middle.block:
3781 // x = v2(3)
3782 // br scalar.ph
3783 //
3784 // scalar.ph:
3785 // s_init = phi [x, middle.block], [a[-1], otherwise]
3786 // br scalar.body
3787 //
3788 // After execution completes the vector loop, we extract the next value of
3789 // the recurrence (x) to use as the initial value in the scalar loop.
3790
3791 // Extract the last vector element in the middle block. This will be the
3792 // initial value for the recurrence when jumping to the scalar loop.
3793 VPValue *PreviousDef = PhiR->getBackedgeValue();
3794 Value *Incoming = State.get(PreviousDef, UF - 1);
3795 auto *ExtractForScalar = Incoming;
3796 auto *IdxTy = Builder.getInt32Ty();
3797 if (VF.isVector()) {
3798 auto *One = ConstantInt::get(IdxTy, 1);
3799 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3800 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3801 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3802 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3803 "vector.recur.extract");
3804 }
3805 // Extract the second last element in the middle block if the
3806 // Phi is used outside the loop. We need to extract the phi itself
3807 // and not the last element (the phi update in the current iteration). This
3808 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3809 // when the scalar loop is not run at all.
3810 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3811 if (VF.isVector()) {
3812 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3813 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3814 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3815 Incoming, Idx, "vector.recur.extract.for.phi");
3816 } else if (UF > 1)
3817 // When loop is unrolled without vectorizing, initialize
3818 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3819 // of `Incoming`. This is analogous to the vectorized case above: extracting
3820 // the second last element when VF > 1.
3821 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3822
3823 // Fix the initial value of the original recurrence in the scalar loop.
3824 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3825 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3826 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3827 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3828 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3829 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3830 Start->addIncoming(Incoming, BB);
3831 }
3832
3833 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3834 Phi->setName("scalar.recur");
3835
3836 // Finally, fix users of the recurrence outside the loop. The users will need
3837 // either the last value of the scalar recurrence or the last value of the
3838 // vector recurrence we extracted in the middle block. Since the loop is in
3839 // LCSSA form, we just need to find all the phi nodes for the original scalar
3840 // recurrence in the exit block, and then add an edge for the middle block.
3841 // Note that LCSSA does not imply single entry when the original scalar loop
3842 // had multiple exiting edges (as we always run the last iteration in the
3843 // scalar epilogue); in that case, there is no edge from middle to exit and
3844 // thus no phis which need to be updated.
3845 if (!Cost->requiresScalarEpilogue(VF))
3846 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3847 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3848 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3849 State.Plan->removeLiveOut(&LCSSAPhi);
3850 }
3851}
3852
3853void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3854 VPTransformState &State) {
3855 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3856 // Get its reduction variable descriptor.
3857 assert(Legal->isReductionVariable(OrigPhi) &&
3858 "Unable to find the reduction variable");
3859 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3860
3861 RecurKind RK = RdxDesc.getRecurrenceKind();
3862 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3863 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3864 State.setDebugLocFromInst(ReductionStartValue);
3865
3866 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3867 // This is the vector-clone of the value that leaves the loop.
3868 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3869
3870 // Wrap flags are in general invalid after vectorization, clear them.
3871 clearReductionWrapFlags(PhiR, State);
3872
3873 // Before each round, move the insertion point right between
3874 // the PHIs and the values we are going to write.
3875 // This allows us to write both PHINodes and the extractelement
3876 // instructions.
3877 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3878
3879 State.setDebugLocFromInst(LoopExitInst);
3880
3881 Type *PhiTy = OrigPhi->getType();
3882
3883 VPBasicBlock *LatchVPBB =
3884 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3885 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3886 // If tail is folded by masking, the vector value to leave the loop should be
3887 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3888 // instead of the former. For an inloop reduction the reduction will already
3889 // be predicated, and does not need to be handled here.
3890 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3891 for (unsigned Part = 0; Part < UF; ++Part) {
3892 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3893 SelectInst *Sel = nullptr;
3894 for (User *U : VecLoopExitInst->users()) {
3895 if (isa<SelectInst>(U)) {
3896 assert(!Sel && "Reduction exit feeding two selects");
3897 Sel = cast<SelectInst>(U);
3898 } else
3899 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3900 }
3901 assert(Sel && "Reduction exit feeds no select");
3902 State.reset(LoopExitInstDef, Sel, Part);
3903
3904 if (isa<FPMathOperator>(Sel))
3905 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3906
3907 // If the target can create a predicated operator for the reduction at no
3908 // extra cost in the loop (for example a predicated vadd), it can be
3909 // cheaper for the select to remain in the loop than be sunk out of it,
3910 // and so use the select value for the phi instead of the old
3911 // LoopExitValue.
3912 if (PreferPredicatedReductionSelect ||
3913 TTI->preferPredicatedReductionSelect(
3914 RdxDesc.getOpcode(), PhiTy,
3915 TargetTransformInfo::ReductionFlags())) {
3916 auto *VecRdxPhi =
3917 cast<PHINode>(State.get(PhiR, Part));
3918 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3919 }
3920 }
3921 }
3922
3923 // If the vector reduction can be performed in a smaller type, we truncate
3924 // then extend the loop exit value to enable InstCombine to evaluate the
3925 // entire expression in the smaller type.
3926 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3927 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3928 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3929 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3930 VectorParts RdxParts(UF);
3931 for (unsigned Part = 0; Part < UF; ++Part) {
3932 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3933 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3934 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3935 : Builder.CreateZExt(Trunc, VecTy);
3936 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3937 if (U != Trunc) {
3938 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3939 RdxParts[Part] = Extnd;
3940 }
3941 }
3942 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3943 for (unsigned Part = 0; Part < UF; ++Part) {
3944 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3945 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3946 }
3947 }
3948
3949 // Reduce all of the unrolled parts into a single vector.
3950 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3951 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3952
3953 // The middle block terminator has already been assigned a DebugLoc here (the
3954 // OrigLoop's single latch terminator). We want the whole middle block to
3955 // appear to execute on this line because: (a) it is all compiler generated,
3956 // (b) these instructions are always executed after evaluating the latch
3957 // conditional branch, and (c) other passes may add new predecessors which
3958 // terminate on this line. This is the easiest way to ensure we don't
3959 // accidentally cause an extra step back into the loop while debugging.
3960 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3961 if (PhiR->isOrdered())
3962 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3963 else {
3964 // Floating-point operations should have some FMF to enable the reduction.
3965 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3966 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3967 for (unsigned Part = 1; Part < UF; ++Part) {
3968 Value *RdxPart = State.get(LoopExitInstDef, Part);
3969 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3970 ReducedPartRdx = Builder.CreateBinOp(
3971 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3972 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3973 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3974 ReducedPartRdx, RdxPart);
3975 else
3976 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3977 }
3978 }
3979
3980 // Create the reduction after the loop. Note that inloop reductions create the
3981 // target reduction in the loop using a Reduction recipe.
3982 if (VF.isVector() && !PhiR->isInLoop()) {
3983 ReducedPartRdx =
3984 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3985 // If the reduction can be performed in a smaller type, we need to extend
3986 // the reduction to the wider type before we branch to the original loop.
3987 if (PhiTy != RdxDesc.getRecurrenceType())
3988 ReducedPartRdx = RdxDesc.isSigned()
3989 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3990 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3991 }
3992
3993 PHINode *ResumePhi =
3994 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3995
3996 // Create a phi node that merges control-flow from the backedge-taken check
3997 // block and the middle block.
3998 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3999 LoopScalarPreHeader->getTerminator());
4000
4001 // If we are fixing reductions in the epilogue loop then we should already
4002 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4003 // we carry over the incoming values correctly.
4004 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4005 if (Incoming == LoopMiddleBlock)
4006 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4007 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4008 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4009 Incoming);
4010 else
4011 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4012 }
4013
4014 // Set the resume value for this reduction
4015 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4016
4017 // If there were stores of the reduction value to a uniform memory address
4018 // inside the loop, create the final store here.
4019 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4020 StoreInst *NewSI =
4021 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4022 propagateMetadata(NewSI, SI);
4023
4024 // If the reduction value is used in other places,
4025 // then let the code below create PHI's for that.
4026 }
4027
4028 // Now, we need to fix the users of the reduction variable
4029 // inside and outside of the scalar remainder loop.
4030
4031 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4032 // in the exit blocks. See comment on analogous loop in
4033 // fixFixedOrderRecurrence for a more complete explanation of the logic.
4034 if (!Cost->requiresScalarEpilogue(VF))
4035 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4036 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4037 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4038 State.Plan->removeLiveOut(&LCSSAPhi);
4039 }
4040
4041 // Fix the scalar loop reduction variable with the incoming reduction sum
4042 // from the vector body and from the backedge value.
4043 int IncomingEdgeBlockIdx =
4044 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4045 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4046 // Pick the other block.
4047 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4048 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4049 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4050}
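// Shorthand of the epilogue built above for an integer add reduction with
// VF = 4 and UF = 2 (illustrative only):
//
//   middle.block:
//     %bin.rdx = add <4 x i32> %part1, %part0
//     %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//   scalar.ph:
//     %bc.merge.rdx = phi i32 [ %rdx, %middle.block ], [ %start, ... ]
//
// The scalar remainder loop's reduction phi then resumes from %bc.merge.rdx.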
4051
4052void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4053 VPTransformState &State) {
4054 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4055 RecurKind RK = RdxDesc.getRecurrenceKind();
4056 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4057 return;
4058
4059 SmallVector<VPValue *, 8> Worklist;
4060 SmallPtrSet<VPValue *, 8> Visited;
4061 Worklist.push_back(PhiR);
4062 Visited.insert(PhiR);
4063
4064 while (!Worklist.empty()) {
4065 VPValue *Cur = Worklist.pop_back_val();
4066 for (unsigned Part = 0; Part < UF; ++Part) {
4067 Value *V = State.get(Cur, Part);
4068 if (!isa<OverflowingBinaryOperator>(V))
4069 break;
4070 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4071 }
4072
4073 for (VPUser *U : Cur->users()) {
4074 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4075 if (!UserRecipe)
4076 continue;
4077 for (VPValue *V : UserRecipe->definedValues())
4078 if (Visited.insert(V).second)
4079 Worklist.push_back(V);
4080 }
4081 }
4082}
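// E.g. (illustrative): a widened "add nuw nsw <4 x i32>" feeding an add
// reduction phi is rewritten to a plain "add <4 x i32>" here, because the
// per-lane partial sums may wrap even when the original scalar running sum
// never did.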
4083
4084void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4085 // The basic block and loop containing the predicated instruction.
4086 auto *PredBB = PredInst->getParent();
4087 auto *VectorLoop = LI->getLoopFor(PredBB);
4088
4089 // Initialize a worklist with the operands of the predicated instruction.
4090 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4091
4092 // Holds instructions that we need to analyze again. An instruction may be
4093 // reanalyzed if we don't yet know if we can sink it or not.
4094 SmallVector<Instruction *, 8> InstsToReanalyze;
4095
4096 // Returns true if a given use occurs in the predicated block. Phi nodes use
4097 // their operands in their corresponding predecessor blocks.
4098 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4099 auto *I = cast<Instruction>(U.getUser());
4100 BasicBlock *BB = I->getParent();
4101 if (auto *Phi = dyn_cast<PHINode>(I))
4102 BB = Phi->getIncomingBlock(
4103 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4104 return BB == PredBB;
4105 };
4106
4107 // Iteratively sink the scalarized operands of the predicated instruction
4108 // into the block we created for it. When an instruction is sunk, its
4109 // operands are then added to the worklist. The algorithm ends after one pass
4110 // through the worklist doesn't sink a single instruction.
4111 bool Changed;
4112 do {
4113 // Add the instructions that need to be reanalyzed to the worklist, and
4114 // reset the changed indicator.
4115 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4116 InstsToReanalyze.clear();
4117 Changed = false;
4118
4119 while (!Worklist.empty()) {
4120 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4121
4122 // We can't sink an instruction if it is a phi node, is not in the loop,
4123 // or may have side effects.
4124 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4125 I->mayHaveSideEffects())
4126 continue;
4127
4128 // If the instruction is already in PredBB, check if we can sink its
4129 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4130 // sinking the scalar instruction I, hence it appears in PredBB; but it
4131 // may have failed to sink I's operands (recursively), which we try
4132 // (again) here.
4133 if (I->getParent() == PredBB) {
4134 Worklist.insert(I->op_begin(), I->op_end());
4135 continue;
4136 }
4137
4138 // It's legal to sink the instruction if all its uses occur in the
4139 // predicated block. Otherwise, there's nothing to do yet, and we may
4140 // need to reanalyze the instruction.
4141 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4142 InstsToReanalyze.push_back(I);
4143 continue;
4144 }
4145
4146 // Move the instruction to the beginning of the predicated block, and add
4147 // it's operands to the worklist.
4148 I->moveBefore(&*PredBB->getFirstInsertionPt());
4149 Worklist.insert(I->op_begin(), I->op_end());
4150
4151 // The sinking may have enabled other instructions to be sunk, so we will
4152 // need to iterate.
4153 Changed = true;
4154 }
4155 } while (Changed);
4156}
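// Illustrative example: suppose the scalarized address computation
//
//   %addr = getelementptr inbounds i32, ptr %p, i64 %idx
//
// still sits in the vector body while its only user is a store inside
// pred.store.if. The loop above sees that every use is in the predicated
// block and moves the GEP there, so it only executes when the predicate
// holds.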
4157
4158void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4159 VPTransformState &State) {
4160 auto Iter = depth_first(
4161 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4162 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4163 for (VPRecipeBase &P : VPBB->phis()) {
4164 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4165 if (!VPPhi)
4166 continue;
4167 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4168 // Make sure the builder has a valid insert point.
4169 Builder.SetInsertPoint(NewPhi);
4170 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4171 VPValue *Inc = VPPhi->getIncomingValue(i);
4172 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4173 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4174 }
4175 }
4176 }
4177}
4178
4179bool InnerLoopVectorizer::useOrderedReductions(
4180 const RecurrenceDescriptor &RdxDesc) {
4181 return Cost->useOrderedReductions(RdxDesc);
4182}
4183
4184void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4185 // We should not collect Scalars more than once per VF. Right now, this
4186 // function is called from collectUniformsAndScalars(), which already does
4187 // this check. Collecting Scalars for VF=1 does not make any sense.
4188 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4189 "This function should not be visited twice for the same VF");
4190
4191 // This avoids any chances of creating a REPLICATE recipe during planning
4192 // since that would result in generation of scalarized code during execution,
4193 // which is not supported for scalable vectors.
4194 if (VF.isScalable()) {
4195 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4196 return;
4197 }
4198
4199 SmallSetVector<Instruction *, 8> Worklist;
4200
4201 // These sets are used to seed the analysis with pointers used by memory
4202 // accesses that will remain scalar.
4203 SmallSetVector<Instruction *, 8> ScalarPtrs;
4204 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4205 auto *Latch = TheLoop->getLoopLatch();
4206
4207 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4208 // The pointer operands of loads and stores will be scalar as long as the
4209 // memory access is not a gather or scatter operation. The value operand of a
4210 // store will remain scalar if the store is scalarized.
4211 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4212 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4213 assert(WideningDecision != CM_Unknown &&
4214 "Widening decision should be ready at this moment");
4215 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4216 if (Ptr == Store->getValueOperand())
4217 return WideningDecision == CM_Scalarize;
4218    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4219           "Ptr is neither a value or pointer operand");
4220 return WideningDecision != CM_GatherScatter;
4221 };
4222
4223 // A helper that returns true if the given value is a bitcast or
4224 // getelementptr instruction contained in the loop.
4225 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4226 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4227 isa<GetElementPtrInst>(V)) &&
4228 !TheLoop->isLoopInvariant(V);
4229 };
4230
4231 // A helper that evaluates a memory access's use of a pointer. If the use will
4232 // be a scalar use and the pointer is only used by memory accesses, we place
4233 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4234 // PossibleNonScalarPtrs.
4235 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4236 // We only care about bitcast and getelementptr instructions contained in
4237 // the loop.
4238 if (!isLoopVaryingBitCastOrGEP(Ptr))
4239 return;
4240
4241 // If the pointer has already been identified as scalar (e.g., if it was
4242 // also identified as uniform), there's nothing to do.
4243 auto *I = cast<Instruction>(Ptr);
4244 if (Worklist.count(I))
4245 return;
4246
4247 // If the use of the pointer will be a scalar use, and all users of the
4248 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4249 // place the pointer in PossibleNonScalarPtrs.
4250 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4251 return isa<LoadInst>(U) || isa<StoreInst>(U);
4252 }))
4253 ScalarPtrs.insert(I);
4254 else
4255 PossibleNonScalarPtrs.insert(I);
4256 };
4257
4258  // We seed the scalars analysis with two classes of instructions: (1)
4259 // instructions marked uniform-after-vectorization and (2) bitcast,
4260 // getelementptr and (pointer) phi instructions used by memory accesses
4261 // requiring a scalar use.
4262 //
4263 // (1) Add to the worklist all instructions that have been identified as
4264 // uniform-after-vectorization.
4265 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4266
4267 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4268 // memory accesses requiring a scalar use. The pointer operands of loads and
4269  // stores will be scalar as long as the memory access is not a gather or
4270 // scatter operation. The value operand of a store will remain scalar if the
4271 // store is scalarized.
4272 for (auto *BB : TheLoop->blocks())
4273 for (auto &I : *BB) {
4274 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4275 evaluatePtrUse(Load, Load->getPointerOperand());
4276 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4277 evaluatePtrUse(Store, Store->getPointerOperand());
4278 evaluatePtrUse(Store, Store->getValueOperand());
4279 }
4280 }
4281 for (auto *I : ScalarPtrs)
4282 if (!PossibleNonScalarPtrs.count(I)) {
4283      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4284 Worklist.insert(I);
4285 }
4286
4287 // Insert the forced scalars.
4288 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4289 // induction variable when the PHI user is scalarized.
4290 auto ForcedScalar = ForcedScalars.find(VF);
4291 if (ForcedScalar != ForcedScalars.end())
4292 for (auto *I : ForcedScalar->second) {
4293      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4294 Worklist.insert(I);
4295 }
4296
4297 // Expand the worklist by looking through any bitcasts and getelementptr
4298 // instructions we've already identified as scalar. This is similar to the
4299 // expansion step in collectLoopUniforms(); however, here we're only
4300 // expanding to include additional bitcasts and getelementptr instructions.
4301 unsigned Idx = 0;
4302 while (Idx != Worklist.size()) {
4303 Instruction *Dst = Worklist[Idx++];
4304 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4305 continue;
4306 auto *Src = cast<Instruction>(Dst->getOperand(0));
4307 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4308 auto *J = cast<Instruction>(U);
4309 return !TheLoop->contains(J) || Worklist.count(J) ||
4310 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4311 isScalarUse(J, Src));
4312 })) {
4313 Worklist.insert(Src);
4314      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4315 }
4316 }
4317
4318 // An induction variable will remain scalar if all users of the induction
4319 // variable and induction variable update remain scalar.
4320 for (const auto &Induction : Legal->getInductionVars()) {
4321 auto *Ind = Induction.first;
4322 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4323
4324 // If tail-folding is applied, the primary induction variable will be used
4325 // to feed a vector compare.
4326 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4327 continue;
4328
4329 // Returns true if \p Indvar is a pointer induction that is used directly by
4330 // load/store instruction \p I.
4331 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4332 Instruction *I) {
4333 return Induction.second.getKind() ==
4334 InductionDescriptor::IK_PtrInduction &&
4335 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4336 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4337 };
4338
4339 // Determine if all users of the induction variable are scalar after
4340 // vectorization.
4341 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4342 auto *I = cast<Instruction>(U);
4343 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4344 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4345 });
4346 if (!ScalarInd)
4347 continue;
4348
4349 // Determine if all users of the induction variable update instruction are
4350 // scalar after vectorization.
4351 auto ScalarIndUpdate =
4352 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4353 auto *I = cast<Instruction>(U);
4354 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4355 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4356 });
4357 if (!ScalarIndUpdate)
4358 continue;
4359
4360 // The induction variable and its update instruction will remain scalar.
4361 Worklist.insert(Ind);
4362 Worklist.insert(IndUpdate);
4363    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4364    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4365                      << "\n");
4366 }
4367
4368 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4369}
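
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// collectLoopScalars() above is a fixed-point worklist computation: seed with
// uniforms and scalar-used pointers, then repeatedly pull in a GEP/bitcast
// operand once every one of its users is already known to be scalar. The
// self-contained model below reproduces only that expansion shape, using
// plain ints for "instructions"; the maps and values are made up for the
// example and do not come from LLVM.
#include <algorithm>
#include <map>
#include <set>
#include <vector>

static std::set<int> expandScalarWorklist(
    const std::vector<int> &Seeds,
    const std::map<int, int> &ScalarOperandOf,        // Dst -> its GEP/bitcast operand
    const std::map<int, std::vector<int>> &UsersOf) { // Src -> all of its users
  std::vector<int> Worklist(Seeds.begin(), Seeds.end());
  std::set<int> Scalar(Seeds.begin(), Seeds.end());
  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
    auto OpIt = ScalarOperandOf.find(Worklist[Idx]);
    if (OpIt == ScalarOperandOf.end())
      continue; // nothing to look through for this entry
    int Src = OpIt->second;
    auto UseIt = UsersOf.find(Src);
    bool AllUsersScalar =
        UseIt != UsersOf.end() &&
        std::all_of(UseIt->second.begin(), UseIt->second.end(),
                    [&](int U) { return Scalar.count(U) != 0; });
    if (AllUsersScalar && Scalar.insert(Src).second)
      Worklist.push_back(Src); // keep expanding from the newly added operand
  }
  return Scalar;
}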
4370
4371bool LoopVectorizationCostModel::isScalarWithPredication(
4372 Instruction *I, ElementCount VF) const {
4373 if (!isPredicatedInst(I))
4374 return false;
4375
4376 // Do we have a non-scalar lowering for this predicated
4377 // instruction? No - it is scalar with predication.
4378 switch(I->getOpcode()) {
4379 default:
4380 return true;
4381 case Instruction::Load:
4382 case Instruction::Store: {
4383 auto *Ptr = getLoadStorePointerOperand(I);
4384 auto *Ty = getLoadStoreType(I);
4385 Type *VTy = Ty;
4386 if (VF.isVector())
4387 VTy = VectorType::get(Ty, VF);
4388 const Align Alignment = getLoadStoreAlignment(I);
4389 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4390 TTI.isLegalMaskedGather(VTy, Alignment))
4391 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4392 TTI.isLegalMaskedScatter(VTy, Alignment));
4393 }
4394 case Instruction::UDiv:
4395 case Instruction::SDiv:
4396 case Instruction::SRem:
4397 case Instruction::URem: {
4398 // We have the option to use the safe-divisor idiom to avoid predication.
4399 // The cost based decision here will always select safe-divisor for
4400 // scalable vectors as scalarization isn't legal.
4401 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4402 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4403 }
4404 }
4405}
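
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// For predicated loads and stores the decision above boils down to a simple
// legality check: the access stays "scalar with predication" only when the
// target has no masked form of it at all. The booleans below are hypothetical
// target-capability flags standing in for the isLegalMasked* / TTI queries.
static bool predicatedLoadNeedsScalarization(bool LegalMaskedLoad,
                                             bool LegalMaskedGather) {
  return !(LegalMaskedLoad || LegalMaskedGather);
}
static bool predicatedStoreNeedsScalarization(bool LegalMaskedStore,
                                              bool LegalMaskedScatter) {
  return !(LegalMaskedStore || LegalMaskedScatter);
}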
4406
4407bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4408 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4409 return false;
4410
4411 // Can we prove this instruction is safe to unconditionally execute?
4412 // If not, we must use some form of predication.
4413 switch(I->getOpcode()) {
4414 default:
4415 return false;
4416 case Instruction::Load:
4417 case Instruction::Store: {
4418 if (!Legal->isMaskRequired(I))
4419 return false;
4420 // When we know the load's address is loop invariant and the instruction
4421 // in the original scalar loop was unconditionally executed then we
4422 // don't need to mark it as a predicated instruction. Tail folding may
4423 // introduce additional predication, but we're guaranteed to always have
4424 // at least one active lane. We call Legal->blockNeedsPredication here
4425 // because it doesn't query tail-folding. For stores, we need to prove
4426    // both speculation safety (which follows from the same argument as loads)
4427    // and that the value being stored is correct. The easiest
4428    // form of the latter is to require that all values stored are the same.
4429 if (Legal->isUniformMemOp(*I) &&
4430 (isa<LoadInst>(I) ||
4431 (isa<StoreInst>(I) &&
4432 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4433 !Legal->blockNeedsPredication(I->getParent()))
4434 return false;
4435 return true;
4436 }
4437 case Instruction::UDiv:
4438 case Instruction::SDiv:
4439 case Instruction::SRem:
4440 case Instruction::URem:
4441 // TODO: We can use the loop-preheader as context point here and get
4442 // context sensitive reasoning
4443 return !isSafeToSpeculativelyExecute(I);
4444 }
4445}
4446
4447std::pair<InstructionCost, InstructionCost>
4448LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4449 ElementCount VF) const {
4450  assert(I->getOpcode() == Instruction::UDiv ||
4451         I->getOpcode() == Instruction::SDiv ||
4452         I->getOpcode() == Instruction::SRem ||
4453         I->getOpcode() == Instruction::URem);
4454  assert(!isSafeToSpeculativelyExecute(I));
4455
4456 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4457
4458 // Scalarization isn't legal for scalable vector types
4459 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4460 if (!VF.isScalable()) {
4461 // Get the scalarization cost and scale this amount by the probability of
4462 // executing the predicated block. If the instruction is not predicated,
4463 // we fall through to the next case.
4464 ScalarizationCost = 0;
4465
4466 // These instructions have a non-void type, so account for the phi nodes
4467 // that we will create. This cost is likely to be zero. The phi node
4468 // cost, if any, should be scaled by the block probability because it
4469 // models a copy at the end of each predicated block.
4470 ScalarizationCost += VF.getKnownMinValue() *
4471 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4472
4473 // The cost of the non-predicated instruction.
4474 ScalarizationCost += VF.getKnownMinValue() *
4475 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4476
4477 // The cost of insertelement and extractelement instructions needed for
4478 // scalarization.
4479 ScalarizationCost += getScalarizationOverhead(I, VF);
4480
4481 // Scale the cost by the probability of executing the predicated blocks.
4482 // This assumes the predicated block for each vector lane is equally
4483 // likely.
4484 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4485 }
4486 InstructionCost SafeDivisorCost = 0;
4487
4488 auto *VecTy = ToVectorTy(I->getType(), VF);
4489
4490 // The cost of the select guard to ensure all lanes are well defined
4491 // after we speculate above any internal control flow.
4492 SafeDivisorCost += TTI.getCmpSelInstrCost(
4493 Instruction::Select, VecTy,
4494 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4495 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4496
4497 // Certain instructions can be cheaper to vectorize if they have a constant
4498 // second vector operand. One example of this are shifts on x86.
4499 Value *Op2 = I->getOperand(1);
4500 auto Op2Info = TTI.getOperandInfo(Op2);
4501 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4502 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4503
4504 SmallVector<const Value *, 4> Operands(I->operand_values());
4505 SafeDivisorCost += TTI.getArithmeticInstrCost(
4506 I->getOpcode(), VecTy, CostKind,
4507 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4508 Op2Info, Operands, I);
4509 return {ScalarizationCost, SafeDivisorCost};
4510}
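
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// A worked instance of the two costs returned above, with made-up TTI numbers:
// VF = 4, per-lane PHI cost 0, scalar udiv cost 20, insert/extract overhead 8,
// getReciprocalPredBlockProb() == 2 (the predicated block runs about half the
// time), vector select cost 1, vector udiv cost 40.
static void divRemSpeculationCostExample() {
  unsigned VF = 4;
  // Scalarization: VF * (PHI + scalar op) + packing overhead, scaled by the
  // predicated-block probability.
  unsigned ScalarizationCost = (VF * 0 + VF * 20 + 8) / 2; // (0 + 80 + 8) / 2 == 44
  // Safe divisor: one vector select on the mask plus the vector divide.
  unsigned SafeDivisorCost = 1 + 40;                        // == 41, so it wins here
  (void)ScalarizationCost;
  (void)SafeDivisorCost;
  // For scalable VFs the scalarization cost is Invalid, so the safe-divisor
  // form is always the one chosen.
}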
4511
4512bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4513 Instruction *I, ElementCount VF) {
4514  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4515  assert(getWideningDecision(I, VF) == CM_Unknown &&
4516         "Decision should not be set yet.");
4517  auto *Group = getInterleavedAccessGroup(I);
4518  assert(Group && "Must have a group.");
4519
4520  // If the instruction's allocated size doesn't equal its type size, it
4521 // requires padding and will be scalarized.
4522 auto &DL = I->getModule()->getDataLayout();
4523 auto *ScalarTy = getLoadStoreType(I);
4524 if (hasIrregularType(ScalarTy, DL))
4525 return false;
4526
4527 // If the group involves a non-integral pointer, we may not be able to
4528 // losslessly cast all values to a common type.
4529 unsigned InterleaveFactor = Group->getFactor();
4530 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4531 for (unsigned i = 0; i < InterleaveFactor; i++) {
4532 Instruction *Member = Group->getMember(i);
4533 if (!Member)
4534 continue;
4535 auto *MemberTy = getLoadStoreType(Member);
4536 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4537 // Don't coerce non-integral pointers to integers or vice versa.
4538 if (MemberNI != ScalarNI) {
4539 // TODO: Consider adding special nullptr value case here
4540 return false;
4541 } else if (MemberNI && ScalarNI &&
4542 ScalarTy->getPointerAddressSpace() !=
4543 MemberTy->getPointerAddressSpace()) {
4544 return false;
4545 }
4546 }
4547
4548 // Check if masking is required.
4549 // A Group may need masking for one of two reasons: it resides in a block that
4550 // needs predication, or it was decided to use masking to deal with gaps
4551 // (either a gap at the end of a load-access that may result in a speculative
4552 // load, or any gaps in a store-access).
4553 bool PredicatedAccessRequiresMasking =
4554 blockNeedsPredicationForAnyReason(I->getParent()) &&
4555 Legal->isMaskRequired(I);
4556 bool LoadAccessWithGapsRequiresEpilogMasking =
4557 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4558 !isScalarEpilogueAllowed();
4559 bool StoreAccessWithGapsRequiresMasking =
4560 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4561 if (!PredicatedAccessRequiresMasking &&
4562 !LoadAccessWithGapsRequiresEpilogMasking &&
4563 !StoreAccessWithGapsRequiresMasking)
4564 return true;
4565
4566 // If masked interleaving is required, we expect that the user/target had
4567 // enabled it, because otherwise it either wouldn't have been created or
4568 // it should have been invalidated by the CostModel.
4569  assert(useMaskedInterleavedAccesses(TTI) &&
4570         "Masked interleave-groups for predicated accesses are not enabled.");
4571
4572 if (Group->isReverse())
4573 return false;
4574
4575 auto *Ty = getLoadStoreType(I);
4576 const Align Alignment = getLoadStoreAlignment(I);
4577 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4578 : TTI.isLegalMaskedStore(Ty, Alignment);
4579}
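
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// An example of the StoreAccessWithGapsRequiresMasking case above: only the
// even elements of a factor-2 interleave group are written, so
// getNumMembers() == 1 < getFactor() == 2 and the group can be widened only
// with a masked interleaved store (and only when the group is not reversed).
static void strideTwoStore(int *A, int N, int X) {
  for (int I = 0; I < N; ++I)
    A[2 * I] = X; // the odd elements form the gap that must be masked off
}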
4580
4581bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4582 Instruction *I, ElementCount VF) {
4583 // Get and ensure we have a valid memory instruction.
4584  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4585
4586 auto *Ptr = getLoadStorePointerOperand(I);
4587 auto *ScalarTy = getLoadStoreType(I);
4588
4589 // In order to be widened, the pointer should be consecutive, first of all.
4590 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4591 return false;
4592
4593 // If the instruction is a store located in a predicated block, it will be
4594 // scalarized.
4595 if (isScalarWithPredication(I, VF))
4596 return false;
4597
4598  // If the instruction's allocated size doesn't equal its type size, it
4599 // requires padding and will be scalarized.
4600 auto &DL = I->getModule()->getDataLayout();
4601 if (hasIrregularType(ScalarTy, DL))
4602 return false;
4603
4604 return true;
4605}
4606
4607void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4608 // We should not collect Uniforms more than once per VF. Right now,
4609 // this function is called from collectUniformsAndScalars(), which
4610 // already does this check. Collecting Uniforms for VF=1 does not make any
4611 // sense.
4612
4613  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4614         "This function should not be visited twice for the same VF");
4615
4616  // Visit the list of Uniforms. If we do not find any uniform value, we will
4617  // not analyze it again: Uniforms.count(VF) will still return 1.
4618 Uniforms[VF].clear();
4619
4620 // We now know that the loop is vectorizable!
4621 // Collect instructions inside the loop that will remain uniform after
4622 // vectorization.
4623
4624 // Global values, params and instructions outside of current loop are out of
4625 // scope.
4626 auto isOutOfScope = [&](Value *V) -> bool {
4627 Instruction *I = dyn_cast<Instruction>(V);
4628 return (!I || !TheLoop->contains(I));
4629 };
4630
4631 // Worklist containing uniform instructions demanding lane 0.
4632 SetVector<Instruction *> Worklist;
4633 BasicBlock *Latch = TheLoop->getLoopLatch();
4634
4635 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4636 // that are scalar with predication must not be considered uniform after
4637 // vectorization, because that would create an erroneous replicating region
4638 // where only a single instance out of VF should be formed.
4639 // TODO: optimize such seldom cases if found important, see PR40816.
4640 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4641 if (isOutOfScope(I)) {
4642      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4643                        << *I << "\n");
4645 }
4646 if (isScalarWithPredication(I, VF)) {
4647      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4648                        << *I << "\n");
4650 }
4651    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4652 Worklist.insert(I);
4653 };
4654
4655 // Start with the conditional branch. If the branch condition is an
4656 // instruction contained in the loop that is only used by the branch, it is
4657 // uniform.
4658 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4659 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4660 addToWorklistIfAllowed(Cmp);
4661
4662 // Return true if all lanes perform the same memory operation, and we can
4663 // thus chose to execute only one.
4664 auto isUniformMemOpUse = [&](Instruction *I) {
4665 if (!Legal->isUniformMemOp(*I))
4666 return false;
4667 if (isa<LoadInst>(I))
4668 // Loading the same address always produces the same result - at least
4669 // assuming aliasing and ordering which have already been checked.
4670 return true;
4671 // Storing the same value on every iteration.
4672 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4673 };
4674
4675 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4676 InstWidening WideningDecision = getWideningDecision(I, VF);
4677    assert(WideningDecision != CM_Unknown &&
4678           "Widening decision should be ready at this moment");
4679
4680 if (isUniformMemOpUse(I))
4681 return true;
4682
4683 return (WideningDecision == CM_Widen ||
4684 WideningDecision == CM_Widen_Reverse ||
4685 WideningDecision == CM_Interleave);
4686 };
4687
4688
4689 // Returns true if Ptr is the pointer operand of a memory access instruction
4690 // I, and I is known to not require scalarization.
4691 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4692 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4693 };
4694
4695 // Holds a list of values which are known to have at least one uniform use.
4696 // Note that there may be other uses which aren't uniform. A "uniform use"
4697 // here is something which only demands lane 0 of the unrolled iterations;
4698 // it does not imply that all lanes produce the same value (e.g. this is not
4699 // the usual meaning of uniform)
4700 SetVector<Value *> HasUniformUse;
4701
4702 // Scan the loop for instructions which are either a) known to have only
4703 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4704 for (auto *BB : TheLoop->blocks())
4705 for (auto &I : *BB) {
4706 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4707 switch (II->getIntrinsicID()) {
4708 case Intrinsic::sideeffect:
4709 case Intrinsic::experimental_noalias_scope_decl:
4710 case Intrinsic::assume:
4711 case Intrinsic::lifetime_start:
4712 case Intrinsic::lifetime_end:
4713 if (TheLoop->hasLoopInvariantOperands(&I))
4714 addToWorklistIfAllowed(&I);
4715 break;
4716 default:
4717 break;
4718 }
4719 }
4720
4721 // ExtractValue instructions must be uniform, because the operands are
4722 // known to be loop-invariant.
4723 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4724        assert(isOutOfScope(EVI->getAggregateOperand()) &&
4725               "Expected aggregate value to be loop invariant");
4726 addToWorklistIfAllowed(EVI);
4727 continue;
4728 }
4729
4730 // If there's no pointer operand, there's nothing to do.
4731 auto *Ptr = getLoadStorePointerOperand(&I);
4732 if (!Ptr)
4733 continue;
4734
4735 if (isUniformMemOpUse(&I))
4736 addToWorklistIfAllowed(&I);
4737
4738 if (isUniformDecision(&I, VF)) {
4739        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4740 HasUniformUse.insert(Ptr);
4741 }
4742 }
4743
4744 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4745 // demanding) users. Since loops are assumed to be in LCSSA form, this
4746 // disallows uses outside the loop as well.
4747 for (auto *V : HasUniformUse) {
4748 if (isOutOfScope(V))
4749 continue;
4750 auto *I = cast<Instruction>(V);
4751 auto UsersAreMemAccesses =
4752 llvm::all_of(I->users(), [&](User *U) -> bool {
4753 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4754 });
4755 if (UsersAreMemAccesses)
4756 addToWorklistIfAllowed(I);
4757 }
4758
4759 // Expand Worklist in topological order: whenever a new instruction
4760  // is added, its users should already be inside Worklist. This ensures
4761 // a uniform instruction will only be used by uniform instructions.
4762 unsigned idx = 0;
4763 while (idx != Worklist.size()) {
4764 Instruction *I = Worklist[idx++];
4765
4766 for (auto *OV : I->operand_values()) {
4767 // isOutOfScope operands cannot be uniform instructions.
4768 if (isOutOfScope(OV))
4769 continue;
4770 // First order recurrence Phi's should typically be considered
4771 // non-uniform.
4772 auto *OP = dyn_cast<PHINode>(OV);
4773 if (OP && Legal->isFixedOrderRecurrence(OP))
4774 continue;
4775 // If all the users of the operand are uniform, then add the
4776 // operand into the uniform worklist.
4777 auto *OI = cast<Instruction>(OV);
4778 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4779 auto *J = cast<Instruction>(U);
4780 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4781 }))
4782 addToWorklistIfAllowed(OI);
4783 }
4784 }
4785
4786 // For an instruction to be added into Worklist above, all its users inside
4787 // the loop should also be in Worklist. However, this condition cannot be
4788 // true for phi nodes that form a cyclic dependence. We must process phi
4789 // nodes separately. An induction variable will remain uniform if all users
4790 // of the induction variable and induction variable update remain uniform.
4791 // The code below handles both pointer and non-pointer induction variables.
4792 for (const auto &Induction : Legal->getInductionVars()) {
4793 auto *Ind = Induction.first;
4794 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4795
4796 // Determine if all users of the induction variable are uniform after
4797 // vectorization.
4798 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4799 auto *I = cast<Instruction>(U);
4800 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4801 isVectorizedMemAccessUse(I, Ind);
4802 });
4803 if (!UniformInd)
4804 continue;
4805
4806 // Determine if all users of the induction variable update instruction are
4807 // uniform after vectorization.
4808 auto UniformIndUpdate =
4809 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4810 auto *I = cast<Instruction>(U);
4811 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4812 isVectorizedMemAccessUse(I, IndUpdate);
4813 });
4814 if (!UniformIndUpdate)
4815 continue;
4816
4817 // The induction variable and its update instruction will remain uniform.
4818 addToWorklistIfAllowed(Ind);
4819 addToWorklistIfAllowed(IndUpdate);
4820 }
4821
4822 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4823}
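
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// Why an address computation ends up in Uniforms[VF] in the lane-0 sense used
// above: in the loop below the address &A[I] feeds a consecutive (CM_Widen)
// load, so after vectorization only its lane-0 value is materialised and the
// wide load is built from that single pointer. The loop is a generic example,
// not taken from any test in the repository.
static int sumArray(const int *A, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[I]; // &A[I] has a single "uniform" use: the widened load
  return Sum;
}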
4824
4825bool LoopVectorizationCostModel::runtimeChecksRequired() {
4826  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4827
4828 if (Legal->getRuntimePointerChecking()->Need) {
4829 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4830 "runtime pointer checks needed. Enable vectorization of this "
4831 "loop with '#pragma clang loop vectorize(enable)' when "
4832 "compiling with -Os/-Oz",
4833 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4834 return true;
4835 }
4836
4837 if (!PSE.getPredicate().isAlwaysTrue()) {
4838 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4839 "runtime SCEV checks needed. Enable vectorization of this "
4840 "loop with '#pragma clang loop vectorize(enable)' when "
4841 "compiling with -Os/-Oz",
4842 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4843 return true;
4844 }
4845
4846 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4847 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4848 reportVectorizationFailure("Runtime stride check for small trip count",
4849 "runtime stride == 1 checks needed. Enable vectorization of "
4850 "this loop without such check by compiling with -Os/-Oz",
4851 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4852 return true;
4853 }
4854
4855 return false;
4856}
4857
4858ElementCount
4859LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4860 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4861 return ElementCount::getScalable(0);
4862
4863 if (Hints->isScalableVectorizationDisabled()) {
4864 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4865 "ScalableVectorizationDisabled", ORE, TheLoop);
4866 return ElementCount::getScalable(0);
4867 }
4868
4869  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4870
4871 auto MaxScalableVF = ElementCount::getScalable(
4872 std::numeric_limits<ElementCount::ScalarTy>::max());
4873
4874 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4875 // FIXME: While for scalable vectors this is currently sufficient, this should
4876 // be replaced by a more detailed mechanism that filters out specific VFs,
4877 // instead of invalidating vectorization for a whole set of VFs based on the
4878 // MaxVF.
4879
4880 // Disable scalable vectorization if the loop contains unsupported reductions.
4881 if (!canVectorizeReductions(MaxScalableVF)) {
4882 reportVectorizationInfo(
4883 "Scalable vectorization not supported for the reduction "
4884 "operations found in this loop.",
4885 "ScalableVFUnfeasible", ORE, TheLoop);
4886 return ElementCount::getScalable(0);
4887 }
4888
4889 // Disable scalable vectorization if the loop contains any instructions
4890 // with element types not supported for scalable vectors.
4891 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4892 return !Ty->isVoidTy() &&
4893 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4894 })) {
4895 reportVectorizationInfo("Scalable vectorization is not supported "
4896 "for all element types found in this loop.",
4897 "ScalableVFUnfeasible", ORE, TheLoop);
4898 return ElementCount::getScalable(0);
4899 }
4900
4901 if (Legal->isSafeForAnyVectorWidth())
4902 return MaxScalableVF;
4903
4904 // Limit MaxScalableVF by the maximum safe dependence distance.
4905 std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4906 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4907 MaxVScale =
4908 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4909 MaxScalableVF =
4910 ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4911 if (!MaxScalableVF)
4912 reportVectorizationInfo(
4913 "Max legal vector width too small, scalable vectorization "
4914 "unfeasible.",
4915 "ScalableVFUnfeasible", ORE, TheLoop);
4916
4917 return MaxScalableVF;
4918}
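
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// The clamp at the end of getMaxLegalScalableVF with made-up numbers: a
// maximum safe dependence distance of 32 elements and a known bound of
// vscale <= 16 give a largest legal scalable VF of vscale x (32 / 16), i.e.
// vscale x 2. With no vscale bound at all the result is vscale x 0, which
// disables scalable vectorization for the loop.
static unsigned maxScalableMinElements(unsigned MaxSafeElements,
                                       unsigned MaxVScaleOrZero) {
  return MaxVScaleOrZero ? MaxSafeElements / MaxVScaleOrZero : 0; // 32 / 16 == 2
}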
4919
4920FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4921 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4922 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4923 unsigned SmallestType, WidestType;
4924 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4925
4926 // Get the maximum safe dependence distance in bits computed by LAA.
4927 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4928 // the memory accesses that is most restrictive (involved in the smallest
4929 // dependence distance).
4930 unsigned MaxSafeElements =
4931 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4932
4933 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4934 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4935
4936  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4937                    << ".\n");
4938  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4939                    << ".\n");
4940
4941 // First analyze the UserVF, fall back if the UserVF should be ignored.
4942 if (UserVF) {
4943 auto MaxSafeUserVF =
4944 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4945
4946 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4947 // If `VF=vscale x N` is safe, then so is `VF=N`
4948 if (UserVF.isScalable())
4949 return FixedScalableVFPair(
4950 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4951 else
4952 return UserVF;
4953 }
4954
4955    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4956
4957 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4958 // is better to ignore the hint and let the compiler choose a suitable VF.
4959 if (!UserVF.isScalable()) {
4960      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4961                        << " is unsafe, clamping to max safe VF="
4962                        << MaxSafeFixedVF << ".\n");
4963 ORE->emit([&]() {
4964        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4965 TheLoop->getStartLoc(),
4966 TheLoop->getHeader())
4967 << "User-specified vectorization factor "
4968 << ore::NV("UserVectorizationFactor", UserVF)
4969 << " is unsafe, clamping to maximum safe vectorization factor "
4970 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4971 });
4972 return MaxSafeFixedVF;
4973 }
4974
4975 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4976      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4977                        << " is ignored because scalable vectors are not "
4978                           "available.\n");
4979 ORE->emit([&]() {
4980        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4981 TheLoop->getStartLoc(),
4982 TheLoop->getHeader())
4983 << "User-specified vectorization factor "
4984 << ore::NV("UserVectorizationFactor", UserVF)
4985 << " is ignored because the target does not support scalable "
4986 "vectors. The compiler will pick a more suitable value.";
4987 });
4988 } else {
4989      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4990                        << " is unsafe. Ignoring scalable UserVF.\n");
4991 ORE->emit([&]() {
4992        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4993 TheLoop->getStartLoc(),
4994 TheLoop->getHeader())
4995 << "User-specified vectorization factor "
4996 << ore::NV("UserVectorizationFactor", UserVF)
4997 << " is unsafe. Ignoring the hint to let the compiler pick a "
4998 "more suitable value.";
4999 });
5000 }
5001 }
5002
5003  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5004                    << " / " << WidestType << " bits.\n");
5005
5006 FixedScalableVFPair Result(ElementCount::getFixed(1),
5007 ElementCount::getScalable(0));
5008 if (auto MaxVF =
5009 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5010 MaxSafeFixedVF, FoldTailByMasking))
5011 Result.FixedVF = MaxVF;
5012
5013 if (auto MaxVF =
5014 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5015 MaxSafeScalableVF, FoldTailByMasking))
5016 if (MaxVF.isScalable()) {
5017 Result.ScalableVF = MaxVF;
5018      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5019                        << "\n");
5020 }
5021
5022 return Result;
5023}
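
// --- Illustrative sketch (not part of LoopVectorize.cpp) ---------------------
// The safe-element computation at the top of computeFeasibleMaxVF with made-up
// numbers: if LAA reports a maximum safe dependence distance of 544 bits and
// the widest loop type is 32 bits, then PowerOf2Floor(544 / 32) ==
// PowerOf2Floor(17) == 16, so MaxSafeFixedVF is 16. A fixed UserVF of 32 would
// be clamped back to 16 (with the remark emitted above), while UserVF == 8
// would be accepted as-is.
static unsigned maxSafeFixedElements(unsigned MaxSafeBits,
                                     unsigned WidestTypeBits) {
  unsigned N = MaxSafeBits / WidestTypeBits; // 544 / 32 == 17
  unsigned P = 1;
  while (P * 2 <= N)                         // a hand-rolled PowerOf2Floor
    P *= 2;
  return P;                                  // == 16
}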
5024
5025FixedScalableVFPair
5026LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5027 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5028    // TODO: It may be useful to do this, since it's still likely to be dynamically
5029 // uniform if the target can skip.
5030 reportVectorizationFailure(
5031 "Not inserting runtime ptr check for divergent target",
5032 "runtime pointer checks needed. Not enabled for divergent target",
5033 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5034 return FixedScalableVFPair::getNone();
5035 }
5036
5037 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5038  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5039 if (TC == 1) {
5040 reportVectorizationFailure("Single iteration (non) loop",
5041 "loop trip count is one, irrelevant for vectorization",
5042 "SingleIterationLoop", ORE, TheLoop);
5043 return FixedScalableVFPair::getNone();
5044 }
5045
5046 switch (ScalarEpilogueStatus) {
5047 case CM_ScalarEpilogueAllowed:
5048 return computeFeasibleMaxVF(TC, UserVF, false);
5049 case CM_ScalarEpilogueNotAllowedUsePredicate:
5050 [[fallthrough]];
5051 case CM_ScalarEpilogueNotNeededUsePredicate:
5052    LLVM_DEBUG(
5053        dbgs() << "LV: vector predicate hint/switch found.\n"
5054               << "LV: Not allowing scalar epilogue, creating predicated "
5055               << "vector loop.\n");
5056 break;
5057 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5058 // fallthrough as a special case of OptForSize
5059 case CM_ScalarEpilogueNotAllowedOptSize:
5060 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5061      LLVM_DEBUG(
5062          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5063 else
5064      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5065                        << "count.\n");
5066
5067 // Bail if runtime checks are required, which are not good when optimising
5068 // for size.
5069 if (runtimeChecksRequired())
5070 return FixedScalableVFPair::getNone();
5071
5072 break;
5073 }
5074
5075 // The only loops we can vectorize without a scalar epilogue, are loops with
5076 // a bottom-test and a single exiting block. We'd have to handle the fact
5077 // that not every instruction executes on the last iteration. This will
5078 // require a lane mask which varies through the vector loop body. (TODO)
5079 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5080 // If there was a tail-folding hint/switch, but we can't fold the tail by
5081 // masking, fallback to a vectorization with a scalar epilogue.
5082 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5083      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5084                           "scalar epilogue instead.\n");
5085 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5086 return computeFeasibleMaxVF(TC, UserVF, false);
5087 }
5088 return FixedScalableVFPair::getNone();
5089 }
5090
5091 // Now try the tail folding
5092
5093 // Invalidate interleave groups that require an epilogue if we can't mask
5094 // the interleave-group.
5095 if (!useMaskedInterleavedAccesses(TTI)) {
5096    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5097           "No decisions should have been taken at this point");
5098 // Note: There is no need to invalidate any cost modeling decisions here, as
5099    // none were taken so far.
5100 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5101 }
5102
5103 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5104 // Avoid tail folding if the trip count is known to be a multiple of any VF
5105 // we chose.
5106 // FIXME: The condition below pessimises the case for fixed-width vectors,
5107 // when scalable VFs are also candidates for vectorization.
5108 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5109 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5110    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5111           "MaxFixedVF must be a power of 2");
5112 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5113 : MaxFixedVF.getFixedValue();
5114 ScalarEvolution *SE = PSE.getSE();
5115 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5116 const SCEV *ExitCount = SE->getAddExpr(
5117 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5118 const SCEV *Rem = SE->getURemExpr(
5119 SE->applyLoopGuards(ExitCount, TheLoop),
5120 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5121 if (Rem->isZero()) {
5122 // Accept MaxFixedVF if we do not have a tail.
5123      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5124 return MaxFactors;
5125 }
5126 }
5127
5128 // If we don't know the precise trip count, or if the trip count that we
5129 // found modulo the vectorization factor is not zero, try to fold the tail
5130 // by masking.
5131 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5132 if (Legal->prepareToFoldTailByMasking()) {
5133 FoldTailByMasking = true;
5134 return MaxFactors;
5135 }
5136
5137 // If there was a tail-folding hint/switch, but we can't fold the tail by
5138 // masking, fallback to a vectorization with a scalar epilogue.
5139 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5140    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5141                         "scalar epilogue instead.\n");
5142 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5143 return MaxFactors;
5144 }
5145
5146 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5147    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5148 return FixedScalableVFPair::getNone();
5149 }
5150
5151 if (TC == 0) {
5152 reportVectorizationFailure(
5153 "Unable to calculate the loop count due to complex control flow",
5154 "unable to calculate the loop count due to complex control flow",
5155 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5156 return FixedScalableVFPair::getNone();
5157 }
5158
5159 reportVectorizationFailure(
5160 "Cannot optimize for size and vectorize at the same time.",
5161 "cannot optimize for size and vectorize at the same time. "
5162 "Enable vectorization of this loop with '#pragma clang loop "
5163 "vectorize(enable)' when compiling with -Os/-Oz",
5164 "NoTailLoopWithOptForSize", ORE, TheLoop);
5165 return FixedScalableVFPair::getNone();
5166}
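// Recap of the decision ladder above, stated as a sketch (the conditions are
// the ones already checked in the code, not new behaviour): an exit count that
// is a multiple of MaxVF*IC keeps MaxFactors unchanged; otherwise the tail is
// folded by masking when Legal->prepareToFoldTailByMasking() succeeds; a
// CM_ScalarEpilogueNotNeededUsePredicate hint falls back to a scalar epilogue;
// and the remaining cases (predication required but unavailable, an unknown
// trip count with complex control flow, or optimizing for size with an
// unavoidable tail) return FixedScalableVFPair::getNone() to reject
// vectorization.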
5167
5168ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5169 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5170 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5171 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5172 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5173 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5174 : TargetTransformInfo::RGK_FixedWidthVector);
5175
5176 // Convenience function to return the minimum of two ElementCounts.
5177 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5178     assert((LHS.isScalable() == RHS.isScalable()) &&
5179            "Scalable flags must match");
5180 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5181 };
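  // For example (hypothetical element counts): MinVF(fixed 8, fixed 4) yields
  // fixed 4, and MinVF(scalable 2, scalable 4) yields scalable 2. Mixing a
  // fixed and a scalable operand would trip the assertion, which is why both
  // MaxSafeVF and MaxVectorElementCount below are built with the same
  // scalable flag (ComputeScalableMaxVF).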
5182
5183 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5184   // Note that both WidestRegister and WidestType may not be powers of 2.
5185 auto MaxVectorElementCount = ElementCount::get(
5186 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5187 ComputeScalableMaxVF);
5188 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5189   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5190                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5191
5192 if (!MaxVectorElementCount) {
5193     LLVM_DEBUG(dbgs() << "LV: The target has no "
5194                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5195                       << " vector registers.\n");
5196 return ElementCount::getFixed(1);
5197 }
5198
5199 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5200 if (MaxVectorElementCount.isScalable() &&
5201 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5202 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5203 auto Min = Attr.getVScaleRangeMin();
5204 WidestRegisterMinEC *= Min;
5205 }
5206 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5207 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5208 // If loop trip count (TC) is known at compile time there is no point in
5209 // choosing VF greater than TC (as done in the loop below). Select maximum
5210 // power of two which doesn't exceed TC.
5211 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5212 // when the TC is less than or equal to the known number of lanes.
5213 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5214     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5215                          "exceeding the constant trip count: "
5216                       << ClampedConstTripCount << "\n");
5217 return ElementCount::getFixed(ClampedConstTripCount);
5218 }
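  // Illustrative example (hypothetical target, tail not folded): with 128-bit
  // vector registers and a widest type of 32 bits, MaxVectorElementCount is 4;
  // for a constant trip count of 3 (<= WidestRegisterMinEC of 4),
  // PowerOf2Floor(3) == 2, so a fixed VF of 2 is returned rather than a wider
  // VF that would leave most of the work to the scalar remainder.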
5219
5220 TargetTransformInfo::RegisterKind RegKind =
5221 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5222 : TargetTransformInfo::RGK_FixedWidthVector;
5223 ElementCount MaxVF = MaxVectorElementCount;
5224 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5225 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5226 auto MaxVectorElementCountMaxBW = ElementCount::get(
5227 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5228 ComputeScalableMaxVF);
5229 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5230
5231 // Collect all viable vectorization factors larger than the default MaxVF
5232 // (i.e. MaxVectorElementCount).
5233 SmallVector<ElementCount, 8> VFs;
5234 for (ElementCount VS = MaxVectorElementCount * 2;
5235 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5236 VFs.push_back(VS);
5237
5238 // For each VF calculate its register usage.
5239 auto RUs = calculateRegisterUsage(VFs);
5240
5241 // Select the largest VF which doesn't require more registers than existing
5242 // ones.
5243 for (int i = RUs.size() - 1; i >= 0; --i) {
5244 bool Selected = true;
5245 for (auto &pair : RUs[i].MaxLocalUsers) {
5246 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5247 if (pair.second > TargetNumRegisters)
5248 Selected = false;
5249 }
5250 if (Selected) {
5251 MaxVF = VFs[i];
5252 break;
5253 }
5254 }
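    // Illustrative example (hypothetical numbers): if MaxVectorElementCount is
    // 4 and MaxVectorElementCountMaxBW is 16, VFs holds {8, 16}. If RUs[1]
    // (VF 16) needs, say, 40 registers of some class on a target providing
    // only 32, it is rejected; RUs[0] (VF 8) needing 20 fits, so MaxVF becomes
    // 8. If no candidate fits, MaxVF stays at MaxVectorElementCount.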
5255 if (ElementCount MinVF =
5256 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5257 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5258         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5259                           << ") with target's minimum: " << MinVF << '\n');
5260 MaxVF = MinVF;
5261 }
5262 }
5263
5264 // Invalidate any widening decisions we might have made, in case the loop
5265     // requires predication (decided later), but we have already made some
5266 // load/store widening decisions.
5267 invalidateCostModelingDecisions();
5268 }
5269 return MaxVF;
5270}
5271
5272std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5273 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5274 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5275 auto Min = Attr.getVScaleRangeMin();
5276 auto Max = Attr.getVScaleRangeMax();
5277 if (Max && Min == Max)
5278 return Max;
5279 }
5280
5281 return TTI.getVScaleForTuning();
5282}
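// For example, a function attribute vscale_range(2,2) pins vscale to 2, so 2
// is returned for tuning; with vscale_range(1,16) (min != max) or no attribute
// at all, the decision is deferred to TTI.getVScaleForTuning(), which may
// itself return an empty optional.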
5283
5284bool LoopVectorizationCostModel::isMoreProfitable(
5285 const VectorizationFactor &A, const VectorizationFactor &B) const {
5286 InstructionCost CostA = A.Cost;
5287 InstructionCost CostB = B.Cost;
5288
5289 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5290
5291 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5292 MaxTripCount) {
5293 // If we are folding the tail and the trip count is a known (possibly small)
5294 // constant, the trip count will be rounded up to an integer number of
5295 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5296 // which we compare directly. When not folding the tail, the total cost will
5297 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5298 // approximated with the per-lane cost below instead of using the tripcount
5299 // as here.
5300 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5301 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5302 return RTCostA < RTCostB;
5303 }
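  // Worked example with hypothetical costs: MaxTripCount = 10, A = {VF 4,
  // cost 12}, B = {VF 8, cost 20}. With the tail folded, RTCostA = 12 *
  // ceil(10/4) = 36 and RTCostB = 20 * ceil(10/8) = 40, so A is reported as
  // more profitable despite its smaller width.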
5304
5305 // Improve estimate for the vector width if it is scalable.
5306 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5307 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5308 if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5309 if (A.Width.isScalable())
5310 EstimatedWidthA *= *VScale;
5311 if (B.Width.isScalable())
5312 EstimatedWidthB *= *VScale;
5313 }
5314
5315 // Assume vscale may be larger than 1 (or the value being tuned for),
5316 // so that scalable vectorization is slightly favorable over fixed-width
5317 // vectorization.
5318 if (A.Width.isScalable() && !B.Width.isScalable())
5319 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5320
5321 // To avoid the need for FP division:
5322 // (CostA / A.Width) < (CostB / B.Width)
5323 // <=> (CostA * B.Width) < (CostB * A.Width)
5324 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5325}
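// Without tail folding the comparison is per lane via cross-multiplication,
// e.g. with hypothetical costs A = {VF 4, cost 6} and B = {VF 8, cost 10}:
// 6 * 8 = 48 is not less than 10 * 4 = 40, so A is not considered more
// profitable than B (B has the lower per-lane cost, 10/8 vs 6/4). When only A
// is scalable, the '<=' in the scalable-vs-fixed case above breaks ties in
// favour of the scalable factor.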
5326
5327VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5328 const ElementCountSet &VFCandidates) {
5329 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5330   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5331   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5332   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5333          "Expected Scalar VF to be a candidate");
5334
5335 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5336 ExpectedCost);
5337 VectorizationFactor ChosenFactor = ScalarCost;
5338
5339 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5340 if (ForceVectorization && VFCandidates.size() > 1) {
5341 // Ignore scalar width, because the user explicitly wants vectorization.
5342 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5343 // evaluation.
5344 ChosenFactor.Cost = InstructionCost::getMax();
5345 }
5346
5347 SmallVector<InstructionVFPair> InvalidCosts;
5348 for (const auto &i : VFCandidates) {
5349 // The cost for scalar VF=1 is already calculated, so ignore it.
5350 if (i.isScalar())
5351 continue;
5352
5353 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5354 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5355
5356#ifndef NDEBUG
5357 unsigned AssumedMinimumVscale = 1;
5358 if (std::optional<unsigned> VScale = getVScaleForTuning())
5359 AssumedMinimumVscale = *VScale;
5360 unsigned Width =
5361 Candidate.Width.isScalable()
5362 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5363 : Candidate.Width.getFixedValue();
5364     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5365                       << " costs: " << (Candidate.Cost / Width));
5366     if (i.isScalable())
5367       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5368                         << AssumedMinimumVscale << ")");
5369     LLVM_DEBUG(dbgs() << ".\n");
5370#endif
5371
5372 if (!C.second && !ForceVectorization) {
5373       LLVM_DEBUG(
5374           dbgs() << "LV: Not considering vector loop of width " << i
5375                  << " because it will not generate any vector instructions.\n");
5377 }
5378
5379     // If profitable, add it to the ProfitableVFs list.
5380 if (isMoreProfitable(Candidate, ScalarCost))
5381 ProfitableVFs.push_back(Candidate);
5382
5383 if (isMoreProfitable(Candidate, ChosenFactor))
5384 ChosenFactor = Candidate;
5385 }
5386
5387 // Emit a report of VFs with invalid costs in the loop.
5388 if (!InvalidCosts.empty()) {
5389 // Group the remarks per instruction, keeping the instruction order from
5390 // InvalidCosts.
5391 std::map<Instruction *, unsigned> Numbering;
5392 unsigned I = 0;
5393 for (auto &Pair : InvalidCosts)
5394 if (!Numbering.count(Pair.first))
5395 Numbering[Pair.first] = I++;
5396
5397 // Sort the list, first on instruction(number) then on VF.
5398 llvm::sort(InvalidCosts,
5399 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5400 if (Numbering[A.first] != Numbering[B.first])
5401 return Numbering[A.first] < Numbering[B.first];
5402 ElementCountComparator ECC;
5403 return ECC(A.second, B.second);
5404 });
5405
5406 // For a list of ordered instruction-vf pairs:
5407 // [(load, vf1), (load, vf2), (store, vf1)]
5408 // Group the instructions together to emit separate remarks for:
5409 // load (vf1, vf2)
5410 // store (vf1)
5411 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5412 auto Subset = ArrayRef<InstructionVFPair>();
5413 do {
5414 if (Subset.empty())
5415 Subset = Tail.take_front(1);
5416
5417 Instruction *I = Subset.front().first;
5418
5419 // If the next instruction is different, or if there are no other pairs,
5420 // emit a remark for the collated subset. e.g.
5421       //   [(load, vf1), (load, vf2)]
5422 // to emit:
5423       //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5424 if (Subset == Tail || Tail[Subset.size()].first != I) {
5425 std::string OutString;
5426 raw_string_ostream OS(OutString);
5427         assert(!Subset.empty() && "Unexpected empty range");
5428 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5429 for (const auto &Pair : Subset)
5430 OS << (Pair.second == Subset.front().second ? "" : ", ")
5431 << Pair.second;
5432 OS << "):";
5433 if (auto *CI = dyn_cast<CallInst>(I))
5434 OS << " call to " << CI->getCalledFunction()->getName();
5435 else