Bug Summary

File:build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning:line 8133, column 35
Potential leak of memory pointed to by 'BlockMask'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Transforms/Vectorize -I /build/source/llvm/lib/Transforms/Vectorize -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1674602410 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-01-25-024556-16494-1 -x c++ /build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/STLExtras.h"
69#include "llvm/ADT/SmallPtrSet.h"
70#include "llvm/ADT/SmallSet.h"
71#include "llvm/ADT/SmallVector.h"
72#include "llvm/ADT/Statistic.h"
73#include "llvm/ADT/StringRef.h"
74#include "llvm/ADT/Twine.h"
75#include "llvm/ADT/iterator_range.h"
76#include "llvm/Analysis/AssumptionCache.h"
77#include "llvm/Analysis/BasicAliasAnalysis.h"
78#include "llvm/Analysis/BlockFrequencyInfo.h"
79#include "llvm/Analysis/CFG.h"
80#include "llvm/Analysis/CodeMetrics.h"
81#include "llvm/Analysis/DemandedBits.h"
82#include "llvm/Analysis/GlobalsModRef.h"
83#include "llvm/Analysis/LoopAccessAnalysis.h"
84#include "llvm/Analysis/LoopAnalysisManager.h"
85#include "llvm/Analysis/LoopInfo.h"
86#include "llvm/Analysis/LoopIterator.h"
87#include "llvm/Analysis/OptimizationRemarkEmitter.h"
88#include "llvm/Analysis/ProfileSummaryInfo.h"
89#include "llvm/Analysis/ScalarEvolution.h"
90#include "llvm/Analysis/ScalarEvolutionExpressions.h"
91#include "llvm/Analysis/TargetLibraryInfo.h"
92#include "llvm/Analysis/TargetTransformInfo.h"
93#include "llvm/Analysis/ValueTracking.h"
94#include "llvm/Analysis/VectorUtils.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/DebugLoc.h"
103#include "llvm/IR/DerivedTypes.h"
104#include "llvm/IR/DiagnosticInfo.h"
105#include "llvm/IR/Dominators.h"
106#include "llvm/IR/Function.h"
107#include "llvm/IR/IRBuilder.h"
108#include "llvm/IR/InstrTypes.h"
109#include "llvm/IR/Instruction.h"
110#include "llvm/IR/Instructions.h"
111#include "llvm/IR/IntrinsicInst.h"
112#include "llvm/IR/Intrinsics.h"
113#include "llvm/IR/Metadata.h"
114#include "llvm/IR/Module.h"
115#include "llvm/IR/Operator.h"
116#include "llvm/IR/PatternMatch.h"
117#include "llvm/IR/Type.h"
118#include "llvm/IR/Use.h"
119#include "llvm/IR/User.h"
120#include "llvm/IR/Value.h"
121#include "llvm/IR/ValueHandle.h"
122#include "llvm/IR/Verifier.h"
123#include "llvm/InitializePasses.h"
124#include "llvm/Pass.h"
125#include "llvm/Support/Casting.h"
126#include "llvm/Support/CommandLine.h"
127#include "llvm/Support/Compiler.h"
128#include "llvm/Support/Debug.h"
129#include "llvm/Support/ErrorHandling.h"
130#include "llvm/Support/InstructionCost.h"
131#include "llvm/Support/MathExtras.h"
132#include "llvm/Support/raw_ostream.h"
133#include "llvm/Transforms/Utils/BasicBlockUtils.h"
134#include "llvm/Transforms/Utils/InjectTLIMappings.h"
135#include "llvm/Transforms/Utils/LoopSimplify.h"
136#include "llvm/Transforms/Utils/LoopUtils.h"
137#include "llvm/Transforms/Utils/LoopVersioning.h"
138#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139#include "llvm/Transforms/Utils/SizeOpts.h"
140#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141#include <algorithm>
142#include <cassert>
143#include <cmath>
144#include <cstdint>
145#include <functional>
146#include <iterator>
147#include <limits>
148#include <map>
149#include <memory>
150#include <string>
151#include <tuple>
152#include <utility>
153
154using namespace llvm;
155
156#define LV_NAME"loop-vectorize" "loop-vectorize"
157#define DEBUG_TYPE"loop-vectorize" LV_NAME"loop-vectorize"
158
159#ifndef NDEBUG
160const char VerboseDebug[] = DEBUG_TYPE"loop-vectorize" "-verbose";
161#endif
162
163/// @{
164/// Metadata attribute names
165const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166const char LLVMLoopVectorizeFollowupVectorized[] =
167 "llvm.loop.vectorize.followup_vectorized";
168const char LLVMLoopVectorizeFollowupEpilogue[] =
169 "llvm.loop.vectorize.followup_epilogue";
170/// @}
171
172STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = {"loop-vectorize", "LoopsVectorized"
, "Number of loops vectorized"}
;
173STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = {"loop-vectorize", "LoopsAnalyzed"
, "Number of loops analyzed for vectorization"}
;
174STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized")static llvm::Statistic LoopsEpilogueVectorized = {"loop-vectorize"
, "LoopsEpilogueVectorized", "Number of epilogues vectorized"
}
;
175
176static cl::opt<bool> EnableEpilogueVectorization(
177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178 cl::desc("Enable vectorization of epilogue loops."));
179
180static cl::opt<unsigned> EpilogueVectorizationForceVF(
181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182 cl::desc("When epilogue vectorization is enabled, and a value greater than "
183 "1 is specified, forces the given VF for all applicable epilogue "
184 "loops."));
185
186static cl::opt<unsigned> EpilogueVectorizationMinVF(
187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188 cl::desc("Only loops with vectorization factor equal to or larger than "
189 "the specified value are considered for epilogue vectorization."));
190
191/// Loops with a known constant trip count below this number are vectorized only
192/// if no scalar iteration overheads are incurred.
193static cl::opt<unsigned> TinyTripCountVectorThreshold(
194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195 cl::desc("Loops with a constant trip count that is smaller than this "
196 "value are vectorized only if no scalar iteration overheads "
197 "are incurred."));
198
199static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201 cl::desc("The maximum allowed number of runtime memory checks"));
202
203// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204// that predication is preferred, and this lists all options. I.e., the
205// vectorizer will try to fold the tail-loop (epilogue) into the vector body
206// and predicate the instructions accordingly. If tail-folding fails, there are
207// different fallback strategies depending on these values:
208namespace PreferPredicateTy {
209 enum Option {
210 ScalarEpilogue = 0,
211 PredicateElseScalarEpilogue,
212 PredicateOrDontVectorize
213 };
214} // namespace PreferPredicateTy
215
216static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217 "prefer-predicate-over-epilogue",
218 cl::init(PreferPredicateTy::ScalarEpilogue),
219 cl::Hidden,
220 cl::desc("Tail-folding and predication preferences over creating a scalar "
221 "epilogue loop."),
222 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
223 "scalar-epilogue",llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
224 "Don't tail-predicate loops, create scalar epilogue")llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
,
225 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
226 "predicate-else-scalar-epilogue",llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
227 "prefer tail-folding, create scalar epilogue if tail "llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
228 "folding fails.")llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
,
229 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
230 "predicate-dont-vectorize",llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
231 "prefers tail-folding, don't attempt vectorization if "llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
232 "tail-folding fails.")llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
));
233
234static cl::opt<bool> MaximizeBandwidth(
235 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236 cl::desc("Maximize bandwidth when selecting vectorization factor which "
237 "will be determined by the smallest type in loop."));
238
239static cl::opt<bool> EnableInterleavedMemAccesses(
240 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242
243/// An interleave-group may need masking if it resides in a block that needs
244/// predication, or in order to mask away gaps.
245static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248
249static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251 cl::desc("We don't interleave loops with a estimated constant trip count "
252 "below this number"));
253
254static cl::opt<unsigned> ForceTargetNumScalarRegs(
255 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256 cl::desc("A flag that overrides the target's number of scalar registers."));
257
258static cl::opt<unsigned> ForceTargetNumVectorRegs(
259 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of vector registers."));
261
262static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's max interleave factor for "
265 "scalar loops."));
266
267static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's max interleave factor for "
270 "vectorized loops."));
271
272static cl::opt<unsigned> ForceTargetInstructionCost(
273 "force-target-instruction-cost", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's expected cost for "
275 "an instruction to a single constant value. Mostly "
276 "useful for getting consistent testing."));
277
278static cl::opt<bool> ForceTargetSupportsScalableVectors(
279 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280 cl::desc(
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
283
284static cl::opt<unsigned> SmallLoopCost(
285 "small-loop-cost", cl::init(20), cl::Hidden,
286 cl::desc(
287 "The cost of a loop that is considered 'small' by the interleaver."));
288
289static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291 cl::desc("Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
294
295// Runtime interleave loops for load/store throughput.
296static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298 cl::desc(
299 "Enable runtime interleaving until load/store ports are saturated"));
300
301/// Interleave small loops with scalar reductions.
302static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304 cl::desc("Enable interleaving for loops with small iteration counts that "
305 "contain scalar reductions to expose ILP."));
306
307/// The number of stores in a loop that are allowed to need predication.
308static cl::opt<unsigned> NumberOfStoresToPredicate(
309 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310 cl::desc("Max number of stores to be predicated behind an if."));
311
312static cl::opt<bool> EnableIndVarRegisterHeur(
313 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314 cl::desc("Count the induction variable only once when interleaving"));
315
316static cl::opt<bool> EnableCondStoresVectorization(
317 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318 cl::desc("Enable if predication of stores during vectorization."));
319
320static cl::opt<unsigned> MaxNestedScalarReductionIC(
321 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322 cl::desc("The maximum interleave count to use when interleaving a scalar "
323 "reduction in a nested loop."));
324
325static cl::opt<bool>
326 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327 cl::Hidden,
328 cl::desc("Prefer in-loop vector reductions, "
329 "overriding the targets preference."));
330
331static cl::opt<bool> ForceOrderedReductions(
332 "force-ordered-reductions", cl::init(false), cl::Hidden,
333 cl::desc("Enable the vectorisation of loops with in-order (strict) "
334 "FP reductions"));
335
336static cl::opt<bool> PreferPredicatedReductionSelect(
337 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338 cl::desc(
339 "Prefer predicating a reduction operation over an after loop select."));
340
341cl::opt<bool> EnableVPlanNativePath(
342 "enable-vplan-native-path", cl::init(false), cl::Hidden,
343 cl::desc("Enable VPlan-native vectorization path with "
344 "support for outer loop vectorization."));
345
346// This flag enables the stress testing of the VPlan H-CFG construction in the
347// VPlan-native vectorization path. It must be used in conjuction with
348// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349// verification of the H-CFGs built.
350static cl::opt<bool> VPlanBuildStressTest(
351 "vplan-build-stress-test", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
356
357cl::opt<bool> llvm::EnableLoopInterleaving(
358 "interleave-loops", cl::init(true), cl::Hidden,
359 cl::desc("Enable loop interleaving in Loop vectorization passes"));
360cl::opt<bool> llvm::EnableLoopVectorization(
361 "vectorize-loops", cl::init(true), cl::Hidden,
362 cl::desc("Run the Loop vectorization passes"));
363
364static cl::opt<bool> PrintVPlansInDotFormat(
365 "vplan-print-in-dot-format", cl::Hidden,
366 cl::desc("Use dot format instead of plain text when dumping VPlans"));
367
368static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369 "force-widen-divrem-via-safe-divisor", cl::Hidden,
370 cl::desc(
371 "Override cost based safe divisor widening for div/rem instructions"));
372
373/// A helper function that returns true if the given type is irregular. The
374/// type is irregular if its allocated size doesn't equal the store size of an
375/// element of the corresponding vector type.
376static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377 // Determine if an array of N elements of type Ty is "bitcast compatible"
378 // with a <N x Ty> vector.
379 // This is only true if there is no padding between the array elements.
380 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381}
382
383/// A helper function that returns the reciprocal of the block probability of
384/// predicated blocks. If we return X, we are assuming the predicated block
385/// will execute once for every X iterations of the loop header.
386///
387/// TODO: We should use actual block probability here, if available. Currently,
388/// we always assume predicated blocks have a 50% chance of executing.
389static unsigned getReciprocalPredBlockProb() { return 2; }
390
391/// A helper function that returns an integer or floating-point constant with
392/// value C.
393static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395 : ConstantFP::get(Ty, C);
396}
397
398/// Returns "best known" trip count for the specified loop \p L as defined by
399/// the following procedure:
400/// 1) Returns exact trip count if it is known.
401/// 2) Returns expected trip count according to profile data if any.
402/// 3) Returns upper bound estimate if it is known.
403/// 4) Returns std::nullopt if all of the above failed.
404static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405 Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return *EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return std::nullopt;
420}
421
422namespace {
423// Forward declare GeneratedRTChecks.
424class GeneratedRTChecks;
425} // namespace
426
427namespace llvm {
428
429AnalysisKey ShouldRunExtraVectorPasses::Key;
430
431/// InnerLoopVectorizer vectorizes loops which contain only one basic
432/// block to a specified vectorization factor (VF).
433/// This class performs the widening of scalars into vectors, or multiple
434/// scalars. This class also implements the following features:
435/// * It inserts an epilogue loop for handling loops that don't have iteration
436/// counts that are known to be a multiple of the vectorization factor.
437/// * It handles the code generation for reduction variables.
438/// * Scalarization (implementation using scalars) of un-vectorizable
439/// instructions.
440/// InnerLoopVectorizer does not perform any vectorization-legality
441/// checks, and relies on the caller to check for the different legality
442/// aspects. The InnerLoopVectorizer relies on the
443/// LoopVectorizationLegality class to provide information about the induction
444/// and reduction variables that were found to a given vectorization factor.
445class InnerLoopVectorizer {
446public:
447 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448 LoopInfo *LI, DominatorTree *DT,
449 const TargetLibraryInfo *TLI,
450 const TargetTransformInfo *TTI, AssumptionCache *AC,
451 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452 ElementCount MinProfitableTripCount,
453 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459 PSI(PSI), RTChecks(RTChecks) {
460 // Query this against the original loop and save it here because the profile
461 // of the original loop header may change as the transformation happens.
462 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464
465 if (MinProfitableTripCount.isZero())
466 this->MinProfitableTripCount = VecWidth;
467 else
468 this->MinProfitableTripCount = MinProfitableTripCount;
469 }
470
471 virtual ~InnerLoopVectorizer() = default;
472
473 /// Create a new empty loop that will contain vectorized instructions later
474 /// on, while the old loop will be used as the scalar remainder. Control flow
475 /// is generated around the vectorized (and scalar epilogue) loops consisting
476 /// of various checks and bypasses. Return the pre-header block of the new
477 /// loop and the start value for the canonical induction, if it is != 0. The
478 /// latter is the case when vectorizing the epilogue loop. In the case of
479 /// epilogue vectorization, this function is overriden to handle the more
480 /// complex control flow around the loops.
481 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
482
483 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
484 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
485
486 // Return true if any runtime check is added.
487 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488
489 /// A type for vectorized values in the new loop. Each value from the
490 /// original loop, when vectorized, is represented by UF vector values in the
491 /// new unrolled loop, where UF is the unroll factor.
492 using VectorParts = SmallVector<Value *, 2>;
493
494 /// A helper function to scalarize a single Instruction in the innermost loop.
495 /// Generates a sequence of scalar instances for each lane between \p MinLane
496 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
497 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
498 /// Instr's operands.
499 void scalarizeInstruction(const Instruction *Instr,
500 VPReplicateRecipe *RepRecipe,
501 const VPIteration &Instance, bool IfPredicateInstr,
502 VPTransformState &State);
503
504 /// Construct the vector value of a scalarized value \p V one lane at a time.
505 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
506 VPTransformState &State);
507
508 /// Try to vectorize interleaved access group \p Group with the base address
509 /// given in \p Addr, optionally masking the vector operations if \p
510 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
511 /// values in the vectorized loop.
512 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
513 ArrayRef<VPValue *> VPDefs,
514 VPTransformState &State, VPValue *Addr,
515 ArrayRef<VPValue *> StoredValues,
516 VPValue *BlockInMask = nullptr);
517
518 /// Fix the non-induction PHIs in \p Plan.
519 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520
521 /// Returns true if the reordering of FP operations is not allowed, but we are
522 /// able to vectorize with strict in-order reductions for the given RdxDesc.
523 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524
525 /// Create a broadcast instruction. This method generates a broadcast
526 /// instruction (shuffle) for loop invariant values and for the induction
527 /// value. If this is the induction variable then we extend it to N, N+1, ...
528 /// this is needed because each iteration in the loop corresponds to a SIMD
529 /// element.
530 virtual Value *getBroadcastInstrs(Value *V);
531
532 // Returns the resume value (bc.merge.rdx) for a reduction as
533 // generated by fixReduction.
534 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
535
536 /// Create a new phi node for the induction variable \p OrigPhi to resume
537 /// iteration count in the scalar epilogue, from where the vectorized loop
538 /// left off. In cases where the loop skeleton is more complicated (eg.
539 /// epilogue vectorization) and the resume values can come from an additional
540 /// bypass block, the \p AdditionalBypass pair provides information about the
541 /// bypass block and the end value on the edge from bypass to this loop.
542 PHINode *createInductionResumeValue(
543 PHINode *OrigPhi, const InductionDescriptor &ID,
544 ArrayRef<BasicBlock *> BypassBlocks,
545 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
546
547protected:
548 friend class LoopVectorizationPlanner;
549
550 /// A small list of PHINodes.
551 using PhiVector = SmallVector<PHINode *, 4>;
552
553 /// A type for scalarized values in the new loop. Each value from the
554 /// original loop, when scalarized, is represented by UF x VF scalar values
555 /// in the new unrolled loop, where UF is the unroll factor and VF is the
556 /// vectorization factor.
557 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558
559 /// Set up the values of the IVs correctly when exiting the vector loop.
560 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561 Value *VectorTripCount, Value *EndValue,
562 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563 VPlan &Plan);
564
565 /// Handle all cross-iteration phis in the header.
566 void fixCrossIterationPHIs(VPTransformState &State);
567
568 /// Create the exit value of first order recurrences in the middle block and
569 /// update their users.
570 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571 VPTransformState &State);
572
573 /// Create code for the loop exit value of the reduction.
574 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575
576 /// Clear NSW/NUW flags from reduction instructions if necessary.
577 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578 VPTransformState &State);
579
580 /// Iteratively sink the scalarized operands of a predicated instruction into
581 /// the block that was created for it.
582 void sinkScalarOperands(Instruction *PredInst);
583
584 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585 /// represented as.
586 void truncateToMinimalBitwidths(VPTransformState &State);
587
588 /// Returns (and creates if needed) the original loop trip count.
589 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590
591 /// Returns (and creates if needed) the trip count of the widened loop.
592 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593
594 /// Returns a bitcasted value to the requested vector type.
595 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597 const DataLayout &DL);
598
599 /// Emit a bypass check to see if the vector trip count is zero, including if
600 /// it overflows.
601 void emitIterationCountCheck(BasicBlock *Bypass);
602
603 /// Emit a bypass check to see if all of the SCEV assumptions we've
604 /// had to make are correct. Returns the block containing the checks or
605 /// nullptr if no checks have been added.
606 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607
608 /// Emit bypass checks to check any memory assumptions we may have made.
609 /// Returns the block containing the checks or nullptr if no checks have been
610 /// added.
611 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612
613 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614 /// vector loop preheader, middle block and scalar preheader.
615 void createVectorLoopSkeleton(StringRef Prefix);
616
617 /// Create new phi nodes for the induction variables to resume iteration count
618 /// in the scalar epilogue, from where the vectorized loop left off.
619 /// In cases where the loop skeleton is more complicated (eg. epilogue
620 /// vectorization) and the resume values can come from an additional bypass
621 /// block, the \p AdditionalBypass pair provides information about the bypass
622 /// block and the end value on the edge from bypass to this loop.
623 void createInductionResumeValues(
624 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625
626 /// Complete the loop skeleton by adding debug MDs, creating appropriate
627 /// conditional branches in the middle block, preparing the builder and
628 /// running the verifier. Return the preheader of the completed vector loop.
629 BasicBlock *completeLoopSkeleton();
630
631 /// Collect poison-generating recipes that may generate a poison value that is
632 /// used after vectorization, even when their operands are not poison. Those
633 /// recipes meet the following conditions:
634 /// * Contribute to the address computation of a recipe generating a widen
635 /// memory load/store (VPWidenMemoryInstructionRecipe or
636 /// VPInterleaveRecipe).
637 /// * Such a widen memory load/store has at least one underlying Instruction
638 /// that is in a basic block that needs predication and after vectorization
639 /// the generated instruction won't be predicated.
640 void collectPoisonGeneratingRecipes(VPTransformState &State);
641
642 /// Allow subclasses to override and print debug traces before/after vplan
643 /// execution, when trace information is requested.
644 virtual void printDebugTracesAtStart(){};
645 virtual void printDebugTracesAtEnd(){};
646
647 /// The original loop.
648 Loop *OrigLoop;
649
650 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
651 /// dynamic knowledge to simplify SCEV expressions and converts them to a
652 /// more usable form.
653 PredicatedScalarEvolution &PSE;
654
655 /// Loop Info.
656 LoopInfo *LI;
657
658 /// Dominator Tree.
659 DominatorTree *DT;
660
661 /// Target Library Info.
662 const TargetLibraryInfo *TLI;
663
664 /// Target Transform Info.
665 const TargetTransformInfo *TTI;
666
667 /// Assumption Cache.
668 AssumptionCache *AC;
669
670 /// Interface to emit optimization remarks.
671 OptimizationRemarkEmitter *ORE;
672
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 ElementCount VF;
676
677 ElementCount MinProfitableTripCount;
678
679 /// The vectorization unroll factor to use. Each scalar is vectorized to this
680 /// many different vector instructions.
681 unsigned UF;
682
683 /// The builder that we use
684 IRBuilder<> Builder;
685
686 // --- Vectorization state ---
687
688 /// The vector-loop preheader.
689 BasicBlock *LoopVectorPreHeader;
690
691 /// The scalar-loop preheader.
692 BasicBlock *LoopScalarPreHeader;
693
694 /// Middle Block between the vector and the scalar.
695 BasicBlock *LoopMiddleBlock;
696
697 /// The unique ExitBlock of the scalar loop if one exists. Note that
698 /// there can be multiple exiting edges reaching this block.
699 BasicBlock *LoopExitBlock;
700
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
703
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706
707 /// Store instructions that were predicated.
708 SmallVector<Instruction *, 4> PredicatedInstructions;
709
710 /// Trip count of the original loop.
711 Value *TripCount = nullptr;
712
713 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
714 Value *VectorTripCount = nullptr;
715
716 /// The legality analysis.
717 LoopVectorizationLegality *Legal;
718
719 /// The profitablity analysis.
720 LoopVectorizationCostModel *Cost;
721
722 // Record whether runtime checks are added.
723 bool AddedSafetyChecks = false;
724
725 // Holds the end values for each induction variable. We save the end values
726 // so we can later fix-up the external users of the induction variables.
727 DenseMap<PHINode *, Value *> IVEndValues;
728
729 /// BFI and PSI are used to check for profile guided size optimizations.
730 BlockFrequencyInfo *BFI;
731 ProfileSummaryInfo *PSI;
732
733 // Whether this loop should be optimized for size based on profile guided size
734 // optimizatios.
735 bool OptForSizeBasedOnProfile;
736
737 /// Structure to hold information about generated runtime checks, responsible
738 /// for cleaning the checks, if vectorization turns out unprofitable.
739 GeneratedRTChecks &RTChecks;
740
741 // Holds the resume values for reductions in the loops, used to set the
742 // correct start value of reduction PHIs when vectorizing the epilogue.
743 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
744 ReductionResumeValues;
745};
746
747class InnerLoopUnroller : public InnerLoopVectorizer {
748public:
749 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
750 LoopInfo *LI, DominatorTree *DT,
751 const TargetLibraryInfo *TLI,
752 const TargetTransformInfo *TTI, AssumptionCache *AC,
753 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
754 LoopVectorizationLegality *LVL,
755 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
756 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
757 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
758 ElementCount::getFixed(1),
759 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
760 BFI, PSI, Check) {}
761
762private:
763 Value *getBroadcastInstrs(Value *V) override;
764};
765
766/// Encapsulate information regarding vectorization of a loop and its epilogue.
767/// This information is meant to be updated and used across two stages of
768/// epilogue vectorization.
769struct EpilogueLoopVectorizationInfo {
770 ElementCount MainLoopVF = ElementCount::getFixed(0);
771 unsigned MainLoopUF = 0;
772 ElementCount EpilogueVF = ElementCount::getFixed(0);
773 unsigned EpilogueUF = 0;
774 BasicBlock *MainLoopIterationCountCheck = nullptr;
775 BasicBlock *EpilogueIterationCountCheck = nullptr;
776 BasicBlock *SCEVSafetyCheck = nullptr;
777 BasicBlock *MemSafetyCheck = nullptr;
778 Value *TripCount = nullptr;
779 Value *VectorTripCount = nullptr;
780
781 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
782 ElementCount EVF, unsigned EUF)
783 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
784 assert(EUF == 1 &&(static_cast <bool> (EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."
) ? void (0) : __assert_fail ("EUF == 1 && \"A high UF for the epilogue loop is likely not beneficial.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 785, __extension__
__PRETTY_FUNCTION__))
785 "A high UF for the epilogue loop is likely not beneficial.")(static_cast <bool> (EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."
) ? void (0) : __assert_fail ("EUF == 1 && \"A high UF for the epilogue loop is likely not beneficial.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 785, __extension__
__PRETTY_FUNCTION__))
;
786 }
787};
788
789/// An extension of the inner loop vectorizer that creates a skeleton for a
790/// vectorized loop that has its epilogue (residual) also vectorized.
791/// The idea is to run the vplan on a given loop twice, firstly to setup the
792/// skeleton and vectorize the main loop, and secondly to complete the skeleton
793/// from the first step and vectorize the epilogue. This is achieved by
794/// deriving two concrete strategy classes from this base class and invoking
795/// them in succession from the loop vectorizer planner.
796class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
797public:
798 InnerLoopAndEpilogueVectorizer(
799 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
800 DominatorTree *DT, const TargetLibraryInfo *TLI,
801 const TargetTransformInfo *TTI, AssumptionCache *AC,
802 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
803 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
804 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
805 GeneratedRTChecks &Checks)
806 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
807 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
808 CM, BFI, PSI, Checks),
809 EPI(EPI) {}
810
811 // Override this function to handle the more complex control flow around the
812 // three loops.
813 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
814 return createEpilogueVectorizedLoopSkeleton();
815 }
816
817 /// The interface for creating a vectorized skeleton using one of two
818 /// different strategies, each corresponding to one execution of the vplan
819 /// as described above.
820 virtual std::pair<BasicBlock *, Value *>
821 createEpilogueVectorizedLoopSkeleton() = 0;
822
823 /// Holds and updates state information required to vectorize the main loop
824 /// and its epilogue in two separate passes. This setup helps us avoid
825 /// regenerating and recomputing runtime safety checks. It also helps us to
826 /// shorten the iteration-count-check path length for the cases where the
827 /// iteration count of the loop is so small that the main vector loop is
828 /// completely skipped.
829 EpilogueLoopVectorizationInfo &EPI;
830};
831
832/// A specialized derived class of inner loop vectorizer that performs
833/// vectorization of *main* loops in the process of vectorizing loops and their
834/// epilogues.
835class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
836public:
837 EpilogueVectorizerMainLoop(
838 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839 DominatorTree *DT, const TargetLibraryInfo *TLI,
840 const TargetTransformInfo *TTI, AssumptionCache *AC,
841 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844 GeneratedRTChecks &Check)
845 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846 EPI, LVL, CM, BFI, PSI, Check) {}
847 /// Implements the interface for creating a vectorized skeleton using the
848 /// *main loop* strategy (ie the first pass of vplan execution).
849 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
850
851protected:
852 /// Emits an iteration count bypass check once for the main loop (when \p
853 /// ForEpilogue is false) and once for the epilogue loop (when \p
854 /// ForEpilogue is true).
855 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
856 void printDebugTracesAtStart() override;
857 void printDebugTracesAtEnd() override;
858};
859
860// A specialized derived class of inner loop vectorizer that performs
861// vectorization of *epilogue* loops in the process of vectorizing loops and
862// their epilogues.
863class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
864public:
865 EpilogueVectorizerEpilogueLoop(
866 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
867 DominatorTree *DT, const TargetLibraryInfo *TLI,
868 const TargetTransformInfo *TTI, AssumptionCache *AC,
869 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
870 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
871 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
872 GeneratedRTChecks &Checks)
873 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
874 EPI, LVL, CM, BFI, PSI, Checks) {
875 TripCount = EPI.TripCount;
876 }
877 /// Implements the interface for creating a vectorized skeleton using the
878 /// *epilogue loop* strategy (ie the second pass of vplan execution).
879 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
880
881protected:
882 /// Emits an iteration count bypass check after the main vector loop has
883 /// finished to see if there are any iterations left to execute by either
884 /// the vector epilogue or the scalar epilogue.
885 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
886 BasicBlock *Bypass,
887 BasicBlock *Insert);
888 void printDebugTracesAtStart() override;
889 void printDebugTracesAtEnd() override;
890};
891} // end namespace llvm
892
893/// Look for a meaningful debug location on the instruction or it's
894/// operands.
895static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
896 if (!I)
897 return I;
898
899 DebugLoc Empty;
900 if (I->getDebugLoc() != Empty)
901 return I;
902
903 for (Use &Op : I->operands()) {
904 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
905 if (OpInst->getDebugLoc() != Empty)
906 return OpInst;
907 }
908
909 return I;
910}
911
912/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
913/// is passed, the message relates to that particular instruction.
914#ifndef NDEBUG
915static void debugVectorizationMessage(const StringRef Prefix,
916 const StringRef DebugMsg,
917 Instruction *I) {
918 dbgs() << "LV: " << Prefix << DebugMsg;
919 if (I != nullptr)
920 dbgs() << " " << *I;
921 else
922 dbgs() << '.';
923 dbgs() << '\n';
924}
925#endif
926
927/// Create an analysis remark that explains why vectorization failed
928///
929/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
930/// RemarkName is the identifier for the remark. If \p I is passed it is an
931/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
932/// the location of the remark. \return the remark object that can be
933/// streamed to.
934static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
935 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
936 Value *CodeRegion = TheLoop->getHeader();
937 DebugLoc DL = TheLoop->getStartLoc();
938
939 if (I) {
940 CodeRegion = I->getParent();
941 // If there is no debug location attached to the instruction, revert back to
942 // using the loop's.
943 if (I->getDebugLoc())
944 DL = I->getDebugLoc();
945 }
946
947 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
948}
949
950namespace llvm {
951
952/// Return a value for Step multiplied by VF.
953Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
954 int64_t Step) {
955 assert(Ty->isIntegerTy() && "Expected an integer step")(static_cast <bool> (Ty->isIntegerTy() && "Expected an integer step"
) ? void (0) : __assert_fail ("Ty->isIntegerTy() && \"Expected an integer step\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 955, __extension__
__PRETTY_FUNCTION__))
;
956 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
957 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
958}
959
960/// Return the runtime value for VF.
961Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
962 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
963 return VF.isScalable() ? B.CreateVScale(EC) : EC;
964}
965
966const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
967 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
968 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count")(static_cast <bool> (!isa<SCEVCouldNotCompute>(BackedgeTakenCount
) && "Invalid loop count") ? void (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && \"Invalid loop count\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 968, __extension__
__PRETTY_FUNCTION__))
;
969
970 ScalarEvolution &SE = *PSE.getSE();
971
972 // The exit count might have the type of i64 while the phi is i32. This can
973 // happen if we have an induction variable that is sign extended before the
974 // compare. The only way that we get a backedge taken count is that the
975 // induction variable was signed and as such will not overflow. In such a case
976 // truncation is legal.
977 if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
978 IdxTy->getPrimitiveSizeInBits())
979 BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
980 BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
981
982 // Get the total trip count from the count by adding 1.
983 return SE.getAddExpr(BackedgeTakenCount,
984 SE.getOne(BackedgeTakenCount->getType()));
985}
986
987static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988 ElementCount VF) {
989 assert(FTy->isFloatingPointTy() && "Expected floating point type!")(static_cast <bool> (FTy->isFloatingPointTy() &&
"Expected floating point type!") ? void (0) : __assert_fail (
"FTy->isFloatingPointTy() && \"Expected floating point type!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 989, __extension__
__PRETTY_FUNCTION__))
;
990 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992 return B.CreateUIToFP(RuntimeVF, FTy);
993}
994
995void reportVectorizationFailure(const StringRef DebugMsg,
996 const StringRef OREMsg, const StringRef ORETag,
997 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998 Instruction *I) {
999 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { debugVectorizationMessage("Not vectorizing: "
, DebugMsg, I); } } while (false)
;
1000 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001 ORE->emit(
1002 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003 << "loop not vectorized: " << OREMsg);
1004}
1005
1006void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008 Instruction *I) {
1009 LLVM_DEBUG(debugVectorizationMessage("", Msg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { debugVectorizationMessage("", Msg, I); }
} while (false)
;
1010 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011 ORE->emit(
1012 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013 << Msg);
1014}
1015
1016} // end namespace llvm
1017
1018#ifndef NDEBUG
1019/// \return string containing a file name and a line # for the given loop.
1020static std::string getDebugLocString(const Loop *L) {
1021 std::string Result;
1022 if (L) {
1023 raw_string_ostream OS(Result);
1024 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025 LoopDbgLoc.print(OS);
1026 else
1027 // Just print the module name.
1028 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029 OS.flush();
1030 }
1031 return Result;
1032}
1033#endif
1034
1035void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036 VPTransformState &State) {
1037
1038 // Collect recipes in the backward slice of `Root` that may generate a poison
1039 // value that is used after vectorization.
1040 SmallPtrSet<VPRecipeBase *, 16> Visited;
1041 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042 SmallVector<VPRecipeBase *, 16> Worklist;
1043 Worklist.push_back(Root);
1044
1045 // Traverse the backward slice of Root through its use-def chain.
1046 while (!Worklist.empty()) {
1047 VPRecipeBase *CurRec = Worklist.back();
1048 Worklist.pop_back();
1049
1050 if (!Visited.insert(CurRec).second)
1051 continue;
1052
1053 // Prune search if we find another recipe generating a widen memory
1054 // instruction. Widen memory instructions involved in address computation
1055 // will lead to gather/scatter instructions, which don't need to be
1056 // handled.
1057 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058 isa<VPInterleaveRecipe>(CurRec) ||
1059 isa<VPScalarIVStepsRecipe>(CurRec) ||
1060 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062 continue;
1063
1064 // This recipe contributes to the address computation of a widen
1065 // load/store. Collect recipe if its underlying instruction has
1066 // poison-generating flags.
1067 Instruction *Instr = CurRec->getUnderlyingInstr();
1068 if (Instr && Instr->hasPoisonGeneratingFlags())
1069 State.MayGeneratePoisonRecipes.insert(CurRec);
1070
1071 // Add new definitions to the worklist.
1072 for (VPValue *operand : CurRec->operands())
1073 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074 Worklist.push_back(OpDef);
1075 }
1076 });
1077
1078 // Traverse all the recipes in the VPlan and collect the poison-generating
1079 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1080 // VPInterleaveRecipe.
1081 auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083 for (VPRecipeBase &Recipe : *VPBB) {
1084 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087 if (AddrDef && WidenRec->isConsecutive() &&
1088 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092 if (AddrDef) {
1093 // Check if any member of the interleave group needs predication.
1094 const InterleaveGroup<Instruction> *InterGroup =
1095 InterleaveRec->getInterleaveGroup();
1096 bool NeedPredication = false;
1097 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098 I < NumMembers; ++I) {
1099 Instruction *Member = InterGroup->getMember(I);
1100 if (Member)
1101 NeedPredication |=
1102 Legal->blockNeedsPredication(Member->getParent());
1103 }
1104
1105 if (NeedPredication)
1106 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107 }
1108 }
1109 }
1110 }
1111}
1112
1113PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114 const RecurrenceDescriptor &RdxDesc) {
1115 auto It = ReductionResumeValues.find(&RdxDesc);
1116 assert(It != ReductionResumeValues.end() &&(static_cast <bool> (It != ReductionResumeValues.end() &&
"Expected to find a resume value for the reduction.") ? void
(0) : __assert_fail ("It != ReductionResumeValues.end() && \"Expected to find a resume value for the reduction.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1117, __extension__
__PRETTY_FUNCTION__))
1117 "Expected to find a resume value for the reduction.")(static_cast <bool> (It != ReductionResumeValues.end() &&
"Expected to find a resume value for the reduction.") ? void
(0) : __assert_fail ("It != ReductionResumeValues.end() && \"Expected to find a resume value for the reduction.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1117, __extension__
__PRETTY_FUNCTION__))
;
1118 return It->second;
1119}
1120
1121namespace llvm {
1122
1123// Loop vectorization cost-model hints how the scalar epilogue loop should be
1124// lowered.
1125enum ScalarEpilogueLowering {
1126
1127 // The default: allowing scalar epilogues.
1128 CM_ScalarEpilogueAllowed,
1129
1130 // Vectorization with OptForSize: don't allow epilogues.
1131 CM_ScalarEpilogueNotAllowedOptSize,
1132
1133 // A special case of vectorisation with OptForSize: loops with a very small
1134 // trip count are considered for vectorization under OptForSize, thereby
1135 // making sure the cost of their loop body is dominant, free of runtime
1136 // guards and scalar iteration overheads.
1137 CM_ScalarEpilogueNotAllowedLowTripLoop,
1138
1139 // Loop hint predicate indicating an epilogue is undesired.
1140 CM_ScalarEpilogueNotNeededUsePredicate,
1141
1142 // Directive indicating we must either tail fold or not vectorize
1143 CM_ScalarEpilogueNotAllowedUsePredicate
1144};
1145
1146/// ElementCountComparator creates a total ordering for ElementCount
1147/// for the purposes of using it in a set structure.
1148struct ElementCountComparator {
1149 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152 }
1153};
1154using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1155
1156/// LoopVectorizationCostModel - estimates the expected speedups due to
1157/// vectorization.
1158/// In many cases vectorization is not profitable. This can happen because of
1159/// a number of reasons. In this class we mainly attempt to predict the
1160/// expected speedup/slowdowns due to the supported instruction set. We use the
1161/// TargetTransformInfo to query the different backends for the cost of
1162/// different operations.
1163class LoopVectorizationCostModel {
1164public:
1165 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167 LoopVectorizationLegality *Legal,
1168 const TargetTransformInfo &TTI,
1169 const TargetLibraryInfo *TLI, DemandedBits *DB,
1170 AssumptionCache *AC,
1171 OptimizationRemarkEmitter *ORE, const Function *F,
1172 const LoopVectorizeHints *Hints,
1173 InterleavedAccessInfo &IAI)
1174 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176 Hints(Hints), InterleaveInfo(IAI) {}
1177
1178 /// \return An upper bound for the vectorization factors (both fixed and
1179 /// scalable). If the factors are 0, vectorization and interleaving should be
1180 /// avoided up front.
1181 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182
1183 /// \return True if runtime checks are required for vectorization, and false
1184 /// otherwise.
1185 bool runtimeChecksRequired();
1186
1187 /// \return The most profitable vectorization factor and the cost of that VF.
1188 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1189 /// then this vectorization factor will be selected if vectorization is
1190 /// possible.
1191 VectorizationFactor
1192 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193
1194 VectorizationFactor
1195 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196 const LoopVectorizationPlanner &LVP);
1197
1198 /// Setup cost-based decisions for user vectorization factor.
1199 /// \return true if the UserVF is a feasible VF to be chosen.
1200 bool selectUserVectorizationFactor(ElementCount UserVF) {
1201 collectUniformsAndScalars(UserVF);
1202 collectInstsToScalarize(UserVF);
1203 return expectedCost(UserVF).first.isValid();
1204 }
1205
1206 /// \return The size (in bits) of the smallest and widest types in the code
1207 /// that needs to be vectorized. We ignore values that remain scalar such as
1208 /// 64 bit loop indices.
1209 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210
1211 /// \return The desired interleave count.
1212 /// If interleave count has been specified by metadata it will be returned.
1213 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214 /// are the selected vectorization factor and the cost of the selected VF.
1215 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216
1217 /// Memory access instruction may be vectorized in more than one way.
1218 /// Form of instruction after vectorization depends on cost.
1219 /// This function takes cost-based decisions for Load/Store instructions
1220 /// and collects them in a map. This decisions map is used for building
1221 /// the lists of loop-uniform and loop-scalar instructions.
1222 /// The calculated cost is saved with widening decision in order to
1223 /// avoid redundant calculations.
1224 void setCostBasedWideningDecision(ElementCount VF);
1225
1226 /// A struct that represents some properties of the register usage
1227 /// of a loop.
1228 struct RegisterUsage {
1229 /// Holds the number of loop invariant values that are used in the loop.
1230 /// The key is ClassID of target-provided register class.
1231 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232 /// Holds the maximum number of concurrent live intervals in the loop.
1233 /// The key is ClassID of target-provided register class.
1234 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235 };
1236
1237 /// \return Returns information about the register usages of the loop for the
1238 /// given vectorization factors.
1239 SmallVector<RegisterUsage, 8>
1240 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241
1242 /// Collect values we want to ignore in the cost model.
1243 void collectValuesToIgnore();
1244
1245 /// Collect all element types in the loop for which widening is needed.
1246 void collectElementTypesForWidening();
1247
1248 /// Split reductions into those that happen in the loop, and those that happen
1249 /// outside. In loop reductions are collected into InLoopReductionChains.
1250 void collectInLoopReductions();
1251
1252 /// Returns true if we should use strict in-order reductions for the given
1253 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255 /// of FP operations.
1256 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257 return !Hints->allowReordering() && RdxDesc.isOrdered();
1258 }
1259
1260 /// \returns The smallest bitwidth each instruction can be represented with.
1261 /// The vector equivalents of these instructions should be truncated to this
1262 /// type.
1263 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264 return MinBWs;
1265 }
1266
1267 /// \returns True if it is more profitable to scalarize instruction \p I for
1268 /// vectorization factor \p VF.
1269 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270 assert(VF.isVector() &&(static_cast <bool> (VF.isVector() && "Profitable to scalarize relevant only for VF > 1."
) ? void (0) : __assert_fail ("VF.isVector() && \"Profitable to scalarize relevant only for VF > 1.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1271, __extension__
__PRETTY_FUNCTION__))
1271 "Profitable to scalarize relevant only for VF > 1.")(static_cast <bool> (VF.isVector() && "Profitable to scalarize relevant only for VF > 1."
) ? void (0) : __assert_fail ("VF.isVector() && \"Profitable to scalarize relevant only for VF > 1.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1271, __extension__
__PRETTY_FUNCTION__))
;
1272
1273 // Cost model is not run in the VPlan-native path - return conservative
1274 // result until this changes.
1275 if (EnableVPlanNativePath)
1276 return false;
1277
1278 auto Scalars = InstsToScalarize.find(VF);
1279 assert(Scalars != InstsToScalarize.end() &&(static_cast <bool> (Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability") ? void
(0) : __assert_fail ("Scalars != InstsToScalarize.end() && \"VF not yet analyzed for scalarization profitability\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1280, __extension__
__PRETTY_FUNCTION__))
1280 "VF not yet analyzed for scalarization profitability")(static_cast <bool> (Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability") ? void
(0) : __assert_fail ("Scalars != InstsToScalarize.end() && \"VF not yet analyzed for scalarization profitability\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1280, __extension__
__PRETTY_FUNCTION__))
;
1281 return Scalars->second.find(I) != Scalars->second.end();
1282 }
1283
1284 /// Returns true if \p I is known to be uniform after vectorization.
1285 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286 if (VF.isScalar())
1287 return true;
1288
1289 // Cost model is not run in the VPlan-native path - return conservative
1290 // result until this changes.
1291 if (EnableVPlanNativePath)
1292 return false;
1293
1294 auto UniformsPerVF = Uniforms.find(VF);
1295 assert(UniformsPerVF != Uniforms.end() &&(static_cast <bool> (UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity") ? void (0) : __assert_fail
("UniformsPerVF != Uniforms.end() && \"VF not yet analyzed for uniformity\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1296, __extension__
__PRETTY_FUNCTION__))
1296 "VF not yet analyzed for uniformity")(static_cast <bool> (UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity") ? void (0) : __assert_fail
("UniformsPerVF != Uniforms.end() && \"VF not yet analyzed for uniformity\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1296, __extension__
__PRETTY_FUNCTION__))
;
1297 return UniformsPerVF->second.count(I);
1298 }
1299
1300 /// Returns true if \p I is known to be scalar after vectorization.
1301 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302 if (VF.isScalar())
1303 return true;
1304
1305 // Cost model is not run in the VPlan-native path - return conservative
1306 // result until this changes.
1307 if (EnableVPlanNativePath)
1308 return false;
1309
1310 auto ScalarsPerVF = Scalars.find(VF);
1311 assert(ScalarsPerVF != Scalars.end() &&(static_cast <bool> (ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF") ? void (0) : __assert_fail
("ScalarsPerVF != Scalars.end() && \"Scalar values are not calculated for VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1312, __extension__
__PRETTY_FUNCTION__))
1312 "Scalar values are not calculated for VF")(static_cast <bool> (ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF") ? void (0) : __assert_fail
("ScalarsPerVF != Scalars.end() && \"Scalar values are not calculated for VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1312, __extension__
__PRETTY_FUNCTION__))
;
1313 return ScalarsPerVF->second.count(I);
1314 }
1315
1316 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317 /// for vectorization factor \p VF.
1318 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320 !isProfitableToScalarize(I, VF) &&
1321 !isScalarAfterVectorization(I, VF);
1322 }
1323
1324 /// Decision that was taken during cost calculation for memory instruction.
1325 enum InstWidening {
1326 CM_Unknown,
1327 CM_Widen, // For consecutive accesses with stride +1.
1328 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329 CM_Interleave,
1330 CM_GatherScatter,
1331 CM_Scalarize
1332 };
1333
1334 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335 /// instruction \p I and vector width \p VF.
1336 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337 InstructionCost Cost) {
1338 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1338, __extension__
__PRETTY_FUNCTION__))
;
1339 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340 }
1341
1342 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343 /// interleaving group \p Grp and vector width \p VF.
1344 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345 ElementCount VF, InstWidening W,
1346 InstructionCost Cost) {
1347 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1347, __extension__
__PRETTY_FUNCTION__))
;
1348 /// Broadcast this decicion to all instructions inside the group.
1349 /// But the cost will be assigned to one instruction only.
1350 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351 if (auto *I = Grp->getMember(i)) {
1352 if (Grp->getInsertPos() == I)
1353 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354 else
1355 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356 }
1357 }
1358 }
1359
1360 /// Return the cost model decision for the given instruction \p I and vector
1361 /// width \p VF. Return CM_Unknown if this instruction did not pass
1362 /// through the cost modeling.
1363 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364 assert(VF.isVector() && "Expected VF to be a vector VF")(static_cast <bool> (VF.isVector() && "Expected VF to be a vector VF"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF to be a vector VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1364, __extension__
__PRETTY_FUNCTION__))
;
1365 // Cost model is not run in the VPlan-native path - return conservative
1366 // result until this changes.
1367 if (EnableVPlanNativePath)
1368 return CM_GatherScatter;
1369
1370 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371 auto Itr = WideningDecisions.find(InstOnVF);
1372 if (Itr == WideningDecisions.end())
1373 return CM_Unknown;
1374 return Itr->second.first;
1375 }
1376
1377 /// Return the vectorization cost for the given instruction \p I and vector
1378 /// width \p VF.
1379 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1380, __extension__
__PRETTY_FUNCTION__))
;
1381 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&(static_cast <bool> (WideningDecisions.find(InstOnVF) !=
WideningDecisions.end() && "The cost is not calculated"
) ? void (0) : __assert_fail ("WideningDecisions.find(InstOnVF) != WideningDecisions.end() && \"The cost is not calculated\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1383, __extension__
__PRETTY_FUNCTION__))
1383 "The cost is not calculated")(static_cast <bool> (WideningDecisions.find(InstOnVF) !=
WideningDecisions.end() && "The cost is not calculated"
) ? void (0) : __assert_fail ("WideningDecisions.find(InstOnVF) != WideningDecisions.end() && \"The cost is not calculated\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1383, __extension__
__PRETTY_FUNCTION__))
;
1384 return WideningDecisions[InstOnVF].second;
1385 }
1386
1387 /// Return True if instruction \p I is an optimizable truncate whose operand
1388 /// is an induction variable. Such a truncate will be removed by adding a new
1389 /// induction variable with the destination type.
1390 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391 // If the instruction is not a truncate, return false.
1392 auto *Trunc = dyn_cast<TruncInst>(I);
1393 if (!Trunc)
1394 return false;
1395
1396 // Get the source and destination types of the truncate.
1397 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399
1400 // If the truncate is free for the given types, return false. Replacing a
1401 // free truncate with an induction variable would add an induction variable
1402 // update instruction to each iteration of the loop. We exclude from this
1403 // check the primary induction variable since it will need an update
1404 // instruction regardless.
1405 Value *Op = Trunc->getOperand(0);
1406 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407 return false;
1408
1409 // If the truncated value is not an induction variable, return false.
1410 return Legal->isInductionPhi(Op);
1411 }
1412
1413 /// Collects the instructions to scalarize for each predicated instruction in
1414 /// the loop.
1415 void collectInstsToScalarize(ElementCount VF);
1416
1417 /// Collect Uniform and Scalar values for the given \p VF.
1418 /// The sets depend on CM decision for Load/Store instructions
1419 /// that may be vectorized as interleave, gather-scatter or scalarized.
1420 void collectUniformsAndScalars(ElementCount VF) {
1421 // Do the analysis once.
1422 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423 return;
1424 setCostBasedWideningDecision(VF);
1425 collectLoopUniforms(VF);
1426 collectLoopScalars(VF);
1427 }
1428
1429 /// Returns true if the target machine supports masked store operation
1430 /// for the given \p DataType and kind of access to \p Ptr.
1431 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432 return Legal->isConsecutivePtr(DataType, Ptr) &&
1433 TTI.isLegalMaskedStore(DataType, Alignment);
1434 }
1435
1436 /// Returns true if the target machine supports masked load operation
1437 /// for the given \p DataType and kind of access to \p Ptr.
1438 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439 return Legal->isConsecutivePtr(DataType, Ptr) &&
1440 TTI.isLegalMaskedLoad(DataType, Alignment);
1441 }
1442
1443 /// Returns true if the target machine can represent \p V as a masked gather
1444 /// or scatter operation.
1445 bool isLegalGatherOrScatter(Value *V,
1446 ElementCount VF = ElementCount::getFixed(1)) {
1447 bool LI = isa<LoadInst>(V);
1448 bool SI = isa<StoreInst>(V);
1449 if (!LI && !SI)
1450 return false;
1451 auto *Ty = getLoadStoreType(V);
1452 Align Align = getLoadStoreAlignment(V);
1453 if (VF.isVector())
1454 Ty = VectorType::get(Ty, VF);
1455 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457 }
1458
1459 /// Returns true if the target machine supports all of the reduction
1460 /// variables found for the given VF.
1461 bool canVectorizeReductions(ElementCount VF) const {
1462 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465 }));
1466 }
1467
1468 /// Given costs for both strategies, return true if the scalar predication
1469 /// lowering should be used for div/rem. This incorporates an override
1470 /// option so it is not simply a cost comparison.
1471 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472 InstructionCost SafeDivisorCost) const {
1473 switch (ForceSafeDivisor) {
1474 case cl::BOU_UNSET:
1475 return ScalarCost < SafeDivisorCost;
1476 case cl::BOU_TRUE:
1477 return false;
1478 case cl::BOU_FALSE:
1479 return true;
1480 };
1481 llvm_unreachable("impossible case value")::llvm::llvm_unreachable_internal("impossible case value", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1481)
;
1482 }
1483
1484 /// Returns true if \p I is an instruction which requires predication and
1485 /// for which our chosen predication strategy is scalarization (i.e. we
1486 /// don't have an alternate strategy such as masking available).
1487 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489
1490 /// Returns true if \p I is an instruction that needs to be predicated
1491 /// at runtime. The result is independent of the predication mechanism.
1492 /// Superset of instructions that return true for isScalarWithPredication.
1493 bool isPredicatedInst(Instruction *I) const;
1494
1495 /// Return the costs for our two available strategies for lowering a
1496 /// div/rem operation which requires speculating at least one lane.
1497 /// First result is for scalarization (will be invalid for scalable
1498 /// vectors); second is for the safe-divisor strategy.
1499 std::pair<InstructionCost, InstructionCost>
1500 getDivRemSpeculationCost(Instruction *I,
1501 ElementCount VF) const;
1502
1503 /// Returns true if \p I is a memory instruction with consecutive memory
1504 /// access that can be widened.
1505 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506
1507 /// Returns true if \p I is a memory instruction in an interleaved-group
1508 /// of memory accesses that can be vectorized with wide vector loads/stores
1509 /// and shuffles.
1510 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511
1512 /// Check if \p Instr belongs to any interleaved access group.
1513 bool isAccessInterleaved(Instruction *Instr) {
1514 return InterleaveInfo.isInterleaved(Instr);
1515 }
1516
1517 /// Get the interleaved access group that \p Instr belongs to.
1518 const InterleaveGroup<Instruction> *
1519 getInterleavedAccessGroup(Instruction *Instr) {
1520 return InterleaveInfo.getInterleaveGroup(Instr);
1521 }
1522
1523 /// Returns true if we're required to use a scalar epilogue for at least
1524 /// the final iteration of the original loop.
1525 bool requiresScalarEpilogue(ElementCount VF) const {
1526 if (!isScalarEpilogueAllowed())
1527 return false;
1528 // If we might exit from anywhere but the latch, must run the exiting
1529 // iteration in scalar form.
1530 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531 return true;
1532 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533 }
1534
1535 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536 /// loop hint annotation.
1537 bool isScalarEpilogueAllowed() const {
1538 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539 }
1540
1541 /// Returns true if all loop blocks should be masked to fold tail loop.
1542 bool foldTailByMasking() const { return FoldTailByMasking; }
1543
1544 /// Returns true if were tail-folding and want to use the active lane mask
1545 /// for vector loop control flow.
1546 bool useActiveLaneMaskForControlFlow() const {
1547 return FoldTailByMasking &&
1548 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549 }
1550
1551 /// Returns true if the instructions in this block requires predication
1552 /// for any reason, e.g. because tail folding now requires a predicate
1553 /// or because the block in the original loop was predicated.
1554 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556 }
1557
1558 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559 /// nodes to the chain of instructions representing the reductions. Uses a
1560 /// MapVector to ensure deterministic iteration order.
1561 using ReductionChainMap =
1562 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563
1564 /// Return the chain of instructions representing an inloop reduction.
1565 const ReductionChainMap &getInLoopReductionChains() const {
1566 return InLoopReductionChains;
1567 }
1568
1569 /// Returns true if the Phi is part of an inloop reduction.
1570 bool isInLoopReduction(PHINode *Phi) const {
1571 return InLoopReductionChains.count(Phi);
1572 }
1573
1574 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575 /// with factor VF. Return the cost of the instruction, including
1576 /// scalarization overhead if it's needed.
1577 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578
1579 /// Estimate cost of a call instruction CI if it were vectorized with factor
1580 /// VF. Return the cost of the instruction, including scalarization overhead
1581 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1582 /// scalarized -
1583 /// i.e. either vector version isn't available, or is too expensive.
1584 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585 bool &NeedToScalarize) const;
1586
1587 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588 /// that of B.
1589 bool isMoreProfitable(const VectorizationFactor &A,
1590 const VectorizationFactor &B) const;
1591
1592 /// Invalidates decisions already taken by the cost model.
1593 void invalidateCostModelingDecisions() {
1594 WideningDecisions.clear();
1595 Uniforms.clear();
1596 Scalars.clear();
1597 }
1598
1599 /// Convenience function that returns the value of vscale_range iff
1600 /// vscale_range.min == vscale_range.max or otherwise returns the value
1601 /// returned by the corresponding TLI method.
1602 std::optional<unsigned> getVScaleForTuning() const;
1603
1604private:
1605 unsigned NumPredStores = 0;
1606
1607 /// \return An upper bound for the vectorization factors for both
1608 /// fixed and scalable vectorization, where the minimum-known number of
1609 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610 /// disabled or unsupported, then the scalable part will be equal to
1611 /// ElementCount::getScalable(0).
1612 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613 ElementCount UserVF,
1614 bool FoldTailByMasking);
1615
1616 /// \return the maximized element count based on the targets vector
1617 /// registers and the loop trip-count, but limited to a maximum safe VF.
1618 /// This is a helper function of computeFeasibleMaxVF.
1619 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620 unsigned SmallestType,
1621 unsigned WidestType,
1622 ElementCount MaxSafeVF,
1623 bool FoldTailByMasking);
1624
1625 /// \return the maximum legal scalable VF, based on the safe max number
1626 /// of elements.
1627 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628
1629 /// The vectorization cost is a combination of the cost itself and a boolean
1630 /// indicating whether any of the contributing operations will actually
1631 /// operate on vector values after type legalization in the backend. If this
1632 /// latter value is false, then all operations will be scalarized (i.e. no
1633 /// vectorization has actually taken place).
1634 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635
1636 /// Returns the expected execution cost. The unit of the cost does
1637 /// not matter because we use the 'cost' units to compare different
1638 /// vector widths. The cost that is returned is *not* normalized by
1639 /// the factor width. If \p Invalid is not nullptr, this function
1640 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641 /// each instruction that has an Invalid cost for the given VF.
1642 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643 VectorizationCostTy
1644 expectedCost(ElementCount VF,
1645 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646
1647 /// Returns the execution time cost of an instruction for a given vector
1648 /// width. Vector width of one means scalar.
1649 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost-computation logic from getInstructionCost which provides
1652 /// the vector type as an output parameter.
1653 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654 Type *&VectorTy);
1655
1656 /// Return the cost of instructions in an inloop reduction pattern, if I is
1657 /// part of that pattern.
1658 std::optional<InstructionCost>
1659 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660 TTI::TargetCostKind CostKind);
1661
1662 /// Calculate vectorization cost of memory instruction \p I.
1663 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664
1665 /// The cost computation for scalarized memory instruction.
1666 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667
1668 /// The cost computation for interleaving group of memory instructions.
1669 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670
1671 /// The cost computation for Gather/Scatter instruction.
1672 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673
1674 /// The cost computation for widening instruction \p I with consecutive
1675 /// memory access.
1676 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677
1678 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1679 /// Load: scalar load + broadcast.
1680 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681 /// element)
1682 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683
1684 /// Estimate the overhead of scalarizing an instruction. This is a
1685 /// convenience wrapper for the type-based getScalarizationOverhead API.
1686 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687 TTI::TargetCostKind CostKind) const;
1688
1689 /// Returns true if an artificially high cost for emulated masked memrefs
1690 /// should be used.
1691 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692
1693 /// Map of scalar integer values to the smallest bitwidth they can be legally
1694 /// represented as. The vector equivalents of these values should be truncated
1695 /// to this type.
1696 MapVector<Instruction *, uint64_t> MinBWs;
1697
1698 /// A type representing the costs for instructions if they were to be
1699 /// scalarized rather than vectorized. The entries are Instruction-Cost
1700 /// pairs.
1701 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702
1703 /// A set containing all BasicBlocks that are known to present after
1704 /// vectorization as a predicated block.
1705 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706 PredicatedBBsAfterVectorization;
1707
1708 /// Records whether it is allowed to have the original scalar loop execute at
1709 /// least once. This may be needed as a fallback loop in case runtime
1710 /// aliasing/dependence checks fail, or to handle the tail/remainder
1711 /// iterations when the trip count is unknown or doesn't divide by the VF,
1712 /// or as a peel-loop to handle gaps in interleave-groups.
1713 /// Under optsize and when the trip count is very small we don't allow any
1714 /// iterations to execute in the scalar loop.
1715 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716
1717 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1718 bool FoldTailByMasking = false;
1719
1720 /// A map holding scalar costs for different vectorization factors. The
1721 /// presence of a cost for an instruction in the mapping indicates that the
1722 /// instruction will be scalarized when vectorizing with the associated
1723 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725
1726 /// Holds the instructions known to be uniform after vectorization.
1727 /// The data is collected per VF.
1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729
1730 /// Holds the instructions known to be scalar after vectorization.
1731 /// The data is collected per VF.
1732 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733
1734 /// Holds the instructions (address computations) that are forced to be
1735 /// scalarized.
1736 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737
1738 /// PHINodes of the reductions that should be expanded in-loop along with
1739 /// their associated chains of reduction operations, in program order from top
1740 /// (PHI) to bottom
1741 ReductionChainMap InLoopReductionChains;
1742
1743 /// A Map of inloop reduction operations and their immediate chain operand.
1744 /// FIXME: This can be removed once reductions can be costed correctly in
1745 /// vplan. This was added to allow quick lookup to the inloop operations,
1746 /// without having to loop through InLoopReductionChains.
1747 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748
1749 /// Returns the expected difference in cost from scalarizing the expression
1750 /// feeding a predicated instruction \p PredInst. The instructions to
1751 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752 /// non-negative return value implies the expression will be scalarized.
1753 /// Currently, only single-use chains are considered for scalarization.
1754 InstructionCost computePredInstDiscount(Instruction *PredInst,
1755 ScalarCostsTy &ScalarCosts,
1756 ElementCount VF);
1757
1758 /// Collect the instructions that are uniform after vectorization. An
1759 /// instruction is uniform if we represent it with a single scalar value in
1760 /// the vectorized loop corresponding to each vector iteration. Examples of
1761 /// uniform instructions include pointer operands of consecutive or
1762 /// interleaved memory accesses. Note that although uniformity implies an
1763 /// instruction will be scalar, the reverse is not true. In general, a
1764 /// scalarized instruction will be represented by VF scalar values in the
1765 /// vectorized loop, each corresponding to an iteration of the original
1766 /// scalar loop.
1767 void collectLoopUniforms(ElementCount VF);
1768
1769 /// Collect the instructions that are scalar after vectorization. An
1770 /// instruction is scalar if it is known to be uniform or will be scalarized
1771 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772 /// to the list if they are used by a load/store instruction that is marked as
1773 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774 /// VF values in the vectorized loop, each corresponding to an iteration of
1775 /// the original scalar loop.
1776 void collectLoopScalars(ElementCount VF);
1777
1778 /// Keeps cost model vectorization decision and cost for instructions.
1779 /// Right now it is used for memory instructions only.
1780 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781 std::pair<InstWidening, InstructionCost>>;
1782
1783 DecisionList WideningDecisions;
1784
1785 /// Returns true if \p V is expected to be vectorized and it needs to be
1786 /// extracted.
1787 bool needsExtract(Value *V, ElementCount VF) const {
1788 Instruction *I = dyn_cast<Instruction>(V);
1789 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790 TheLoop->isLoopInvariant(I))
1791 return false;
1792
1793 // Assume we can vectorize V (and hence we need extraction) if the
1794 // scalars are not computed yet. This can happen, because it is called
1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796 // the scalars are collected. That should be a safe assumption in most
1797 // cases, because we check if the operands have vectorizable types
1798 // beforehand in LoopVectorizationLegality.
1799 return Scalars.find(VF) == Scalars.end() ||
1800 !isScalarAfterVectorization(I, VF);
1801 };
1802
1803 /// Returns a range containing only operands needing to be extracted.
1804 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805 ElementCount VF) const {
1806 return SmallVector<Value *, 4>(make_filter_range(
1807 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808 }
1809
1810 /// Determines if we have the infrastructure to vectorize loop \p L and its
1811 /// epilogue, assuming the main loop is vectorized by \p VF.
1812 bool isCandidateForEpilogueVectorization(const Loop &L,
1813 const ElementCount VF) const;
1814
1815 /// Returns true if epilogue vectorization is considered profitable, and
1816 /// false otherwise.
1817 /// \p VF is the vectorization factor chosen for the original loop.
1818 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819
1820public:
1821 /// The loop that we evaluate.
1822 Loop *TheLoop;
1823
1824 /// Predicated scalar evolution analysis.
1825 PredicatedScalarEvolution &PSE;
1826
1827 /// Loop Info analysis.
1828 LoopInfo *LI;
1829
1830 /// Vectorization legality.
1831 LoopVectorizationLegality *Legal;
1832
1833 /// Vector target information.
1834 const TargetTransformInfo &TTI;
1835
1836 /// Target Library Info.
1837 const TargetLibraryInfo *TLI;
1838
1839 /// Demanded bits analysis.
1840 DemandedBits *DB;
1841
1842 /// Assumption cache.
1843 AssumptionCache *AC;
1844
1845 /// Interface to emit optimization remarks.
1846 OptimizationRemarkEmitter *ORE;
1847
1848 const Function *TheFunction;
1849
1850 /// Loop Vectorize Hint.
1851 const LoopVectorizeHints *Hints;
1852
1853 /// The interleave access information contains groups of interleaved accesses
1854 /// with the same stride and close to each other.
1855 InterleavedAccessInfo &InterleaveInfo;
1856
1857 /// Values to ignore in the cost model.
1858 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859
1860 /// Values to ignore in the cost model when VF > 1.
1861 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862
1863 /// All element types found in the loop.
1864 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865
1866 /// Profitable vector factors.
1867 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868};
1869} // end namespace llvm
1870
1871namespace {
1872/// Helper struct to manage generating runtime checks for vectorization.
1873///
1874/// The runtime checks are created up-front in temporary blocks to allow better
1875/// estimating the cost and un-linked from the existing IR. After deciding to
1876/// vectorize, the checks are moved back. If deciding not to vectorize, the
1877/// temporary blocks are completely removed.
1878class GeneratedRTChecks {
1879 /// Basic block which contains the generated SCEV checks, if any.
1880 BasicBlock *SCEVCheckBlock = nullptr;
1881
1882 /// The value representing the result of the generated SCEV checks. If it is
1883 /// nullptr, either no SCEV checks have been generated or they have been used.
1884 Value *SCEVCheckCond = nullptr;
1885
1886 /// Basic block which contains the generated memory runtime checks, if any.
1887 BasicBlock *MemCheckBlock = nullptr;
1888
1889 /// The value representing the result of the generated memory runtime checks.
1890 /// If it is nullptr, either no memory runtime checks have been generated or
1891 /// they have been used.
1892 Value *MemRuntimeCheckCond = nullptr;
1893
1894 DominatorTree *DT;
1895 LoopInfo *LI;
1896 TargetTransformInfo *TTI;
1897
1898 SCEVExpander SCEVExp;
1899 SCEVExpander MemCheckExp;
1900
1901 bool CostTooHigh = false;
1902
1903public:
1904 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905 TargetTransformInfo *TTI, const DataLayout &DL)
1906 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907 MemCheckExp(SE, DL, "scev.check") {}
1908
1909 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910 /// accurately estimate the cost of the runtime checks. The blocks are
1911 /// un-linked from the IR and is added back during vector code generation. If
1912 /// there is no vector code generation, the check blocks are removed
1913 /// completely.
1914 void Create(Loop *L, const LoopAccessInfo &LAI,
1915 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916
1917 // Hard cutoff to limit compile-time increase in case a very large number of
1918 // runtime checks needs to be generated.
1919 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920 // profile info.
1921 CostTooHigh =
1922 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923 if (CostTooHigh)
1924 return;
1925
1926 BasicBlock *LoopHeader = L->getHeader();
1927 BasicBlock *Preheader = L->getLoopPreheader();
1928
1929 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931 // may be used by SCEVExpander. The blocks will be un-linked from their
1932 // predecessors and removed from LI & DT at the end of the function.
1933 if (!UnionPred.isAlwaysTrue()) {
1934 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935 nullptr, "vector.scevcheck");
1936
1937 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938 &UnionPred, SCEVCheckBlock->getTerminator());
1939 }
1940
1941 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942 if (RtPtrChecking.Need) {
1943 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945 "vector.memcheck");
1946
1947 auto DiffChecks = RtPtrChecking.getDiffChecks();
1948 if (DiffChecks) {
1949 Value *RuntimeVF = nullptr;
1950 MemRuntimeCheckCond = addDiffRuntimeChecks(
1951 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953 if (!RuntimeVF)
1954 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955 return RuntimeVF;
1956 },
1957 IC);
1958 } else {
1959 MemRuntimeCheckCond =
1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961 RtPtrChecking.getChecks(), MemCheckExp);
1962 }
1963 assert(MemRuntimeCheckCond &&(static_cast <bool> (MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? void (0) : __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1965, __extension__
__PRETTY_FUNCTION__))
1964 "no RT checks generated although RtPtrChecking "(static_cast <bool> (MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? void (0) : __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1965, __extension__
__PRETTY_FUNCTION__))
1965 "claimed checks are required")(static_cast <bool> (MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? void (0) : __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 1965, __extension__
__PRETTY_FUNCTION__))
;
1966 }
1967
1968 if (!MemCheckBlock && !SCEVCheckBlock)
1969 return;
1970
1971 // Unhook the temporary block with the checks, update various places
1972 // accordingly.
1973 if (SCEVCheckBlock)
1974 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975 if (MemCheckBlock)
1976 MemCheckBlock->replaceAllUsesWith(Preheader);
1977
1978 if (SCEVCheckBlock) {
1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981 Preheader->getTerminator()->eraseFromParent();
1982 }
1983 if (MemCheckBlock) {
1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986 Preheader->getTerminator()->eraseFromParent();
1987 }
1988
1989 DT->changeImmediateDominator(LoopHeader, Preheader);
1990 if (MemCheckBlock) {
1991 DT->eraseNode(MemCheckBlock);
1992 LI->removeBlock(MemCheckBlock);
1993 }
1994 if (SCEVCheckBlock) {
1995 DT->eraseNode(SCEVCheckBlock);
1996 LI->removeBlock(SCEVCheckBlock);
1997 }
1998 }
1999
2000 InstructionCost getCost() {
2001 if (SCEVCheckBlock || MemCheckBlock)
2002 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Calculating cost of runtime checks:\n"
; } } while (false)
;
2003
2004 if (CostTooHigh) {
2005 InstructionCost Cost;
2006 Cost.setInvalid();
2007 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " number of checks exceeded threshold\n"
; } } while (false)
;
2008 return Cost;
2009 }
2010
2011 InstructionCost RTCheckCost = 0;
2012 if (SCEVCheckBlock)
2013 for (Instruction &I : *SCEVCheckBlock) {
2014 if (SCEVCheckBlock->getTerminator() == &I)
2015 continue;
2016 InstructionCost C =
2017 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " " << C <<
" for " << I << "\n"; } } while (false)
;
2019 RTCheckCost += C;
2020 }
2021 if (MemCheckBlock)
2022 for (Instruction &I : *MemCheckBlock) {
2023 if (MemCheckBlock->getTerminator() == &I)
2024 continue;
2025 InstructionCost C =
2026 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " " << C <<
" for " << I << "\n"; } } while (false)
;
2028 RTCheckCost += C;
2029 }
2030
2031 if (SCEVCheckBlock || MemCheckBlock)
2032 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCostdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Total cost of runtime checks: "
<< RTCheckCost << "\n"; } } while (false)
2033 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Total cost of runtime checks: "
<< RTCheckCost << "\n"; } } while (false)
;
2034
2035 return RTCheckCost;
2036 }
2037
2038 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039 /// unused.
2040 ~GeneratedRTChecks() {
2041 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043 if (!SCEVCheckCond)
2044 SCEVCleaner.markResultUsed();
2045
2046 if (!MemRuntimeCheckCond)
2047 MemCheckCleaner.markResultUsed();
2048
2049 if (MemRuntimeCheckCond) {
2050 auto &SE = *MemCheckExp.getSE();
2051 // Memory runtime check generation creates compares that use expanded
2052 // values. Remove them before running the SCEVExpanderCleaners.
2053 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054 if (MemCheckExp.isInsertedInstruction(&I))
2055 continue;
2056 SE.forgetValue(&I);
2057 I.eraseFromParent();
2058 }
2059 }
2060 MemCheckCleaner.cleanup();
2061 SCEVCleaner.cleanup();
2062
2063 if (SCEVCheckCond)
2064 SCEVCheckBlock->eraseFromParent();
2065 if (MemRuntimeCheckCond)
2066 MemCheckBlock->eraseFromParent();
2067 }
2068
2069 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071 /// depending on the generated condition.
2072 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073 BasicBlock *LoopVectorPreHeader,
2074 BasicBlock *LoopExitBlock) {
2075 if (!SCEVCheckCond)
2076 return nullptr;
2077
2078 Value *Cond = SCEVCheckCond;
2079 // Mark the check as used, to prevent it from being removed during cleanup.
2080 SCEVCheckCond = nullptr;
2081 if (auto *C = dyn_cast<ConstantInt>(Cond))
2082 if (C->isZero())
2083 return nullptr;
2084
2085 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086
2087 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088 // Create new preheader for vector loop.
2089 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091
2092 SCEVCheckBlock->getTerminator()->eraseFromParent();
2093 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095 SCEVCheckBlock);
2096
2097 DT->addNewBlock(SCEVCheckBlock, Pred);
2098 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099
2100 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102 return SCEVCheckBlock;
2103 }
2104
2105 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106 /// the branches to branch to the vector preheader or \p Bypass, depending on
2107 /// the generated condition.
2108 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109 BasicBlock *LoopVectorPreHeader) {
2110 // Check if we generated code that checks in runtime if arrays overlap.
2111 if (!MemRuntimeCheckCond)
2112 return nullptr;
2113
2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116 MemCheckBlock);
2117
2118 DT->addNewBlock(MemCheckBlock, Pred);
2119 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121
2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124
2125 ReplaceInstWithInst(
2126 MemCheckBlock->getTerminator(),
2127 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128 MemCheckBlock->getTerminator()->setDebugLoc(
2129 Pred->getTerminator()->getDebugLoc());
2130
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 MemRuntimeCheckCond = nullptr;
2133 return MemCheckBlock;
2134 }
2135};
2136} // namespace
2137
2138// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139// vectorization. The loop needs to be annotated with #pragma omp simd
2140// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2141// vector length information is not provided, vectorization is not considered
2142// explicit. Interleave hints are not allowed either. These limitations will be
2143// relaxed in the future.
2144// Please, note that we are currently forced to abuse the pragma 'clang
2145// vectorize' semantics. This pragma provides *auto-vectorization hints*
2146// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147// provides *explicit vectorization hints* (LV can bypass legal checks and
2148// assume that vectorization is legal). However, both hints are implemented
2149// using the same metadata (llvm.loop.vectorize, processed by
2150// LoopVectorizeHints). This will be fixed in the future when the native IR
2151// representation for pragma 'omp simd' is introduced.
2152static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153 OptimizationRemarkEmitter *ORE) {
2154 assert(!OuterLp->isInnermost() && "This is not an outer loop")(static_cast <bool> (!OuterLp->isInnermost() &&
"This is not an outer loop") ? void (0) : __assert_fail ("!OuterLp->isInnermost() && \"This is not an outer loop\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2154, __extension__
__PRETTY_FUNCTION__))
;
2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156
2157 // Only outer loops with an explicit vectorization hint are supported.
2158 // Unannotated outer loops are ignored.
2159 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160 return false;
2161
2162 Function *Fn = OuterLp->getHeader()->getParent();
2163 if (!Hints.allowVectorization(Fn, OuterLp,
2164 true /*VectorizeOnlyWhenForced*/)) {
2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false)
;
2166 return false;
2167 }
2168
2169 if (Hints.getInterleave() > 1) {
2170 // TODO: Interleave support is future work.
2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
2172 "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
;
2173 Hints.emitRemarkWithHints();
2174 return false;
2175 }
2176
2177 return true;
2178}
2179
2180static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181 OptimizationRemarkEmitter *ORE,
2182 SmallVectorImpl<Loop *> &V) {
2183 // Collect inner loops and outer loops without irreducible control flow. For
2184 // now, only collect outer loops that have explicit vectorization hints. If we
2185 // are stress testing the VPlan H-CFG construction, we collect the outermost
2186 // loop of every loop nest.
2187 if (L.isInnermost() || VPlanBuildStressTest ||
2188 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189 LoopBlocksRPO RPOT(&L);
2190 RPOT.perform(LI);
2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192 V.push_back(&L);
2193 // TODO: Collect inner loops inside marked outer loops in case
2194 // vectorization fails for the outer loop. Do not invoke
2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196 // already known to be reducible. We can use an inherited attribute for
2197 // that.
2198 return;
2199 }
2200 }
2201 for (Loop *InnerL : L)
2202 collectSupportedLoops(*InnerL, LI, ORE, V);
2203}
2204
2205namespace {
2206
2207/// The LoopVectorize Pass.
2208struct LoopVectorize : public FunctionPass {
2209 /// Pass identification, replacement for typeid
2210 static char ID;
2211
2212 LoopVectorizePass Impl;
2213
2214 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215 bool VectorizeOnlyWhenForced = false)
2216 : FunctionPass(ID),
2217 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219 }
2220
2221 bool runOnFunction(Function &F) override {
2222 if (skipFunction(F))
2223 return false;
2224
2225 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237
2238 return Impl
2239 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240 .MadeAnyChange;
2241 }
2242
2243 void getAnalysisUsage(AnalysisUsage &AU) const override {
2244 AU.addRequired<AssumptionCacheTracker>();
2245 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246 AU.addRequired<DominatorTreeWrapperPass>();
2247 AU.addRequired<LoopInfoWrapperPass>();
2248 AU.addRequired<ScalarEvolutionWrapperPass>();
2249 AU.addRequired<TargetTransformInfoWrapperPass>();
2250 AU.addRequired<LoopAccessLegacyAnalysis>();
2251 AU.addRequired<DemandedBitsWrapperPass>();
2252 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253 AU.addRequired<InjectTLIMappingsLegacy>();
2254
2255 // We currently do not preserve loopinfo/dominator analyses with outer loop
2256 // vectorization. Until this is addressed, mark these analyses as preserved
2257 // only for non-VPlan-native path.
2258 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259 if (!EnableVPlanNativePath) {
2260 AU.addPreserved<LoopInfoWrapperPass>();
2261 AU.addPreserved<DominatorTreeWrapperPass>();
2262 }
2263
2264 AU.addPreserved<BasicAAWrapperPass>();
2265 AU.addPreserved<GlobalsAAWrapperPass>();
2266 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267 }
2268};
2269
2270} // end anonymous namespace
2271
2272//===----------------------------------------------------------------------===//
2273// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274// LoopVectorizationCostModel and LoopVectorizationPlanner.
2275//===----------------------------------------------------------------------===//
2276
2277Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2278 // We need to place the broadcast of invariant variables outside the loop,
2279 // but only if it's proven safe to do so. Else, broadcast will be inside
2280 // vector loop body.
2281 Instruction *Instr = dyn_cast<Instruction>(V);
2282 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283 (!Instr ||
2284 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285 // Place the code for broadcasting invariant variables in the new preheader.
2286 IRBuilder<>::InsertPointGuard Guard(Builder);
2287 if (SafeToHoist)
2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289
2290 // Broadcast the scalar into all locations in the vector.
2291 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2292
2293 return Shuf;
2294}
2295
2296/// This function adds
2297/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2298/// to each vector element of Val. The sequence starts at StartIndex.
2299/// \p Opcode is relevant for FP induction variable.
2300static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301 Instruction::BinaryOps BinOp, ElementCount VF,
2302 IRBuilderBase &Builder) {
2303 assert(VF.isVector() && "only vector VFs are supported")(static_cast <bool> (VF.isVector() && "only vector VFs are supported"
) ? void (0) : __assert_fail ("VF.isVector() && \"only vector VFs are supported\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2303, __extension__
__PRETTY_FUNCTION__))
;
2304
2305 // Create and check the types.
2306 auto *ValVTy = cast<VectorType>(Val->getType());
2307 ElementCount VLen = ValVTy->getElementCount();
2308
2309 Type *STy = Val->getType()->getScalarType();
2310 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&(static_cast <bool> ((STy->isIntegerTy() || STy->
isFloatingPointTy()) && "Induction Step must be an integer or FP"
) ? void (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2311, __extension__
__PRETTY_FUNCTION__))
2311 "Induction Step must be an integer or FP")(static_cast <bool> ((STy->isIntegerTy() || STy->
isFloatingPointTy()) && "Induction Step must be an integer or FP"
) ? void (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2311, __extension__
__PRETTY_FUNCTION__))
;
2312 assert(Step->getType() == STy && "Step has wrong type")(static_cast <bool> (Step->getType() == STy &&
"Step has wrong type") ? void (0) : __assert_fail ("Step->getType() == STy && \"Step has wrong type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2312, __extension__
__PRETTY_FUNCTION__))
;
2313
2314 SmallVector<Constant *, 8> Indices;
2315
2316 // Create a vector of consecutive numbers from zero to VF.
2317 VectorType *InitVecValVTy = ValVTy;
2318 if (STy->isFloatingPointTy()) {
2319 Type *InitVecValSTy =
2320 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322 }
2323 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324
2325 // Splat the StartIdx
2326 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327
2328 if (STy->isIntegerTy()) {
2329 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330 Step = Builder.CreateVectorSplat(VLen, Step);
2331 assert(Step->getType() == Val->getType() && "Invalid step vec")(static_cast <bool> (Step->getType() == Val->getType
() && "Invalid step vec") ? void (0) : __assert_fail (
"Step->getType() == Val->getType() && \"Invalid step vec\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2331, __extension__
__PRETTY_FUNCTION__))
;
2332 // FIXME: The newly created binary instructions should contain nsw/nuw
2333 // flags, which can be found from the original scalar operations.
2334 Step = Builder.CreateMul(InitVec, Step);
2335 return Builder.CreateAdd(Val, Step, "induction");
2336 }
2337
2338 // Floating point induction.
2339 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&(static_cast <bool> ((BinOp == Instruction::FAdd || BinOp
== Instruction::FSub) && "Binary Opcode should be specified for FP induction"
) ? void (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2340, __extension__
__PRETTY_FUNCTION__))
2340 "Binary Opcode should be specified for FP induction")(static_cast <bool> ((BinOp == Instruction::FAdd || BinOp
== Instruction::FSub) && "Binary Opcode should be specified for FP induction"
) ? void (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2340, __extension__
__PRETTY_FUNCTION__))
;
2341 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343
2344 Step = Builder.CreateVectorSplat(VLen, Step);
2345 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347}
2348
2349/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2350/// variable on which to base the steps, \p Step is the size of the step.
2351static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352 const InductionDescriptor &ID, VPValue *Def,
2353 VPTransformState &State) {
2354 IRBuilderBase &Builder = State.Builder;
2355
2356 // Ensure step has the same type as that of scalar IV.
2357 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358 if (ScalarIVTy != Step->getType()) {
2359 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360 // avoid separate truncate here.
2361 assert(Step->getType()->isIntegerTy() &&(static_cast <bool> (Step->getType()->isIntegerTy
() && "Truncation requires an integer step") ? void (
0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2362, __extension__
__PRETTY_FUNCTION__))
2362 "Truncation requires an integer step")(static_cast <bool> (Step->getType()->isIntegerTy
() && "Truncation requires an integer step") ? void (
0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2362, __extension__
__PRETTY_FUNCTION__))
;
2363 Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364 }
2365
2366 // We build scalar steps for both integer and floating-point induction
2367 // variables. Here, we determine the kind of arithmetic we will perform.
2368 Instruction::BinaryOps AddOp;
2369 Instruction::BinaryOps MulOp;
2370 if (ScalarIVTy->isIntegerTy()) {
2371 AddOp = Instruction::Add;
2372 MulOp = Instruction::Mul;
2373 } else {
2374 AddOp = ID.getInductionOpcode();
2375 MulOp = Instruction::FMul;
2376 }
2377
2378 // Determine the number of scalars we need to generate for each unroll
2379 // iteration.
2380 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381 // Compute the scalar steps and save the results in State.
2382 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383 ScalarIVTy->getScalarSizeInBits());
2384 Type *VecIVTy = nullptr;
2385 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386 if (!FirstLaneOnly && State.VF.isScalable()) {
2387 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388 UnitStepVec =
2389 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392 }
2393
2394 unsigned StartPart = 0;
2395 unsigned EndPart = State.UF;
2396 unsigned StartLane = 0;
2397 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398 if (State.Instance) {
2399 StartPart = State.Instance->Part;
2400 EndPart = StartPart + 1;
2401 StartLane = State.Instance->Lane.getKnownLane();
2402 EndLane = StartLane + 1;
2403 }
2404 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406
2407 if (!FirstLaneOnly && State.VF.isScalable()) {
2408 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410 if (ScalarIVTy->isFloatingPointTy())
2411 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414 State.set(Def, Add, Part);
2415 // It's useful to record the lane values too for the known minimum number
2416 // of elements so we do those below. This improves the code quality when
2417 // trying to extract the first element, for example.
2418 }
2419
2420 if (ScalarIVTy->isFloatingPointTy())
2421 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422
2423 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424 Value *StartIdx = Builder.CreateBinOp(
2425 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426 // The step returned by `createStepForVF` is a runtime-evaluated value
2427 // when VF is scalable. Otherwise, it should be folded into a Constant.
2428 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&(static_cast <bool> ((State.VF.isScalable() || isa<Constant
>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? void (0) : __assert_fail ("(State.VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2430, __extension__
__PRETTY_FUNCTION__))
2429 "Expected StartIdx to be folded to a constant when VF is not "(static_cast <bool> ((State.VF.isScalable() || isa<Constant
>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? void (0) : __assert_fail ("(State.VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2430, __extension__
__PRETTY_FUNCTION__))
2430 "scalable")(static_cast <bool> ((State.VF.isScalable() || isa<Constant
>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? void (0) : __assert_fail ("(State.VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2430, __extension__
__PRETTY_FUNCTION__))
;
2431 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433 State.set(Def, Add, VPIteration(Part, Lane));
2434 }
2435 }
2436}
2437
2438// Generate code for the induction step. Note that induction steps are
2439// required to be loop-invariant
2440static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441 Instruction *InsertBefore,
2442 Loop *OrigLoop = nullptr) {
2443 const DataLayout &DL = SE.getDataLayout();
2444 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&(static_cast <bool> ((!OrigLoop || SE.isLoopInvariant(Step
, OrigLoop)) && "Induction step should be loop invariant"
) ? void (0) : __assert_fail ("(!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && \"Induction step should be loop invariant\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2445, __extension__
__PRETTY_FUNCTION__))
2445 "Induction step should be loop invariant")(static_cast <bool> ((!OrigLoop || SE.isLoopInvariant(Step
, OrigLoop)) && "Induction step should be loop invariant"
) ? void (0) : __assert_fail ("(!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && \"Induction step should be loop invariant\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2445, __extension__
__PRETTY_FUNCTION__))
;
2446 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447 return E->getValue();
2448
2449 SCEVExpander Exp(SE, DL, "induction");
2450 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451}
2452
2453/// Compute the transformed value of Index at offset StartValue using step
2454/// StepValue.
2455/// For integer induction, returns StartValue + Index * StepValue.
2456/// For pointer induction, returns StartValue[Index * StepValue].
2457/// FIXME: The newly created binary instructions should contain nsw/nuw
2458/// flags, which can be found from the original scalar operations.
2459static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460 Value *StartValue, Value *Step,
2461 const InductionDescriptor &ID) {
2462 Type *StepTy = Step->getType();
2463 Value *CastedIndex = StepTy->isIntegerTy()
2464 ? B.CreateSExtOrTrunc(Index, StepTy)
2465 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466 if (CastedIndex != Index) {
2467 CastedIndex->setName(CastedIndex->getName() + ".cast");
2468 Index = CastedIndex;
2469 }
2470
2471 // Note: the IR at this point is broken. We cannot use SE to create any new
2472 // SCEV and then expand it, hoping that SCEV's simplification will give us
2473 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2474 // lead to various SCEV crashes. So all we can do is to use builder and rely
2475 // on InstCombine for future simplifications. Here we handle some trivial
2476 // cases only.
2477 auto CreateAdd = [&B](Value *X, Value *Y) {
2478 assert(X->getType() == Y->getType() && "Types don't match!")(static_cast <bool> (X->getType() == Y->getType()
&& "Types don't match!") ? void (0) : __assert_fail (
"X->getType() == Y->getType() && \"Types don't match!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2478, __extension__
__PRETTY_FUNCTION__))
;
2479 if (auto *CX = dyn_cast<ConstantInt>(X))
2480 if (CX->isZero())
2481 return Y;
2482 if (auto *CY = dyn_cast<ConstantInt>(Y))
2483 if (CY->isZero())
2484 return X;
2485 return B.CreateAdd(X, Y);
2486 };
2487
2488 // We allow X to be a vector type, in which case Y will potentially be
2489 // splatted into a vector with the same element count.
2490 auto CreateMul = [&B](Value *X, Value *Y) {
2491 assert(X->getType()->getScalarType() == Y->getType() &&(static_cast <bool> (X->getType()->getScalarType(
) == Y->getType() && "Types don't match!") ? void (
0) : __assert_fail ("X->getType()->getScalarType() == Y->getType() && \"Types don't match!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2492, __extension__
__PRETTY_FUNCTION__))
2492 "Types don't match!")(static_cast <bool> (X->getType()->getScalarType(
) == Y->getType() && "Types don't match!") ? void (
0) : __assert_fail ("X->getType()->getScalarType() == Y->getType() && \"Types don't match!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2492, __extension__
__PRETTY_FUNCTION__))
;
2493 if (auto *CX = dyn_cast<ConstantInt>(X))
2494 if (CX->isOne())
2495 return Y;
2496 if (auto *CY = dyn_cast<ConstantInt>(Y))
2497 if (CY->isOne())
2498 return X;
2499 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500 if (XVTy && !isa<VectorType>(Y->getType()))
2501 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502 return B.CreateMul(X, Y);
2503 };
2504
2505 switch (ID.getKind()) {
2506 case InductionDescriptor::IK_IntInduction: {
2507 assert(!isa<VectorType>(Index->getType()) &&(static_cast <bool> (!isa<VectorType>(Index->getType
()) && "Vector indices not supported for integer inductions yet"
) ? void (0) : __assert_fail ("!isa<VectorType>(Index->getType()) && \"Vector indices not supported for integer inductions yet\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2508, __extension__
__PRETTY_FUNCTION__))
2508 "Vector indices not supported for integer inductions yet")(static_cast <bool> (!isa<VectorType>(Index->getType
()) && "Vector indices not supported for integer inductions yet"
) ? void (0) : __assert_fail ("!isa<VectorType>(Index->getType()) && \"Vector indices not supported for integer inductions yet\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2508, __extension__
__PRETTY_FUNCTION__))
;
2509 assert(Index->getType() == StartValue->getType() &&(static_cast <bool> (Index->getType() == StartValue->
getType() && "Index type does not match StartValue type"
) ? void (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2510, __extension__
__PRETTY_FUNCTION__))
2510 "Index type does not match StartValue type")(static_cast <bool> (Index->getType() == StartValue->
getType() && "Index type does not match StartValue type"
) ? void (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2510, __extension__
__PRETTY_FUNCTION__))
;
2511 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512 return B.CreateSub(StartValue, Index);
2513 auto *Offset = CreateMul(Index, Step);
2514 return CreateAdd(StartValue, Offset);
2515 }
2516 case InductionDescriptor::IK_PtrInduction: {
2517 assert(isa<Constant>(Step) &&(static_cast <bool> (isa<Constant>(Step) &&
"Expected constant step for pointer induction") ? void (0) :
__assert_fail ("isa<Constant>(Step) && \"Expected constant step for pointer induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2518, __extension__
__PRETTY_FUNCTION__))
2518 "Expected constant step for pointer induction")(static_cast <bool> (isa<Constant>(Step) &&
"Expected constant step for pointer induction") ? void (0) :
__assert_fail ("isa<Constant>(Step) && \"Expected constant step for pointer induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2518, __extension__
__PRETTY_FUNCTION__))
;
2519 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520 }
2521 case InductionDescriptor::IK_FpInduction: {
2522 assert(!isa<VectorType>(Index->getType()) &&(static_cast <bool> (!isa<VectorType>(Index->getType
()) && "Vector indices not supported for FP inductions yet"
) ? void (0) : __assert_fail ("!isa<VectorType>(Index->getType()) && \"Vector indices not supported for FP inductions yet\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2523, __extension__
__PRETTY_FUNCTION__))
2523 "Vector indices not supported for FP inductions yet")(static_cast <bool> (!isa<VectorType>(Index->getType
()) && "Vector indices not supported for FP inductions yet"
) ? void (0) : __assert_fail ("!isa<VectorType>(Index->getType()) && \"Vector indices not supported for FP inductions yet\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2523, __extension__
__PRETTY_FUNCTION__))
;
2524 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value")(static_cast <bool> (Step->getType()->isFloatingPointTy
() && "Expected FP Step value") ? void (0) : __assert_fail
("Step->getType()->isFloatingPointTy() && \"Expected FP Step value\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2524, __extension__
__PRETTY_FUNCTION__))
;
2525 auto InductionBinOp = ID.getInductionBinOp();
2526 assert(InductionBinOp &&(static_cast <bool> (InductionBinOp && (InductionBinOp
->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode
() == Instruction::FSub) && "Original bin op should be defined for FP induction"
) ? void (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2529, __extension__
__PRETTY_FUNCTION__))
2527 (InductionBinOp->getOpcode() == Instruction::FAdd ||(static_cast <bool> (InductionBinOp && (InductionBinOp
->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode
() == Instruction::FSub) && "Original bin op should be defined for FP induction"
) ? void (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2529, __extension__
__PRETTY_FUNCTION__))
2528 InductionBinOp->getOpcode() == Instruction::FSub) &&(static_cast <bool> (InductionBinOp && (InductionBinOp
->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode
() == Instruction::FSub) && "Original bin op should be defined for FP induction"
) ? void (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2529, __extension__
__PRETTY_FUNCTION__))
2529 "Original bin op should be defined for FP induction")(static_cast <bool> (InductionBinOp && (InductionBinOp
->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode
() == Instruction::FSub) && "Original bin op should be defined for FP induction"
) ? void (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2529, __extension__
__PRETTY_FUNCTION__))
;
2530
2531 Value *MulExp = B.CreateFMul(Step, Index);
2532 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533 "induction");
2534 }
2535 case InductionDescriptor::IK_NoInduction:
2536 return nullptr;
2537 }
2538 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2538)
;
2539}
2540
2541void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542 const VPIteration &Instance,
2543 VPTransformState &State) {
2544 Value *ScalarInst = State.get(Def, Instance);
2545 Value *VectorValue = State.get(Def, Instance.Part);
2546 VectorValue = Builder.CreateInsertElement(
2547 VectorValue, ScalarInst,
2548 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549 State.set(Def, VectorValue, Instance.Part);
2550}
2551
2552// Return whether we allow using masked interleave-groups (for dealing with
2553// strided loads/stores that reside in predicated blocks, or for dealing
2554// with gaps).
2555static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556 // If an override option has been passed in for interleaved accesses, use it.
2557 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558 return EnableMaskedInterleavedMemAccesses;
2559
2560 return TTI.enableMaskedInterleavedAccessVectorization();
2561}
2562
2563// Try to vectorize the interleave group that \p Instr belongs to.
2564//
2565// E.g. Translate following interleaved load group (factor = 3):
2566// for (i = 0; i < N; i+=3) {
2567// R = Pic[i]; // Member of index 0
2568// G = Pic[i+1]; // Member of index 1
2569// B = Pic[i+2]; // Member of index 2
2570// ... // do something to R, G, B
2571// }
2572// To:
2573// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2574// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2575// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2576// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2577//
2578// Or translate following interleaved store group (factor = 3):
2579// for (i = 0; i < N; i+=3) {
2580// ... do something to R, G, B
2581// Pic[i] = R; // Member of index 0
2582// Pic[i+1] = G; // Member of index 1
2583// Pic[i+2] = B; // Member of index 2
2584// }
2585// To:
2586// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2590// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2591void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594 VPValue *BlockInMask) {
2595 Instruction *Instr = Group->getInsertPos();
2596 const DataLayout &DL = Instr->getModule()->getDataLayout();
2597
2598 // Prepare for the vector type of the interleaved load/store.
2599 Type *ScalarTy = getLoadStoreType(Instr);
2600 unsigned InterleaveFactor = Group->getFactor();
2601 assert(!VF.isScalable() && "scalable vectors not yet supported.")(static_cast <bool> (!VF.isScalable() && "scalable vectors not yet supported."
) ? void (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2601, __extension__
__PRETTY_FUNCTION__))
;
2602 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603
2604 // Prepare for the new pointers.
2605 SmallVector<Value *, 2> AddrParts;
2606 unsigned Index = Group->getIndex(Instr);
2607
2608 // TODO: extend the masked interleaved-group support to reversed access.
2609 assert((!BlockInMask || !Group->isReverse()) &&(static_cast <bool> ((!BlockInMask || !Group->isReverse
()) && "Reversed masked interleave-group not supported."
) ? void (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2610, __extension__
__PRETTY_FUNCTION__))
2610 "Reversed masked interleave-group not supported.")(static_cast <bool> ((!BlockInMask || !Group->isReverse
()) && "Reversed masked interleave-group not supported."
) ? void (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2610, __extension__
__PRETTY_FUNCTION__))
;
2611
2612 // If the group is reverse, adjust the index to refer to the last vector lane
2613 // instead of the first. We adjust the index from the first vector lane,
2614 // rather than directly getting the pointer for lane VF - 1, because the
2615 // pointer operand of the interleaved access is supposed to be uniform. For
2616 // uniform instructions, we're only required to generate a value for the
2617 // first vector lane in each unroll iteration.
2618 if (Group->isReverse())
2619 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623 State.setDebugLocFromInst(AddrPart);
2624
2625 // Notice current instruction could be any index. Need to adjust the address
2626 // to the member of index 0.
2627 //
2628 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2629 // b = A[i]; // Member of index 0
2630 // Current pointer is pointed to A[i+1], adjust it to A[i].
2631 //
2632 // E.g. A[i+1] = a; // Member of index 1
2633 // A[i] = b; // Member of index 0
2634 // A[i+2] = c; // Member of index 2 (Current instruction)
2635 // Current pointer is pointed to A[i+2], adjust it to A[i].
2636
2637 bool InBounds = false;
2638 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639 InBounds = gep->isInBounds();
2640 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642
2643 // Cast to the vector pointer type.
2644 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647 }
2648
2649 State.setDebugLocFromInst(Instr);
2650 Value *PoisonVec = PoisonValue::get(VecTy);
2651
2652 Value *MaskForGaps = nullptr;
2653 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655 assert(MaskForGaps && "Mask for Gaps is required but it is null")(static_cast <bool> (MaskForGaps && "Mask for Gaps is required but it is null"
) ? void (0) : __assert_fail ("MaskForGaps && \"Mask for Gaps is required but it is null\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2655, __extension__
__PRETTY_FUNCTION__))
;
2656 }
2657
2658 // Vectorize the interleaved load group.
2659 if (isa<LoadInst>(Instr)) {
2660 // For each unroll part, create a wide load for the group.
2661 SmallVector<Value *, 2> NewLoads;
2662 for (unsigned Part = 0; Part < UF; Part++) {
2663 Instruction *NewLoad;
2664 if (BlockInMask || MaskForGaps) {
2665 assert(useMaskedInterleavedAccesses(*TTI) &&(static_cast <bool> (useMaskedInterleavedAccesses(*TTI)
&& "masked interleaved groups are not allowed.") ? void
(0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2666, __extension__
__PRETTY_FUNCTION__))
2666 "masked interleaved groups are not allowed.")(static_cast <bool> (useMaskedInterleavedAccesses(*TTI)
&& "masked interleaved groups are not allowed.") ? void
(0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2666, __extension__
__PRETTY_FUNCTION__))
;
2667 Value *GroupMask = MaskForGaps;
2668 if (BlockInMask) {
2669 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2670 Value *ShuffledMask = Builder.CreateShuffleVector(
2671 BlockInMaskPart,
2672 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673 "interleaved.mask");
2674 GroupMask = MaskForGaps
2675 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676 MaskForGaps)
2677 : ShuffledMask;
2678 }
2679 NewLoad =
2680 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681 GroupMask, PoisonVec, "wide.masked.vec");
2682 }
2683 else
2684 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685 Group->getAlign(), "wide.vec");
2686 Group->addMetadata(NewLoad);
2687 NewLoads.push_back(NewLoad);
2688 }
2689
2690 // For each member in the group, shuffle out the appropriate data from the
2691 // wide loads.
2692 unsigned J = 0;
2693 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694 Instruction *Member = Group->getMember(I);
2695
2696 // Skip the gaps in the group.
2697 if (!Member)
2698 continue;
2699
2700 auto StrideMask =
2701 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702 for (unsigned Part = 0; Part < UF; Part++) {
2703 Value *StridedVec = Builder.CreateShuffleVector(
2704 NewLoads[Part], StrideMask, "strided.vec");
2705
2706 // If this member has different type, cast the result type.
2707 if (Member->getType() != ScalarTy) {
2708 assert(!VF.isScalable() && "VF is assumed to be non scalable.")(static_cast <bool> (!VF.isScalable() && "VF is assumed to be non scalable."
) ? void (0) : __assert_fail ("!VF.isScalable() && \"VF is assumed to be non scalable.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2708, __extension__
__PRETTY_FUNCTION__))
;
2709 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711 }
2712
2713 if (Group->isReverse())
2714 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715
2716 State.set(VPDefs[J], StridedVec, Part);
2717 }
2718 ++J;
2719 }
2720 return;
2721 }
2722
2723 // The sub vector type for current instruction.
2724 auto *SubVT = VectorType::get(ScalarTy, VF);
2725
2726 // Vectorize the interleaved store group.
2727 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&(static_cast <bool> ((!MaskForGaps || useMaskedInterleavedAccesses
(*TTI)) && "masked interleaved groups are not allowed."
) ? void (0) : __assert_fail ("(!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && \"masked interleaved groups are not allowed.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2729, __extension__
__PRETTY_FUNCTION__))
2729 "masked interleaved groups are not allowed.")(static_cast <bool> ((!MaskForGaps || useMaskedInterleavedAccesses
(*TTI)) && "masked interleaved groups are not allowed."
) ? void (0) : __assert_fail ("(!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && \"masked interleaved groups are not allowed.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2729, __extension__
__PRETTY_FUNCTION__))
;
2730 assert((!MaskForGaps || !VF.isScalable()) &&(static_cast <bool> ((!MaskForGaps || !VF.isScalable())
&& "masking gaps for scalable vectors is not yet supported."
) ? void (0) : __assert_fail ("(!MaskForGaps || !VF.isScalable()) && \"masking gaps for scalable vectors is not yet supported.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2731, __extension__
__PRETTY_FUNCTION__))
2731 "masking gaps for scalable vectors is not yet supported.")(static_cast <bool> ((!MaskForGaps || !VF.isScalable())
&& "masking gaps for scalable vectors is not yet supported."
) ? void (0) : __assert_fail ("(!MaskForGaps || !VF.isScalable()) && \"masking gaps for scalable vectors is not yet supported.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2731, __extension__
__PRETTY_FUNCTION__))
;
2732 for (unsigned Part = 0; Part < UF; Part++) {
2733 // Collect the stored vector from each member.
2734 SmallVector<Value *, 4> StoredVecs;
2735 unsigned StoredIdx = 0;
2736 for (unsigned i = 0; i < InterleaveFactor; i++) {
2737 assert((Group->getMember(i) || MaskForGaps) &&(static_cast <bool> ((Group->getMember(i) || MaskForGaps
) && "Fail to get a member from an interleaved store group"
) ? void (0) : __assert_fail ("(Group->getMember(i) || MaskForGaps) && \"Fail to get a member from an interleaved store group\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2738, __extension__
__PRETTY_FUNCTION__))
2738 "Fail to get a member from an interleaved store group")(static_cast <bool> ((Group->getMember(i) || MaskForGaps
) && "Fail to get a member from an interleaved store group"
) ? void (0) : __assert_fail ("(Group->getMember(i) || MaskForGaps) && \"Fail to get a member from an interleaved store group\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2738, __extension__
__PRETTY_FUNCTION__))
;
2739 Instruction *Member = Group->getMember(i);
2740
2741 // Skip the gaps in the group.
2742 if (!Member) {
2743 Value *Undef = PoisonValue::get(SubVT);
2744 StoredVecs.push_back(Undef);
2745 continue;
2746 }
2747
2748 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749 ++StoredIdx;
2750
2751 if (Group->isReverse())
2752 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753
2754 // If this member has different type, cast it to a unified type.
2755
2756 if (StoredVec->getType() != SubVT)
2757 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758
2759 StoredVecs.push_back(StoredVec);
2760 }
2761
2762 // Concatenate all vectors into a wide vector.
2763 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764
2765 // Interleave the elements in the wide vector.
2766 Value *IVec = Builder.CreateShuffleVector(
2767 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768 "interleaved.vec");
2769
2770 Instruction *NewStoreInstr;
2771 if (BlockInMask || MaskForGaps) {
2772 Value *GroupMask = MaskForGaps;
2773 if (BlockInMask) {
2774 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775 Value *ShuffledMask = Builder.CreateShuffleVector(
2776 BlockInMaskPart,
2777 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778 "interleaved.mask");
2779 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780 ShuffledMask, MaskForGaps)
2781 : ShuffledMask;
2782 }
2783 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784 Group->getAlign(), GroupMask);
2785 } else
2786 NewStoreInstr =
2787 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788
2789 Group->addMetadata(NewStoreInstr);
2790 }
2791}
2792
2793void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794 VPReplicateRecipe *RepRecipe,
2795 const VPIteration &Instance,
2796 bool IfPredicateInstr,
2797 VPTransformState &State) {
2798 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")(static_cast <bool> (!Instr->getType()->isAggregateType
() && "Can't handle vectors") ? void (0) : __assert_fail
("!Instr->getType()->isAggregateType() && \"Can't handle vectors\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2798, __extension__
__PRETTY_FUNCTION__))
;
2799
2800 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801 // the first lane and part.
2802 if (isa<NoAliasScopeDeclInst>(Instr))
2803 if (!Instance.isFirstIteration())
2804 return;
2805
2806 // Does this instruction return a value ?
2807 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808
2809 Instruction *Cloned = Instr->clone();
2810 if (!IsVoidRetTy)
2811 Cloned->setName(Instr->getName() + ".cloned");
2812
2813 // If the scalarized instruction contributes to the address computation of a
2814 // widen masked load/store which was in a basic block that needed predication
2815 // and is not predicated after vectorization, we can't propagate
2816 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2817 // instruction could feed a poison value to the base address of the widen
2818 // load/store.
2819 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820 Cloned->dropPoisonGeneratingFlags();
2821
2822 if (Instr->getDebugLoc())
2823 State.setDebugLocFromInst(Instr);
2824
2825 // Replace the operands of the cloned instructions with their scalar
2826 // equivalents in the new loop.
2827 for (const auto &I : enumerate(RepRecipe->operands())) {
2828 auto InputInstance = Instance;
2829 VPValue *Operand = I.value();
2830 if (vputils::isUniformAfterVectorization(Operand))
2831 InputInstance.Lane = VPLane::getFirstLane();
2832 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833 }
2834 State.addNewMetadata(Cloned, Instr);
2835
2836 // Place the cloned scalar in the new loop.
2837 State.Builder.Insert(Cloned);
2838
2839 State.set(RepRecipe, Cloned, Instance);
2840
2841 // If we just cloned a new assumption, add it the assumption cache.
2842 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843 AC->registerAssumption(II);
2844
2845 // End if-block.
2846 if (IfPredicateInstr)
2847 PredicatedInstructions.push_back(Cloned);
2848}
2849
2850Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851 if (TripCount)
2852 return TripCount;
2853
2854 assert(InsertBlock)(static_cast <bool> (InsertBlock) ? void (0) : __assert_fail
("InsertBlock", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2854, __extension__ __PRETTY_FUNCTION__))
;
2855 IRBuilder<> Builder(InsertBlock->getTerminator());
2856 // Find the loop boundaries.
2857 Type *IdxTy = Legal->getWidestInductionType();
2858 assert(IdxTy && "No type for induction")(static_cast <bool> (IdxTy && "No type for induction"
) ? void (0) : __assert_fail ("IdxTy && \"No type for induction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2858, __extension__
__PRETTY_FUNCTION__))
;
2859 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860
2861 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862
2863 // Expand the trip count and place the new instructions in the preheader.
2864 // Notice that the pre-header does not change, only the loop body.
2865 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866
2867 // Count holds the overall loop count (N).
2868 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869 InsertBlock->getTerminator());
2870
2871 if (TripCount->getType()->isPointerTy())
2872 TripCount =
2873 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874 InsertBlock->getTerminator());
2875
2876 return TripCount;
2877}
2878
2879Value *
2880InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881 if (VectorTripCount)
2882 return VectorTripCount;
2883
2884 Value *TC = getOrCreateTripCount(InsertBlock);
2885 IRBuilder<> Builder(InsertBlock->getTerminator());
2886
2887 Type *Ty = TC->getType();
2888 // This is where we can make the step a runtime constant.
2889 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890
2891 // If the tail is to be folded by masking, round the number of iterations N
2892 // up to a multiple of Step instead of rounding down. This is done by first
2893 // adding Step-1 and then rounding down. Note that it's ok if this addition
2894 // overflows: the vector induction variable will eventually wrap to zero given
2895 // that it starts at zero and its Step is a power of two; the loop will then
2896 // exit, with the last early-exit vector comparison also producing all-true.
2897 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898 // is accounted for in emitIterationCountCheck that adds an overflow check.
2899 if (Cost->foldTailByMasking()) {
2900 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&(static_cast <bool> (isPowerOf2_32(VF.getKnownMinValue(
) * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? void (0) : __assert_fail ("isPowerOf2_32(VF.getKnownMinValue() * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2901, __extension__
__PRETTY_FUNCTION__))
2901 "VF*UF must be a power of 2 when folding tail by masking")(static_cast <bool> (isPowerOf2_32(VF.getKnownMinValue(
) * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? void (0) : __assert_fail ("isPowerOf2_32(VF.getKnownMinValue() * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2901, __extension__
__PRETTY_FUNCTION__))
;
2902 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903 TC = Builder.CreateAdd(
2904 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905 }
2906
2907 // Now we need to generate the expression for the part of the loop that the
2908 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909 // iterations are not required for correctness, or N - Step, otherwise. Step
2910 // is equal to the vectorization factor (number of SIMD elements) times the
2911 // unroll factor (number of SIMD instructions).
2912 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913
2914 // There are cases where we *must* run at least one iteration in the remainder
2915 // loop. See the cost model for when this can happen. If the step evenly
2916 // divides the trip count, we set the remainder to be equal to the step. If
2917 // the step does not evenly divide the trip count, no adjustment is necessary
2918 // since there will already be scalar iterations. Note that the minimum
2919 // iterations check ensures that N >= Step.
2920 if (Cost->requiresScalarEpilogue(VF)) {
2921 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922 R = Builder.CreateSelect(IsZero, Step, R);
2923 }
2924
2925 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926
2927 return VectorTripCount;
2928}
2929
2930Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931 const DataLayout &DL) {
2932 // Verify that V is a vector type with same number of elements as DstVTy.
2933 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934 unsigned VF = DstFVTy->getNumElements();
2935 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2936 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match")(static_cast <bool> ((VF == SrcVecTy->getNumElements
()) && "Vector dimensions do not match") ? void (0) :
__assert_fail ("(VF == SrcVecTy->getNumElements()) && \"Vector dimensions do not match\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2936, __extension__
__PRETTY_FUNCTION__))
;
2937 Type *SrcElemTy = SrcVecTy->getElementType();
2938 Type *DstElemTy = DstFVTy->getElementType();
2939 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&(static_cast <bool> ((DL.getTypeSizeInBits(SrcElemTy) ==
DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"
) ? void (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2940, __extension__
__PRETTY_FUNCTION__))
2940 "Vector elements must have same size")(static_cast <bool> ((DL.getTypeSizeInBits(SrcElemTy) ==
DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"
) ? void (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2940, __extension__
__PRETTY_FUNCTION__))
;
2941
2942 // Do a direct cast if element types are castable.
2943 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945 }
2946 // V cannot be directly casted to desired vector type.
2947 // May happen when V is a floating point vector but DstVTy is a vector of
2948 // pointers or vice-versa. Handle this using a two-step bitcast using an
2949 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2950 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&(static_cast <bool> ((DstElemTy->isPointerTy() != SrcElemTy
->isPointerTy()) && "Only one type should be a pointer type"
) ? void (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2951, __extension__
__PRETTY_FUNCTION__))
2951 "Only one type should be a pointer type")(static_cast <bool> ((DstElemTy->isPointerTy() != SrcElemTy
->isPointerTy()) && "Only one type should be a pointer type"
) ? void (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2951, __extension__
__PRETTY_FUNCTION__))
;
2952 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&(static_cast <bool> ((DstElemTy->isFloatingPointTy()
!= SrcElemTy->isFloatingPointTy()) && "Only one type should be a floating point type"
) ? void (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2953, __extension__
__PRETTY_FUNCTION__))
2953 "Only one type should be a floating point type")(static_cast <bool> ((DstElemTy->isFloatingPointTy()
!= SrcElemTy->isFloatingPointTy()) && "Only one type should be a floating point type"
) ? void (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 2953, __extension__
__PRETTY_FUNCTION__))
;
2954 Type *IntTy =
2955 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959}
2960
2961void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963 // Reuse existing vector loop preheader for TC checks.
2964 // Note that new preheader block is generated for vector loop.
2965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967
2968 // Generate code to check if the loop's trip count is less than VF * UF, or
2969 // equal to it in case a scalar epilogue is required; this implies that the
2970 // vector trip count is zero. This check also covers the case where adding one
2971 // to the backedge-taken count overflowed leading to an incorrect trip count
2972 // of zero. In this case we will also jump to the scalar loop.
2973 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974 : ICmpInst::ICMP_ULT;
2975
2976 // If tail is to be folded, vector loop takes care of all iterations.
2977 Type *CountTy = Count->getType();
2978 Value *CheckMinIters = Builder.getFalse();
2979 auto CreateStep = [&]() -> Value * {
2980 // Create step with max(MinProTripCount, UF * VF).
2981 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982 return createStepForVF(Builder, CountTy, VF, UF);
2983
2984 Value *MinProfTC =
2985 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986 if (!VF.isScalable())
2987 return MinProfTC;
2988 return Builder.CreateBinaryIntrinsic(
2989 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990 };
2991
2992 if (!Cost->foldTailByMasking())
2993 CheckMinIters =
2994 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995 else if (VF.isScalable()) {
2996 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997 // an overflow to zero when updating induction variables and so an
2998 // additional overflow check is required before entering the vector loop.
2999
3000 // Get the maximum unsigned value for the type.
3001 Value *MaxUIntTripCount =
3002 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004
3005 // Don't execute the vector loop if (UMax - n) < (VF * UF).
3006 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007 }
3008
3009 // Create new preheader for vector loop.
3010 LoopVectorPreHeader =
3011 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012 "vector.ph");
3013
3014 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),(static_cast <bool> (DT->properlyDominates(DT->getNode
(TCCheckBlock), DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass") ? void (0) : __assert_fail
("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3016, __extension__
__PRETTY_FUNCTION__))
3015 DT->getNode(Bypass)->getIDom()) &&(static_cast <bool> (DT->properlyDominates(DT->getNode
(TCCheckBlock), DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass") ? void (0) : __assert_fail
("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3016, __extension__
__PRETTY_FUNCTION__))
3016 "TC check is expected to dominate Bypass")(static_cast <bool> (DT->properlyDominates(DT->getNode
(TCCheckBlock), DT->getNode(Bypass)->getIDom()) &&
"TC check is expected to dominate Bypass") ? void (0) : __assert_fail
("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3016, __extension__
__PRETTY_FUNCTION__))
;
3017
3018 // Update dominator for Bypass & LoopExit (if needed).
3019 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020 if (!Cost->requiresScalarEpilogue(VF))
3021 // If there is an epilogue which must run, there's no edge from the
3022 // middle block to exit blocks and thus no need to update the immediate
3023 // dominator of the exit blocks.
3024 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025
3026 ReplaceInstWithInst(
3027 TCCheckBlock->getTerminator(),
3028 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029 LoopBypassBlocks.push_back(TCCheckBlock);
3030}
3031
3032BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033 BasicBlock *const SCEVCheckBlock =
3034 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035 if (!SCEVCheckBlock)
3036 return nullptr;
3037
3038 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||(static_cast <bool> (!(SCEVCheckBlock->getParent()->
hasOptSize() || (OptForSizeBasedOnProfile && Cost->
Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size"
) ? void (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3041, __extension__
__PRETTY_FUNCTION__))
3039 (OptForSizeBasedOnProfile &&(static_cast <bool> (!(SCEVCheckBlock->getParent()->
hasOptSize() || (OptForSizeBasedOnProfile && Cost->
Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size"
) ? void (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3041, __extension__
__PRETTY_FUNCTION__))
3040 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&(static_cast <bool> (!(SCEVCheckBlock->getParent()->
hasOptSize() || (OptForSizeBasedOnProfile && Cost->
Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size"
) ? void (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3041, __extension__
__PRETTY_FUNCTION__))
3041 "Cannot SCEV check stride or overflow when optimizing for size")(static_cast <bool> (!(SCEVCheckBlock->getParent()->
hasOptSize() || (OptForSizeBasedOnProfile && Cost->
Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
"Cannot SCEV check stride or overflow when optimizing for size"
) ? void (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3041, __extension__
__PRETTY_FUNCTION__))
;
3042
3043
3044 // Update dominator only if this is first RT check.
3045 if (LoopBypassBlocks.empty()) {
3046 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047 if (!Cost->requiresScalarEpilogue(VF))
3048 // If there is an epilogue which must run, there's no edge from the
3049 // middle block to exit blocks and thus no need to update the immediate
3050 // dominator of the exit blocks.
3051 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052 }
3053
3054 LoopBypassBlocks.push_back(SCEVCheckBlock);
3055 AddedSafetyChecks = true;
3056 return SCEVCheckBlock;
3057}
3058
3059BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060 // VPlan-native path does not do any analysis for runtime checks currently.
3061 if (EnableVPlanNativePath)
3062 return nullptr;
3063
3064 BasicBlock *const MemCheckBlock =
3065 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066
3067 // Check if we generated code that checks in runtime if arrays overlap. We put
3068 // the checks into a separate block to make the more common case of few
3069 // elements faster.
3070 if (!MemCheckBlock)
3071 return nullptr;
3072
3073 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&(static_cast <bool> (Cost->Hints->getForce() == LoopVectorizeHints
::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? void (0) : __assert_fail ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3076, __extension__
__PRETTY_FUNCTION__))
3075 "Cannot emit memory checks when optimizing for size, unless forced "(static_cast <bool> (Cost->Hints->getForce() == LoopVectorizeHints
::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? void (0) : __assert_fail ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3076, __extension__
__PRETTY_FUNCTION__))
3076 "to vectorize.")(static_cast <bool> (Cost->Hints->getForce() == LoopVectorizeHints
::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? void (0) : __assert_fail ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3076, __extension__
__PRETTY_FUNCTION__))
;
3077 ORE->emit([&]() {
3078 return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationCodeSize",
3079 OrigLoop->getStartLoc(),
3080 OrigLoop->getHeader())
3081 << "Code-size may be reduced by not forcing "
3082 "vectorization, or by source-code modifications "
3083 "eliminating the need for runtime checks "
3084 "(e.g., adding 'restrict').";
3085 });
3086 }
3087
3088 LoopBypassBlocks.push_back(MemCheckBlock);
3089
3090 AddedSafetyChecks = true;
3091
3092 return MemCheckBlock;
3093}
3094
3095void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096 LoopScalarBody = OrigLoop->getHeader();
3097 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098 assert(LoopVectorPreHeader && "Invalid loop structure")(static_cast <bool> (LoopVectorPreHeader && "Invalid loop structure"
) ? void (0) : __assert_fail ("LoopVectorPreHeader && \"Invalid loop structure\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3098, __extension__
__PRETTY_FUNCTION__))
;
3099 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&(static_cast <bool> ((LoopExitBlock || Cost->requiresScalarEpilogue
(VF)) && "multiple exit loop without required epilogue?"
) ? void (0) : __assert_fail ("(LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && \"multiple exit loop without required epilogue?\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3101, __extension__
__PRETTY_FUNCTION__))
3101 "multiple exit loop without required epilogue?")(static_cast <bool> ((LoopExitBlock || Cost->requiresScalarEpilogue
(VF)) && "multiple exit loop without required epilogue?"
) ? void (0) : __assert_fail ("(LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && \"multiple exit loop without required epilogue?\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3101, __extension__
__PRETTY_FUNCTION__))
;
3102
3103 LoopMiddleBlock =
3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105 LI, nullptr, Twine(Prefix) + "middle.block");
3106 LoopScalarPreHeader =
3107 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108 nullptr, Twine(Prefix) + "scalar.ph");
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Set up the middle block terminator. Two cases:
3113 // 1) If we know that we must execute the scalar epilogue, emit an
3114 // unconditional branch.
3115 // 2) Otherwise, we must have a single unique exit block (due to how we
3116 // implement the multiple exit case). In this case, set up a conditional
3117 // branch from the middle block to the loop scalar preheader, and the
3118 // exit block. completeLoopSkeleton will update the condition to use an
3119 // iteration check, if required to decide whether to execute the remainder.
3120 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121 BranchInst::Create(LoopScalarPreHeader) :
3122 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123 Builder.getTrue());
3124 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126
3127 // Update dominator for loop exit. During skeleton creation, only the vector
3128 // pre-header and the middle block are created. The vector loop is entirely
3129 // created during VPlan exection.
3130 if (!Cost->requiresScalarEpilogue(VF))
3131 // If there is an epilogue which must run, there's no edge from the
3132 // middle block to exit blocks and thus no need to update the immediate
3133 // dominator of the exit blocks.
3134 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135}
3136
3137PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138 PHINode *OrigPhi, const InductionDescriptor &II,
3139 ArrayRef<BasicBlock *> BypassBlocks,
3140 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142 assert(VectorTripCount && "Expected valid arguments")(static_cast <bool> (VectorTripCount && "Expected valid arguments"
) ? void (0) : __assert_fail ("VectorTripCount && \"Expected valid arguments\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3142, __extension__
__PRETTY_FUNCTION__))
;
3143
3144 Instruction *OldInduction = Legal->getPrimaryInduction();
3145 Value *&EndValue = IVEndValues[OrigPhi];
3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147 if (OrigPhi == OldInduction) {
3148 // We know what the end value is.
3149 EndValue = VectorTripCount;
3150 } else {
3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152
3153 // Fast-math-flags propagate from the original induction instruction.
3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156
3157 Value *Step =
3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159 EndValue =
3160 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161 EndValue->setName("ind.end");
3162
3163 // Compute the end value for the additional bypass (if applicable).
3164 if (AdditionalBypass.first) {
3165 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166 Value *Step =
3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168 EndValueFromAdditionalBypass = emitTransformedIndex(
3169 B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170 EndValueFromAdditionalBypass->setName("ind.end");
3171 }
3172 }
3173
3174 // Create phi nodes to merge from the backedge-taken check block.
3175 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176 LoopScalarPreHeader->getTerminator());
3177 // Copy original phi DL over to the new one.
3178 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179
3180 // The new PHI merges the original incoming value, in case of a bypass,
3181 // or the value at the end of the vectorized loop.
3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183
3184 // Fix the scalar body counter (PHI node).
3185 // The old induction's phi node in the scalar body needs the truncated
3186 // value.
3187 for (BasicBlock *BB : BypassBlocks)
3188 BCResumeVal->addIncoming(II.getStartValue(), BB);
3189
3190 if (AdditionalBypass.first)
3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192 EndValueFromAdditionalBypass);
3193 return BCResumeVal;
3194}
3195
3196void InnerLoopVectorizer::createInductionResumeValues(
3197 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198 assert(((AdditionalBypass.first && AdditionalBypass.second) ||(static_cast <bool> (((AdditionalBypass.first &&
AdditionalBypass.second) || (!AdditionalBypass.first &&
!AdditionalBypass.second)) && "Inconsistent information about additional bypass."
) ? void (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3200, __extension__
__PRETTY_FUNCTION__))
3199 (!AdditionalBypass.first && !AdditionalBypass.second)) &&(static_cast <bool> (((AdditionalBypass.first &&
AdditionalBypass.second) || (!AdditionalBypass.first &&
!AdditionalBypass.second)) && "Inconsistent information about additional bypass."
) ? void (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3200, __extension__
__PRETTY_FUNCTION__))
3200 "Inconsistent information about additional bypass.")(static_cast <bool> (((AdditionalBypass.first &&
AdditionalBypass.second) || (!AdditionalBypass.first &&
!AdditionalBypass.second)) && "Inconsistent information about additional bypass."
) ? void (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3200, __extension__
__PRETTY_FUNCTION__))
;
3201 // We are going to resume the execution of the scalar loop.
3202 // Go over all of the induction variables that we found and fix the
3203 // PHIs that are left in the scalar version of the loop.
3204 // The starting values of PHI nodes depend on the counter of the last
3205 // iteration in the vectorized loop.
3206 // If we come from a bypass edge then we need to start from the original
3207 // start value.
3208 for (const auto &InductionEntry : Legal->getInductionVars()) {
3209 PHINode *OrigPhi = InductionEntry.first;
3210 const InductionDescriptor &II = InductionEntry.second;
3211 PHINode *BCResumeVal = createInductionResumeValue(
3212 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214 }
3215}
3216
3217BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218 // The trip counts should be cached by now.
3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221
3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223
3224 // Add a check in the middle block to see if we have completed
3225 // all of the iterations in the first vector loop. Three cases:
3226 // 1) If we require a scalar epilogue, there is no conditional branch as
3227 // we unconditionally branch to the scalar preheader. Do nothing.
3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229 // Thus if tail is to be folded, we know we don't need to run the
3230 // remainder and we can use the previous value for the condition (true).
3231 // 3) Otherwise, construct a runtime check.
3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234 Count, VectorTripCount, "cmp.n",
3235 LoopMiddleBlock->getTerminator());
3236
3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238 // of the corresponding compare because they may have ended up with
3239 // different line numbers and we want to avoid awkward line stepping while
3240 // debugging. Eg. if the compare has got a line number inside the loop.
3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243 }
3244
3245#ifdef EXPENSIVE_CHECKS
3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast))(static_cast <bool> (DT->verify(DominatorTree::VerificationLevel
::Fast)) ? void (0) : __assert_fail ("DT->verify(DominatorTree::VerificationLevel::Fast)"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3246, __extension__
__PRETTY_FUNCTION__))
;
3247#endif
3248
3249 return LoopVectorPreHeader;
3250}
3251
3252std::pair<BasicBlock *, Value *>
3253InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254 /*
3255 In this function we generate a new loop. The new loop will contain
3256 the vectorized instructions while the old loop will continue to run the
3257 scalar remainder.
3258
3259 [ ] <-- loop iteration number check.
3260 / |
3261 / v
3262 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3263 | / |
3264 | / v
3265 || [ ] <-- vector pre header.
3266 |/ |
3267 | v
3268 | [ ] \
3269 | [ ]_| <-- vector loop (created during VPlan execution).
3270 | |
3271 | v
3272 \ -[ ] <--- middle-block.
3273 \/ |
3274 /\ v
3275 | ->[ ] <--- new preheader.
3276 | |
3277 (opt) v <-- edge from middle to exit iff epilogue is not required.
3278 | [ ] \
3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3280 \ |
3281 \ v
3282 >[ ] <-- exit block(s).
3283 ...
3284 */
3285
3286 // Create an empty vector loop, and prepare basic blocks for the runtime
3287 // checks.
3288 createVectorLoopSkeleton("");
3289
3290 // Now, compare the new count to zero. If it is zero skip the vector loop and
3291 // jump to the scalar loop. This check also covers the case where the
3292 // backedge-taken count is uint##_max: adding one to it will overflow leading
3293 // to an incorrect trip count of zero. In this (rare) case we will also jump
3294 // to the scalar loop.
3295 emitIterationCountCheck(LoopScalarPreHeader);
3296
3297 // Generate the code to check any assumptions that we've made for SCEV
3298 // expressions.
3299 emitSCEVChecks(LoopScalarPreHeader);
3300
3301 // Generate the code that checks in runtime if arrays overlap. We put the
3302 // checks into a separate block to make the more common case of few elements
3303 // faster.
3304 emitMemRuntimeChecks(LoopScalarPreHeader);
3305
3306 // Emit phis for the new starting index of the scalar loop.
3307 createInductionResumeValues();
3308
3309 return {completeLoopSkeleton(), nullptr};
3310}
3311
3312// Fix up external users of the induction variable. At this point, we are
3313// in LCSSA form, with all external PHIs that use the IV having one input value,
3314// coming from the remainder loop. We need those PHIs to also have a correct
3315// value for the IV when arriving directly from the middle block.
3316void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317 const InductionDescriptor &II,
3318 Value *VectorTripCount, Value *EndValue,
3319 BasicBlock *MiddleBlock,
3320 BasicBlock *VectorHeader, VPlan &Plan) {
3321 // There are two kinds of external IV usages - those that use the value
3322 // computed in the last iteration (the PHI) and those that use the penultimate
3323 // value (the value that feeds into the phi from the loop latch).
3324 // We allow both, but they, obviously, have different values.
3325
3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block")(static_cast <bool> (OrigLoop->getUniqueExitBlock() &&
"Expected a single exit block") ? void (0) : __assert_fail (
"OrigLoop->getUniqueExitBlock() && \"Expected a single exit block\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3326, __extension__
__PRETTY_FUNCTION__))
;
3327
3328 DenseMap<Value *, Value *> MissingVals;
3329
3330 // An external user of the last iteration's value should see the value that
3331 // the remainder loop uses to initialize its own IV.
3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333 for (User *U : PostInc->users()) {
3334 Instruction *UI = cast<Instruction>(U);
3335 if (!OrigLoop->contains(UI)) {
3336 assert(isa<PHINode>(UI) && "Expected LCSSA form")(static_cast <bool> (isa<PHINode>(UI) && "Expected LCSSA form"
) ? void (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3336, __extension__
__PRETTY_FUNCTION__))
;
3337 MissingVals[UI] = EndValue;
3338 }
3339 }
3340
3341 // An external user of the penultimate value need to see EndValue - Step.
3342 // The simplest way to get this is to recompute it from the constituent SCEVs,
3343 // that is Start + (Step * (CRD - 1)).
3344 for (User *U : OrigPhi->users()) {
3345 auto *UI = cast<Instruction>(U);
3346 if (!OrigLoop->contains(UI)) {
3347 assert(isa<PHINode>(UI) && "Expected LCSSA form")(static_cast <bool> (isa<PHINode>(UI) && "Expected LCSSA form"
) ? void (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3347, __extension__
__PRETTY_FUNCTION__))
;
3348
3349 IRBuilder<> B(MiddleBlock->getTerminator());
3350
3351 // Fast-math-flags propagate from the original induction instruction.
3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354
3355 Value *CountMinusOne = B.CreateSub(
3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357 CountMinusOne->setName("cmo");
3358 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359 VectorHeader->getTerminator());
3360 Value *Escape =
3361 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362 Escape->setName("ind.escape");
3363 MissingVals[UI] = Escape;
3364 }
3365 }
3366
3367 for (auto &I : MissingVals) {
3368 PHINode *PHI = cast<PHINode>(I.first);
3369 // One corner case we have to handle is two IVs "chasing" each-other,
3370 // that is %IV2 = phi [...], [ %IV1, %latch ]
3371 // In this case, if IV1 has an external use, we need to avoid adding both
3372 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373 // don't already have an incoming value for the middle block.
3374 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375 PHI->addIncoming(I.second, MiddleBlock);
3376 Plan.removeLiveOut(PHI);
3377 }
3378 }
3379}
3380
3381namespace {
3382
3383struct CSEDenseMapInfo {
3384 static bool canHandle(const Instruction *I) {
3385 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387 }
3388
3389 static inline Instruction *getEmptyKey() {
3390 return DenseMapInfo<Instruction *>::getEmptyKey();
3391 }
3392
3393 static inline Instruction *getTombstoneKey() {
3394 return DenseMapInfo<Instruction *>::getTombstoneKey();
3395 }
3396
3397 static unsigned getHashValue(const Instruction *I) {
3398 assert(canHandle(I) && "Unknown instruction!")(static_cast <bool> (canHandle(I) && "Unknown instruction!"
) ? void (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3398, __extension__
__PRETTY_FUNCTION__))
;
3399 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400 I->value_op_end()));
3401 }
3402
3403 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406 return LHS == RHS;
3407 return LHS->isIdenticalTo(RHS);
3408 }
3409};
3410
3411} // end anonymous namespace
3412
3413///Perform cse of induction variable instructions.
3414static void cse(BasicBlock *BB) {
3415 // Perform simple cse.
3416 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418 if (!CSEDenseMapInfo::canHandle(&In))
3419 continue;
3420
3421 // Check if we can replace this instruction with any of the
3422 // visited instructions.
3423 if (Instruction *V = CSEMap.lookup(&In)) {
3424 In.replaceAllUsesWith(V);
3425 In.eraseFromParent();
3426 continue;
3427 }
3428
3429 CSEMap[&In] = &In;
3430 }
3431}
3432
3433InstructionCost
3434LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435 bool &NeedToScalarize) const {
3436 Function *F = CI->getCalledFunction();
3437 Type *ScalarRetTy = CI->getType();
3438 SmallVector<Type *, 4> Tys, ScalarTys;
3439 for (auto &ArgOp : CI->args())
3440 ScalarTys.push_back(ArgOp->getType());
3441
3442 // Estimate cost of scalarized vector call. The source operands are assumed
3443 // to be vectors, so we need to extract individual elements from there,
3444 // execute VF scalar calls, and then gather the result into the vector return
3445 // value.
3446 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447 InstructionCost ScalarCallCost =
3448 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449 if (VF.isScalar())
3450 return ScalarCallCost;
3451
3452 // Compute corresponding vector type for return value and arguments.
3453 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454 for (Type *ScalarTy : ScalarTys)
3455 Tys.push_back(ToVectorTy(ScalarTy, VF));
3456
3457 // Compute costs of unpacking argument values for the scalar calls and
3458 // packing the return values to a vector.
3459 InstructionCost ScalarizationCost =
3460 getScalarizationOverhead(CI, VF, CostKind);
3461
3462 InstructionCost Cost =
3463 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3464
3465 // If we can't emit a vector call for this function, then the currently found
3466 // cost is the cost we need to return.
3467 NeedToScalarize = true;
3468 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470
3471 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472 return Cost;
3473
3474 // If the corresponding vector cost is cheaper, return its cost.
3475 InstructionCost VectorCallCost =
3476 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477 if (VectorCallCost < Cost) {
3478 NeedToScalarize = false;
3479 Cost = VectorCallCost;
3480 }
3481 return Cost;
3482}
3483
3484static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486 return Elt;
3487 return VectorType::get(Elt, VF);
3488}
3489
3490InstructionCost
3491LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492 ElementCount VF) const {
3493 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494 assert(ID && "Expected intrinsic call!")(static_cast <bool> (ID && "Expected intrinsic call!"
) ? void (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3494, __extension__
__PRETTY_FUNCTION__))
;
3495 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496 FastMathFlags FMF;
3497 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498 FMF = FPMO->getFastMathFlags();
3499
3500 SmallVector<const Value *> Arguments(CI->args());
3501 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502 SmallVector<Type *> ParamTys;
3503 std::transform(FTy->param_begin(), FTy->param_end(),
3504 std::back_inserter(ParamTys),
3505 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506
3507 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508 dyn_cast<IntrinsicInst>(CI));
3509 return TTI.getIntrinsicInstrCost(CostAttrs,
3510 TargetTransformInfo::TCK_RecipThroughput);
3511}
3512
3513static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517}
3518
3519static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523}
3524
3525void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3526 // For every instruction `I` in MinBWs, truncate the operands, create a
3527 // truncated version of `I` and reextend its result. InstCombine runs
3528 // later and will remove any ext/trunc pairs.
3529 SmallPtrSet<Value *, 4> Erased;
3530 for (const auto &KV : Cost->getMinimalBitwidths()) {
3531 // If the value wasn't vectorized, we must maintain the original scalar
3532 // type. The absence of the value from State indicates that it
3533 // wasn't vectorized.
3534 // FIXME: Should not rely on getVPValue at this point.
3535 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536 if (!State.hasAnyVectorValue(Def))
3537 continue;
3538 for (unsigned Part = 0; Part < UF; ++Part) {
3539 Value *I = State.get(Def, Part);
3540 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541 continue;
3542 Type *OriginalTy = I->getType();
3543 Type *ScalarTruncatedTy =
3544 IntegerType::get(OriginalTy->getContext(), KV.second);
3545 auto *TruncatedTy = VectorType::get(
3546 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547 if (TruncatedTy == OriginalTy)
3548 continue;
3549
3550 IRBuilder<> B(cast<Instruction>(I));
3551 auto ShrinkOperand = [&](Value *V) -> Value * {
3552 if (auto *ZI = dyn_cast<ZExtInst>(V))
3553 if (ZI->getSrcTy() == TruncatedTy)
3554 return ZI->getOperand(0);
3555 return B.CreateZExtOrTrunc(V, TruncatedTy);
3556 };
3557
3558 // The actual instruction modification depends on the instruction type,
3559 // unfortunately.
3560 Value *NewI = nullptr;
3561 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563 ShrinkOperand(BO->getOperand(1)));
3564
3565 // Any wrapping introduced by shrinking this operation shouldn't be
3566 // considered undefined behavior. So, we can't unconditionally copy
3567 // arithmetic wrapping flags to NewI.
3568 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570 NewI =
3571 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572 ShrinkOperand(CI->getOperand(1)));
3573 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574 NewI = B.CreateSelect(SI->getCondition(),
3575 ShrinkOperand(SI->getTrueValue()),
3576 ShrinkOperand(SI->getFalseValue()));
3577 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578 switch (CI->getOpcode()) {
3579 default:
3580 llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3580)
;
3581 case Instruction::Trunc:
3582 NewI = ShrinkOperand(CI->getOperand(0));
3583 break;
3584 case Instruction::SExt:
3585 NewI = B.CreateSExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 case Instruction::ZExt:
3590 NewI = B.CreateZExtOrTrunc(
3591 CI->getOperand(0),
3592 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593 break;
3594 }
3595 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596 auto Elements0 =
3597 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598 auto *O0 = B.CreateZExtOrTrunc(
3599 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600 auto Elements1 =
3601 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602 auto *O1 = B.CreateZExtOrTrunc(
3603 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604
3605 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607 // Don't do anything with the operands, just extend the result.
3608 continue;
3609 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610 auto Elements =
3611 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612 auto *O0 = B.CreateZExtOrTrunc(
3613 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617 auto Elements =
3618 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619 auto *O0 = B.CreateZExtOrTrunc(
3620 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622 } else {
3623 // If we don't know what to do, be conservative and don't do anything.
3624 continue;
3625 }
3626
3627 // Lastly, extend the result.
3628 NewI->takeName(cast<Instruction>(I));
3629 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630 I->replaceAllUsesWith(Res);
3631 cast<Instruction>(I)->eraseFromParent();
3632 Erased.insert(I);
3633 State.reset(Def, Res, Part);
3634 }
3635 }
3636
3637 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3638 for (const auto &KV : Cost->getMinimalBitwidths()) {
3639 // If the value wasn't vectorized, we must maintain the original scalar
3640 // type. The absence of the value from State indicates that it
3641 // wasn't vectorized.
3642 // FIXME: Should not rely on getVPValue at this point.
3643 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644 if (!State.hasAnyVectorValue(Def))
3645 continue;
3646 for (unsigned Part = 0; Part < UF; ++Part) {
3647 Value *I = State.get(Def, Part);
3648 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649 if (Inst && Inst->use_empty()) {
3650 Value *NewI = Inst->getOperand(0);
3651 Inst->eraseFromParent();
3652 State.reset(Def, NewI, Part);
3653 }
3654 }
3655 }
3656}
3657
3658void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659 VPlan &Plan) {
3660 // Insert truncates and extends for any truncated instructions as hints to
3661 // InstCombine.
3662 if (VF.isVector())
3663 truncateToMinimalBitwidths(State);
3664
3665 // Fix widened non-induction PHIs by setting up the PHI operands.
3666 if (EnableVPlanNativePath)
3667 fixNonInductionPHIs(Plan, State);
3668
3669 // At this point every instruction in the original loop is widened to a
3670 // vector form. Now we need to fix the recurrences in the loop. These PHI
3671 // nodes are currently empty because we did not want to introduce cycles.
3672 // This is the second stage of vectorizing recurrences.
3673 fixCrossIterationPHIs(State);
3674
3675 // Forget the original basic block.
3676 PSE.getSE()->forgetLoop(OrigLoop);
3677
3678 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680 if (Cost->requiresScalarEpilogue(VF)) {
3681 // No edge from the middle block to the unique exit block has been inserted
3682 // and there is nothing to fix from vector loop; phis should have incoming
3683 // from scalar loop only.
3684 Plan.clearLiveOuts();
3685 } else {
3686 // If we inserted an edge from the middle block to the unique exit block,
3687 // update uses outside the loop (phis) to account for the newly inserted
3688 // edge.
3689
3690 // Fix-up external users of the induction variables.
3691 for (const auto &Entry : Legal->getInductionVars())
3692 fixupIVUsers(Entry.first, Entry.second,
3693 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694 IVEndValues[Entry.first], LoopMiddleBlock,
3695 VectorLoop->getHeader(), Plan);
3696 }
3697
3698 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699 // in the exit block, so update the builder.
3700 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701 for (const auto &KV : Plan.getLiveOuts())
3702 KV.second->fixPhi(Plan, State);
3703
3704 for (Instruction *PI : PredicatedInstructions)
3705 sinkScalarOperands(&*PI);
3706
3707 // Remove redundant induction instructions.
3708 cse(VectorLoop->getHeader());
3709
3710 // Set/update profile weights for the vector and remainder loops as original
3711 // loop iterations are now distributed among them. Note that original loop
3712 // represented by LoopScalarBody becomes remainder loop after vectorization.
3713 //
3714 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3715 // end up getting slightly roughened result but that should be OK since
3716 // profile is not inherently precise anyway. Note also possible bypass of
3717 // vector code caused by legality checks is ignored, assigning all the weight
3718 // to the vector loop, optimistically.
3719 //
3720 // For scalable vectorization we can't know at compile time how many iterations
3721 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3722 // vscale of '1'.
3723 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724 LI->getLoopFor(LoopScalarBody),
3725 VF.getKnownMinValue() * UF);
3726}
3727
3728void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729 // In order to support recurrences we need to be able to vectorize Phi nodes.
3730 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731 // stage #2: We now need to fix the recurrences by adding incoming edges to
3732 // the currently empty PHI nodes. At this point every instruction in the
3733 // original loop is widened to a vector form so we can use them to construct
3734 // the incoming edges.
3735 VPBasicBlock *Header =
3736 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737 for (VPRecipeBase &R : Header->phis()) {
3738 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739 fixReduction(ReductionPhi, State);
3740 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741 fixFixedOrderRecurrence(FOR, State);
3742 }
3743}
3744
3745void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747 // This is the second phase of vectorizing first-order recurrences. An
3748 // overview of the transformation is described below. Suppose we have the
3749 // following loop.
3750 //
3751 // for (int i = 0; i < n; ++i)
3752 // b[i] = a[i] - a[i - 1];
3753 //
3754 // There is a first-order recurrence on "a". For this loop, the shorthand
3755 // scalar IR looks like:
3756 //
3757 // scalar.ph:
3758 // s_init = a[-1]
3759 // br scalar.body
3760 //
3761 // scalar.body:
3762 // i = phi [0, scalar.ph], [i+1, scalar.body]
3763 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764 // s2 = a[i]
3765 // b[i] = s2 - s1
3766 // br cond, scalar.body, ...
3767 //
3768 // In this example, s1 is a recurrence because it's value depends on the
3769 // previous iteration. In the first phase of vectorization, we created a
3770 // vector phi v1 for s1. We now complete the vectorization and produce the
3771 // shorthand vector IR shown below (for VF = 4, UF = 1).
3772 //
3773 // vector.ph:
3774 // v_init = vector(..., ..., ..., a[-1])
3775 // br vector.body
3776 //
3777 // vector.body
3778 // i = phi [0, vector.ph], [i+4, vector.body]
3779 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3780 // v2 = a[i, i+1, i+2, i+3];
3781 // v3 = vector(v1(3), v2(0, 1, 2))
3782 // b[i, i+1, i+2, i+3] = v2 - v3
3783 // br cond, vector.body, middle.block
3784 //
3785 // middle.block:
3786 // x = v2(3)
3787 // br scalar.ph
3788 //
3789 // scalar.ph:
3790 // s_init = phi [x, middle.block], [a[-1], otherwise]
3791 // br scalar.body
3792 //
3793 // After execution completes the vector loop, we extract the next value of
3794 // the recurrence (x) to use as the initial value in the scalar loop.
3795
3796 // Extract the last vector element in the middle block. This will be the
3797 // initial value for the recurrence when jumping to the scalar loop.
3798 VPValue *PreviousDef = PhiR->getBackedgeValue();
3799 Value *Incoming = State.get(PreviousDef, UF - 1);
3800 auto *ExtractForScalar = Incoming;
3801 auto *IdxTy = Builder.getInt32Ty();
3802 if (VF.isVector()) {
3803 auto *One = ConstantInt::get(IdxTy, 1);
3804 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808 "vector.recur.extract");
3809 }
3810 // Extract the second last element in the middle block if the
3811 // Phi is used outside the loop. We need to extract the phi itself
3812 // and not the last element (the phi update in the current iteration). This
3813 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3814 // when the scalar loop is not run at all.
3815 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816 if (VF.isVector()) {
3817 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820 Incoming, Idx, "vector.recur.extract.for.phi");
3821 } else if (UF > 1)
3822 // When loop is unrolled without vectorizing, initialize
3823 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3824 // of `Incoming`. This is analogous to the vectorized case above: extracting
3825 // the second last element when VF > 1.
3826 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827
3828 // Fix the initial value of the original recurrence in the scalar loop.
3829 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835 Start->addIncoming(Incoming, BB);
3836 }
3837
3838 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839 Phi->setName("scalar.recur");
3840
3841 // Finally, fix users of the recurrence outside the loop. The users will need
3842 // either the last value of the scalar recurrence or the last value of the
3843 // vector recurrence we extracted in the middle block. Since the loop is in
3844 // LCSSA form, we just need to find all the phi nodes for the original scalar
3845 // recurrence in the exit block, and then add an edge for the middle block.
3846 // Note that LCSSA does not imply single entry when the original scalar loop
3847 // had multiple exiting edges (as we always run the last iteration in the
3848 // scalar epilogue); in that case, there is no edge from middle to exit and
3849 // and thus no phis which needed updated.
3850 if (!Cost->requiresScalarEpilogue(VF))
3851 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854 State.Plan->removeLiveOut(&LCSSAPhi);
3855 }
3856}
3857
3858void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859 VPTransformState &State) {
3860 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861 // Get it's reduction variable descriptor.
3862 assert(Legal->isReductionVariable(OrigPhi) &&(static_cast <bool> (Legal->isReductionVariable(OrigPhi
) && "Unable to find the reduction variable") ? void (
0) : __assert_fail ("Legal->isReductionVariable(OrigPhi) && \"Unable to find the reduction variable\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3863, __extension__
__PRETTY_FUNCTION__))
3863 "Unable to find the reduction variable")(static_cast <bool> (Legal->isReductionVariable(OrigPhi
) && "Unable to find the reduction variable") ? void (
0) : __assert_fail ("Legal->isReductionVariable(OrigPhi) && \"Unable to find the reduction variable\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3863, __extension__
__PRETTY_FUNCTION__))
;
3864 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865
3866 RecurKind RK = RdxDesc.getRecurrenceKind();
3867 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869 State.setDebugLocFromInst(ReductionStartValue);
3870
3871 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872 // This is the vector-clone of the value that leaves the loop.
3873 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874
3875 // Wrap flags are in general invalid after vectorization, clear them.
3876 clearReductionWrapFlags(PhiR, State);
3877
3878 // Before each round, move the insertion point right between
3879 // the PHIs and the values we are going to write.
3880 // This allows us to write both PHINodes and the extractelement
3881 // instructions.
3882 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883
3884 State.setDebugLocFromInst(LoopExitInst);
3885
3886 Type *PhiTy = OrigPhi->getType();
3887
3888 VPBasicBlock *LatchVPBB =
3889 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891 // If tail is folded by masking, the vector value to leave the loop should be
3892 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893 // instead of the former. For an inloop reduction the reduction will already
3894 // be predicated, and does not need to be handled here.
3895 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896 for (unsigned Part = 0; Part < UF; ++Part) {
3897 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898 SelectInst *Sel = nullptr;
3899 for (User *U : VecLoopExitInst->users()) {
3900 if (isa<SelectInst>(U)) {
3901 assert(!Sel && "Reduction exit feeding two selects")(static_cast <bool> (!Sel && "Reduction exit feeding two selects"
) ? void (0) : __assert_fail ("!Sel && \"Reduction exit feeding two selects\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3901, __extension__
__PRETTY_FUNCTION__))
;
3902 Sel = cast<SelectInst>(U);
3903 } else
3904 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select")(static_cast <bool> (isa<PHINode>(U) && "Reduction exit must feed Phi's or select"
) ? void (0) : __assert_fail ("isa<PHINode>(U) && \"Reduction exit must feed Phi's or select\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3904, __extension__
__PRETTY_FUNCTION__))
;
3905 }
3906 assert(Sel && "Reduction exit feeds no select")(static_cast <bool> (Sel && "Reduction exit feeds no select"
) ? void (0) : __assert_fail ("Sel && \"Reduction exit feeds no select\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3906, __extension__
__PRETTY_FUNCTION__))
;
3907 State.reset(LoopExitInstDef, Sel, Part);
3908
3909 if (isa<FPMathOperator>(Sel))
3910 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911
3912 // If the target can create a predicated operator for the reduction at no
3913 // extra cost in the loop (for example a predicated vadd), it can be
3914 // cheaper for the select to remain in the loop than be sunk out of it,
3915 // and so use the select value for the phi instead of the old
3916 // LoopExitValue.
3917 if (PreferPredicatedReductionSelect ||
3918 TTI->preferPredicatedReductionSelect(
3919 RdxDesc.getOpcode(), PhiTy,
3920 TargetTransformInfo::ReductionFlags())) {
3921 auto *VecRdxPhi =
3922 cast<PHINode>(State.get(PhiR, Part));
3923 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924 }
3925 }
3926 }
3927
3928 // If the vector reduction can be performed in a smaller type, we truncate
3929 // then extend the loop exit value to enable InstCombine to evaluate the
3930 // entire expression in the smaller type.
3931 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!")(static_cast <bool> (!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"
) ? void (0) : __assert_fail ("!PhiR->isInLoop() && \"Unexpected truncated inloop reduction!\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 3932, __extension__
__PRETTY_FUNCTION__))
;
3933 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935 VectorParts RdxParts(UF);
3936 for (unsigned Part = 0; Part < UF; ++Part) {
3937 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940 : Builder.CreateZExt(Trunc, VecTy);
3941 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942 if (U != Trunc) {
3943 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944 RdxParts[Part] = Extnd;
3945 }
3946 }
3947 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951 }
3952 }
3953
3954 // Reduce all of the unrolled parts into a single vector.
3955 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957
3958 // The middle block terminator has already been assigned a DebugLoc here (the
3959 // OrigLoop's single latch terminator). We want the whole middle block to
3960 // appear to execute on this line because: (a) it is all compiler generated,
3961 // (b) these instructions are always executed after evaluating the latch
3962 // conditional branch, and (c) other passes may add new predecessors which
3963 // terminate on this line. This is the easiest way to ensure we don't
3964 // accidentally cause an extra step back into the loop while debugging.
3965 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966 if (PhiR->isOrdered())
3967 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968 else {
3969 // Floating-point operations should have some FMF to enable the reduction.
3970 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972 for (unsigned Part = 1; Part < UF; ++Part) {
3973 Value *RdxPart = State.get(LoopExitInstDef, Part);
3974 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975 ReducedPartRdx = Builder.CreateBinOp(
3976 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979 ReducedPartRdx, RdxPart);
3980 else
3981 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982 }
3983 }
3984
3985 // Create the reduction after the loop. Note that inloop reductions create the
3986 // target reduction in the loop using a Reduction recipe.
3987 if (VF.isVector() && !PhiR->isInLoop()) {
3988 ReducedPartRdx =
3989 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990 // If the reduction can be performed in a smaller type, we need to extend
3991 // the reduction to the wider type before we branch to the original loop.
3992 if (PhiTy != RdxDesc.getRecurrenceType())
3993 ReducedPartRdx = RdxDesc.isSigned()
3994 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996 }
3997
3998 PHINode *ResumePhi =
3999 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000
4001 // Create a phi node that merges control-flow from the backedge-taken check
4002 // block and the middle block.
4003 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004 LoopScalarPreHeader->getTerminator());
4005
4006 // If we are fixing reductions in the epilogue loop then we should already
4007 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008 // we carry over the incoming values correctly.
4009 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010 if (Incoming == LoopMiddleBlock)
4011 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014 Incoming);
4015 else
4016 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017 }
4018
4019 // Set the resume value for this reduction
4020 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021
4022 // If there were stores of the reduction value to a uniform memory address
4023 // inside the loop, create the final store here.
4024 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025 StoreInst *NewSI =
4026 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027 propagateMetadata(NewSI, SI);
4028
4029 // If the reduction value is used in other places,
4030 // then let the code below create PHI's for that.
4031 }
4032
4033 // Now, we need to fix the users of the reduction variable
4034 // inside and outside of the scalar remainder loop.
4035
4036 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037 // in the exit blocks. See comment on analogous loop in
4038 // fixFixedOrderRecurrence for a more complete explaination of the logic.
4039 if (!Cost->requiresScalarEpilogue(VF))
4040 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043 State.Plan->removeLiveOut(&LCSSAPhi);
4044 }
4045
4046 // Fix the scalar loop reduction variable with the incoming reduction sum
4047 // from the vector body and from the backedge value.
4048 int IncomingEdgeBlockIdx =
4049 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")(static_cast <bool> (IncomingEdgeBlockIdx >= 0 &&
"Invalid block index") ? void (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4050, __extension__
__PRETTY_FUNCTION__))
;
4051 // Pick the other block.
4052 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055}
4056
4057void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058 VPTransformState &State) {
4059 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060 RecurKind RK = RdxDesc.getRecurrenceKind();
4061 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062 return;
4063
4064 SmallVector<VPValue *, 8> Worklist;
4065 SmallPtrSet<VPValue *, 8> Visited;
4066 Worklist.push_back(PhiR);
4067 Visited.insert(PhiR);
4068
4069 while (!Worklist.empty()) {
4070 VPValue *Cur = Worklist.pop_back_val();
4071 for (unsigned Part = 0; Part < UF; ++Part) {
4072 Value *V = State.get(Cur, Part);
4073 if (!isa<OverflowingBinaryOperator>(V))
4074 break;
4075 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4076 }
4077
4078 for (VPUser *U : Cur->users()) {
4079 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4080 if (!UserRecipe)
4081 continue;
4082 for (VPValue *V : UserRecipe->definedValues())
4083 if (Visited.insert(V).second)
4084 Worklist.push_back(V);
4085 }
4086 }
4087}
4088
4089void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090 // The basic block and loop containing the predicated instruction.
4091 auto *PredBB = PredInst->getParent();
4092 auto *VectorLoop = LI->getLoopFor(PredBB);
4093
4094 // Initialize a worklist with the operands of the predicated instruction.
4095 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096
4097 // Holds instructions that we need to analyze again. An instruction may be
4098 // reanalyzed if we don't yet know if we can sink it or not.
4099 SmallVector<Instruction *, 8> InstsToReanalyze;
4100
4101 // Returns true if a given use occurs in the predicated block. Phi nodes use
4102 // their operands in their corresponding predecessor blocks.
4103 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104 auto *I = cast<Instruction>(U.getUser());
4105 BasicBlock *BB = I->getParent();
4106 if (auto *Phi = dyn_cast<PHINode>(I))
4107 BB = Phi->getIncomingBlock(
4108 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109 return BB == PredBB;
4110 };
4111
4112 // Iteratively sink the scalarized operands of the predicated instruction
4113 // into the block we created for it. When an instruction is sunk, it's
4114 // operands are then added to the worklist. The algorithm ends after one pass
4115 // through the worklist doesn't sink a single instruction.
4116 bool Changed;
4117 do {
4118 // Add the instructions that need to be reanalyzed to the worklist, and
4119 // reset the changed indicator.
4120 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121 InstsToReanalyze.clear();
4122 Changed = false;
4123
4124 while (!Worklist.empty()) {
4125 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126
4127 // We can't sink an instruction if it is a phi node, is not in the loop,
4128 // or may have side effects.
4129 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130 I->mayHaveSideEffects())
4131 continue;
4132
4133 // If the instruction is already in PredBB, check if we can sink its
4134 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135 // sinking the scalar instruction I, hence it appears in PredBB; but it
4136 // may have failed to sink I's operands (recursively), which we try
4137 // (again) here.
4138 if (I->getParent() == PredBB) {
4139 Worklist.insert(I->op_begin(), I->op_end());
4140 continue;
4141 }
4142
4143 // It's legal to sink the instruction if all its uses occur in the
4144 // predicated block. Otherwise, there's nothing to do yet, and we may
4145 // need to reanalyze the instruction.
4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147 InstsToReanalyze.push_back(I);
4148 continue;
4149 }
4150
4151 // Move the instruction to the beginning of the predicated block, and add
4152 // it's operands to the worklist.
4153 I->moveBefore(&*PredBB->getFirstInsertionPt());
4154 Worklist.insert(I->op_begin(), I->op_end());
4155
4156 // The sinking may have enabled other instructions to be sunk, so we will
4157 // need to iterate.
4158 Changed = true;
4159 }
4160 } while (Changed);
4161}
4162
4163void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164 VPTransformState &State) {
4165 auto Iter = vp_depth_first_deep(Plan.getEntry());
4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167 for (VPRecipeBase &P : VPBB->phis()) {
4168 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169 if (!VPPhi)
4170 continue;
4171 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172 // Make sure the builder has a valid insert point.
4173 Builder.SetInsertPoint(NewPhi);
4174 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175 VPValue *Inc = VPPhi->getIncomingValue(i);
4176 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178 }
4179 }
4180 }
4181}
4182
4183bool InnerLoopVectorizer::useOrderedReductions(
4184 const RecurrenceDescriptor &RdxDesc) {
4185 return Cost->useOrderedReductions(RdxDesc);
4186}
4187
4188void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189 // We should not collect Scalars more than once per VF. Right now, this
4190 // function is called from collectUniformsAndScalars(), which already does
4191 // this check. Collecting Scalars for VF=1 does not make any sense.
4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&(static_cast <bool> (VF.isVector() && Scalars.find
(VF) == Scalars.end() && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF.isVector() && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4193, __extension__
__PRETTY_FUNCTION__))
4193 "This function should not be visited twice for the same VF")(static_cast <bool> (VF.isVector() && Scalars.find
(VF) == Scalars.end() && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF.isVector() && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4193, __extension__
__PRETTY_FUNCTION__))
;
4194
4195 // This avoids any chances of creating a REPLICATE recipe during planning
4196 // since that would result in generation of scalarized code during execution,
4197 // which is not supported for scalable vectors.
4198 if (VF.isScalable()) {
4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200 return;
4201 }
4202
4203 SmallSetVector<Instruction *, 8> Worklist;
4204
4205 // These sets are used to seed the analysis with pointers used by memory
4206 // accesses that will remain scalar.
4207 SmallSetVector<Instruction *, 8> ScalarPtrs;
4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209 auto *Latch = TheLoop->getLoopLatch();
4210
4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212 // The pointer operands of loads and stores will be scalar as long as the
4213 // memory access is not a gather or scatter operation. The value operand of a
4214 // store will remain scalar if the store is scalarized.
4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217 assert(WideningDecision != CM_Unknown &&(static_cast <bool> (WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4218, __extension__
__PRETTY_FUNCTION__))
4218 "Widening decision should be ready at this moment")(static_cast <bool> (WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4218, __extension__
__PRETTY_FUNCTION__))
;
4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220 if (Ptr == Store->getValueOperand())
4221 return WideningDecision == CM_Scalarize;
4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&(static_cast <bool> (Ptr == getLoadStorePointerOperand(
MemAccess) && "Ptr is neither a value or pointer operand"
) ? void (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4223, __extension__
__PRETTY_FUNCTION__))
4223 "Ptr is neither a value or pointer operand")(static_cast <bool> (Ptr == getLoadStorePointerOperand(
MemAccess) && "Ptr is neither a value or pointer operand"
) ? void (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4223, __extension__
__PRETTY_FUNCTION__))
;
4224 return WideningDecision != CM_GatherScatter;
4225 };
4226
4227 // A helper that returns true if the given value is a bitcast or
4228 // getelementptr instruction contained in the loop.
4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231 isa<GetElementPtrInst>(V)) &&
4232 !TheLoop->isLoopInvariant(V);
4233 };
4234
4235 // A helper that evaluates a memory access's use of a pointer. If the use will
4236 // be a scalar use and the pointer is only used by memory accesses, we place
4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238 // PossibleNonScalarPtrs.
4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240 // We only care about bitcast and getelementptr instructions contained in
4241 // the loop.
4242 if (!isLoopVaryingBitCastOrGEP(Ptr))
4243 return;
4244
4245 // If the pointer has already been identified as scalar (e.g., if it was
4246 // also identified as uniform), there's nothing to do.
4247 auto *I = cast<Instruction>(Ptr);
4248 if (Worklist.count(I))
4249 return;
4250
4251 // If the use of the pointer will be a scalar use, and all users of the
4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253 // place the pointer in PossibleNonScalarPtrs.
4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255 return isa<LoadInst>(U) || isa<StoreInst>(U);
4256 }))
4257 ScalarPtrs.insert(I);
4258 else
4259 PossibleNonScalarPtrs.insert(I);
4260 };
4261
4262 // We seed the scalars analysis with three classes of instructions: (1)
4263 // instructions marked uniform-after-vectorization and (2) bitcast,
4264 // getelementptr and (pointer) phi instructions used by memory accesses
4265 // requiring a scalar use.
4266 //
4267 // (1) Add to the worklist all instructions that have been identified as
4268 // uniform-after-vectorization.
4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270
4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272 // memory accesses requiring a scalar use. The pointer operands of loads and
4273 // stores will be scalar as long as the memory accesses is not a gather or
4274 // scatter operation. The value operand of a store will remain scalar if the
4275 // store is scalarized.
4276 for (auto *BB : TheLoop->blocks())
4277 for (auto &I : *BB) {
4278 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279 evaluatePtrUse(Load, Load->getPointerOperand());
4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281 evaluatePtrUse(Store, Store->getPointerOperand());
4282 evaluatePtrUse(Store, Store->getValueOperand());
4283 }
4284 }
4285 for (auto *I : ScalarPtrs)
4286 if (!PossibleNonScalarPtrs.count(I)) {
4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *I << "\n"; } } while (false)
;
4288 Worklist.insert(I);
4289 }
4290
4291 // Insert the forced scalars.
4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293 // induction variable when the PHI user is scalarized.
4294 auto ForcedScalar = ForcedScalars.find(VF);
4295 if (ForcedScalar != ForcedScalars.end())
4296 for (auto *I : ForcedScalar->second) {
4297 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found (forced) scalar instruction: "
<< *I << "\n"; } } while (false)
;
4298 Worklist.insert(I);
4299 }
4300
4301 // Expand the worklist by looking through any bitcasts and getelementptr
4302 // instructions we've already identified as scalar. This is similar to the
4303 // expansion step in collectLoopUniforms(); however, here we're only
4304 // expanding to include additional bitcasts and getelementptr instructions.
4305 unsigned Idx = 0;
4306 while (Idx != Worklist.size()) {
4307 Instruction *Dst = Worklist[Idx++];
4308 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309 continue;
4310 auto *Src = cast<Instruction>(Dst->getOperand(0));
4311 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312 auto *J = cast<Instruction>(U);
4313 return !TheLoop->contains(J) || Worklist.count(J) ||
4314 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315 isScalarUse(J, Src));
4316 })) {
4317 Worklist.insert(Src);
4318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *Src << "\n"; } } while (false)
;
4319 }
4320 }
4321
4322 // An induction variable will remain scalar if all users of the induction
4323 // variable and induction variable update remain scalar.
4324 for (const auto &Induction : Legal->getInductionVars()) {
4325 auto *Ind = Induction.first;
4326 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327
4328 // If tail-folding is applied, the primary induction variable will be used
4329 // to feed a vector compare.
4330 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331 continue;
4332
4333 // Returns true if \p Indvar is a pointer induction that is used directly by
4334 // load/store instruction \p I.
4335 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336 Instruction *I) {
4337 return Induction.second.getKind() ==
4338 InductionDescriptor::IK_PtrInduction &&
4339 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341 };
4342
4343 // Determine if all users of the induction variable are scalar after
4344 // vectorization.
4345 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346 auto *I = cast<Instruction>(U);
4347 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349 });
4350 if (!ScalarInd)
4351 continue;
4352
4353 // Determine if all users of the induction variable update instruction are
4354 // scalar after vectorization.
4355 auto ScalarIndUpdate =
4356 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357 auto *I = cast<Instruction>(U);
4358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360 });
4361 if (!ScalarIndUpdate)
4362 continue;
4363
4364 // The induction variable and its update instruction will remain scalar.
4365 Worklist.insert(Ind);
4366 Worklist.insert(IndUpdate);
4367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *Ind << "\n"; } } while (false)
;
4368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *IndUpdate << "\n"; } } while (false)
4369 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *IndUpdate << "\n"; } } while (false)
;
4370 }
4371
4372 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373}
4374
4375bool LoopVectorizationCostModel::isScalarWithPredication(
4376 Instruction *I, ElementCount VF) const {
4377 if (!isPredicatedInst(I))
4378 return false;
4379
4380 // Do we have a non-scalar lowering for this predicated
4381 // instruction? No - it is scalar with predication.
4382 switch(I->getOpcode()) {
4383 default:
4384 return true;
4385 case Instruction::Load:
4386 case Instruction::Store: {
4387 auto *Ptr = getLoadStorePointerOperand(I);
4388 auto *Ty = getLoadStoreType(I);
4389 Type *VTy = Ty;
4390 if (VF.isVector())
4391 VTy = VectorType::get(Ty, VF);
4392 const Align Alignment = getLoadStoreAlignment(I);
4393 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394 TTI.isLegalMaskedGather(VTy, Alignment))
4395 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396 TTI.isLegalMaskedScatter(VTy, Alignment));
4397 }
4398 case Instruction::UDiv:
4399 case Instruction::SDiv:
4400 case Instruction::SRem:
4401 case Instruction::URem: {
4402 // We have the option to use the safe-divisor idiom to avoid predication.
4403 // The cost based decision here will always select safe-divisor for
4404 // scalable vectors as scalarization isn't legal.
4405 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407 }
4408 }
4409}
4410
4411bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413 return false;
4414
4415 // Can we prove this instruction is safe to unconditionally execute?
4416 // If not, we must use some form of predication.
4417 switch(I->getOpcode()) {
4418 default:
4419 return false;
4420 case Instruction::Load:
4421 case Instruction::Store: {
4422 if (!Legal->isMaskRequired(I))
4423 return false;
4424 // When we know the load's address is loop invariant and the instruction
4425 // in the original scalar loop was unconditionally executed then we
4426 // don't need to mark it as a predicated instruction. Tail folding may
4427 // introduce additional predication, but we're guaranteed to always have
4428 // at least one active lane. We call Legal->blockNeedsPredication here
4429 // because it doesn't query tail-folding. For stores, we need to prove
4430 // both speculation safety (which follows from the same argument as loads),
4431 // but also must prove the value being stored is correct. The easiest
4432 // form of the later is to require that all values stored are the same.
4433 if (Legal->isUniformMemOp(*I) &&
4434 (isa<LoadInst>(I) ||
4435 (isa<StoreInst>(I) &&
4436 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4437 !Legal->blockNeedsPredication(I->getParent()))
4438 return false;
4439 return true;
4440 }
4441 case Instruction::UDiv:
4442 case Instruction::SDiv:
4443 case Instruction::SRem:
4444 case Instruction::URem:
4445 // TODO: We can use the loop-preheader as context point here and get
4446 // context sensitive reasoning
4447 return !isSafeToSpeculativelyExecute(I);
4448 }
4449}
4450
4451std::pair<InstructionCost, InstructionCost>
4452LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453 ElementCount VF) const {
4454 assert(I->getOpcode() == Instruction::UDiv ||(static_cast <bool> (I->getOpcode() == Instruction::
UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode
() == Instruction::SRem || I->getOpcode() == Instruction::
URem) ? void (0) : __assert_fail ("I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4457, __extension__
__PRETTY_FUNCTION__))
4455 I->getOpcode() == Instruction::SDiv ||(static_cast <bool> (I->getOpcode() == Instruction::
UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode
() == Instruction::SRem || I->getOpcode() == Instruction::
URem) ? void (0) : __assert_fail ("I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4457, __extension__
__PRETTY_FUNCTION__))
4456 I->getOpcode() == Instruction::SRem ||(static_cast <bool> (I->getOpcode() == Instruction::
UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode
() == Instruction::SRem || I->getOpcode() == Instruction::
URem) ? void (0) : __assert_fail ("I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4457, __extension__
__PRETTY_FUNCTION__))
4457 I->getOpcode() == Instruction::URem)(static_cast <bool> (I->getOpcode() == Instruction::
UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode
() == Instruction::SRem || I->getOpcode() == Instruction::
URem) ? void (0) : __assert_fail ("I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4457, __extension__
__PRETTY_FUNCTION__))
;
4458 assert(!isSafeToSpeculativelyExecute(I))(static_cast <bool> (!isSafeToSpeculativelyExecute(I)) ?
void (0) : __assert_fail ("!isSafeToSpeculativelyExecute(I)"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4458, __extension__
__PRETTY_FUNCTION__))
;
4459
4460 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461
4462 // Scalarization isn't legal for scalable vector types
4463 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464 if (!VF.isScalable()) {
4465 // Get the scalarization cost and scale this amount by the probability of
4466 // executing the predicated block. If the instruction is not predicated,
4467 // we fall through to the next case.
4468 ScalarizationCost = 0;
4469
4470 // These instructions have a non-void type, so account for the phi nodes
4471 // that we will create. This cost is likely to be zero. The phi node
4472 // cost, if any, should be scaled by the block probability because it
4473 // models a copy at the end of each predicated block.
4474 ScalarizationCost += VF.getKnownMinValue() *
4475 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476
4477 // The cost of the non-predicated instruction.
4478 ScalarizationCost += VF.getKnownMinValue() *
4479 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480
4481 // The cost of insertelement and extractelement instructions needed for
4482 // scalarization.
4483 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484
4485 // Scale the cost by the probability of executing the predicated blocks.
4486 // This assumes the predicated block for each vector lane is equally
4487 // likely.
4488 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4489 }
4490 InstructionCost SafeDivisorCost = 0;
4491
4492 auto *VecTy = ToVectorTy(I->getType(), VF);
4493
4494 // The cost of the select guard to ensure all lanes are well defined
4495 // after we speculate above any internal control flow.
4496 SafeDivisorCost += TTI.getCmpSelInstrCost(
4497 Instruction::Select, VecTy,
4498 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500
4501 // Certain instructions can be cheaper to vectorize if they have a constant
4502 // second vector operand. One example of this are shifts on x86.
4503 Value *Op2 = I->getOperand(1);
4504 auto Op2Info = TTI.getOperandInfo(Op2);
4505 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507
4508 SmallVector<const Value *, 4> Operands(I->operand_values());
4509 SafeDivisorCost += TTI.getArithmeticInstrCost(
4510 I->getOpcode(), VecTy, CostKind,
4511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512 Op2Info, Operands, I);
4513 return {ScalarizationCost, SafeDivisorCost};
4514}
4515
4516bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517 Instruction *I, ElementCount VF) {
4518 assert(isAccessInterleaved(I) && "Expecting interleaved access.")(static_cast <bool> (isAccessInterleaved(I) && "Expecting interleaved access."
) ? void (0) : __assert_fail ("isAccessInterleaved(I) && \"Expecting interleaved access.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4518, __extension__
__PRETTY_FUNCTION__))
;
4519 assert(getWideningDecision(I, VF) == CM_Unknown &&(static_cast <bool> (getWideningDecision(I, VF) == CM_Unknown
&& "Decision should not be set yet.") ? void (0) : __assert_fail
("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4520, __extension__
__PRETTY_FUNCTION__))
4520 "Decision should not be set yet.")(static_cast <bool> (getWideningDecision(I, VF) == CM_Unknown
&& "Decision should not be set yet.") ? void (0) : __assert_fail
("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4520, __extension__
__PRETTY_FUNCTION__))
;
4521 auto *Group = getInterleavedAccessGroup(I);
4522 assert(Group && "Must have a group.")(static_cast <bool> (Group && "Must have a group."
) ? void (0) : __assert_fail ("Group && \"Must have a group.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4522, __extension__
__PRETTY_FUNCTION__))
;
4523
4524 // If the instruction's allocated size doesn't equal it's type size, it
4525 // requires padding and will be scalarized.
4526 auto &DL = I->getModule()->getDataLayout();
4527 auto *ScalarTy = getLoadStoreType(I);
4528 if (hasIrregularType(ScalarTy, DL))
4529 return false;
4530
4531 // If the group involves a non-integral pointer, we may not be able to
4532 // losslessly cast all values to a common type.
4533 unsigned InterleaveFactor = Group->getFactor();
4534 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535 for (unsigned i = 0; i < InterleaveFactor; i++) {
4536 Instruction *Member = Group->getMember(i);
4537 if (!Member)
4538 continue;
4539 auto *MemberTy = getLoadStoreType(Member);
4540 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541 // Don't coerce non-integral pointers to integers or vice versa.
4542 if (MemberNI != ScalarNI) {
4543 // TODO: Consider adding special nullptr value case here
4544 return false;
4545 } else if (MemberNI && ScalarNI &&
4546 ScalarTy->getPointerAddressSpace() !=
4547 MemberTy->getPointerAddressSpace()) {
4548 return false;
4549 }
4550 }
4551
4552 // Check if masking is required.
4553 // A Group may need masking for one of two reasons: it resides in a block that
4554 // needs predication, or it was decided to use masking to deal with gaps
4555 // (either a gap at the end of a load-access that may result in a speculative
4556 // load, or any gaps in a store-access).
4557 bool PredicatedAccessRequiresMasking =
4558 blockNeedsPredicationForAnyReason(I->getParent()) &&
4559 Legal->isMaskRequired(I);
4560 bool LoadAccessWithGapsRequiresEpilogMasking =
4561 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562 !isScalarEpilogueAllowed();
4563 bool StoreAccessWithGapsRequiresMasking =
4564 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565 if (!PredicatedAccessRequiresMasking &&
4566 !LoadAccessWithGapsRequiresEpilogMasking &&
4567 !StoreAccessWithGapsRequiresMasking)
4568 return true;
4569
4570 // If masked interleaving is required, we expect that the user/target had
4571 // enabled it, because otherwise it either wouldn't have been created or
4572 // it should have been invalidated by the CostModel.
4573 assert(useMaskedInterleavedAccesses(TTI) &&(static_cast <bool> (useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled."
) ? void (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4574, __extension__
__PRETTY_FUNCTION__))
4574 "Masked interleave-groups for predicated accesses are not enabled.")(static_cast <bool> (useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled."
) ? void (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4574, __extension__
__PRETTY_FUNCTION__))
;
4575
4576 if (Group->isReverse())
4577 return false;
4578
4579 auto *Ty = getLoadStoreType(I);
4580 const Align Alignment = getLoadStoreAlignment(I);
4581 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582 : TTI.isLegalMaskedStore(Ty, Alignment);
4583}
4584
4585bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586 Instruction *I, ElementCount VF) {
4587 // Get and ensure we have a valid memory instruction.
4588 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction")(static_cast <bool> ((isa<LoadInst, StoreInst>(I)
) && "Invalid memory instruction") ? void (0) : __assert_fail
("(isa<LoadInst, StoreInst>(I)) && \"Invalid memory instruction\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4588, __extension__
__PRETTY_FUNCTION__))
;
4589
4590 auto *Ptr = getLoadStorePointerOperand(I);
4591 auto *ScalarTy = getLoadStoreType(I);
4592
4593 // In order to be widened, the pointer should be consecutive, first of all.
4594 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595 return false;
4596
4597 // If the instruction is a store located in a predicated block, it will be
4598 // scalarized.
4599 if (isScalarWithPredication(I, VF))
4600 return false;
4601
4602 // If the instruction's allocated size doesn't equal it's type size, it
4603 // requires padding and will be scalarized.
4604 auto &DL = I->getModule()->getDataLayout();
4605 if (hasIrregularType(ScalarTy, DL))
4606 return false;
4607
4608 return true;
4609}
4610
4611void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612 // We should not collect Uniforms more than once per VF. Right now,
4613 // this function is called from collectUniformsAndScalars(), which
4614 // already does this check. Collecting Uniforms for VF=1 does not make any
4615 // sense.
4616
4617 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&(static_cast <bool> (VF.isVector() && Uniforms.
find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF.isVector() && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4618, __extension__
__PRETTY_FUNCTION__))
4618 "This function should not be visited twice for the same VF")(static_cast <bool> (VF.isVector() && Uniforms.
find(VF) == Uniforms.end() && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF.isVector() && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4618, __extension__
__PRETTY_FUNCTION__))
;
4619
4620 // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4621 // not analyze again. Uniforms.count(VF) will return 1.
4622 Uniforms[VF].clear();
4623
4624 // We now know that the loop is vectorizable!
4625 // Collect instructions inside the loop that will remain uniform after
4626 // vectorization.
4627
4628 // Global values, params and instructions outside of current loop are out of
4629 // scope.
4630 auto isOutOfScope = [&](Value *V) -> bool {
4631 Instruction *I = dyn_cast<Instruction>(V);
4632 return (!I || !TheLoop->contains(I));
4633 };
4634
4635 // Worklist containing uniform instructions demanding lane 0.
4636 SetVector<Instruction *> Worklist;
4637 BasicBlock *Latch = TheLoop->getLoopLatch();
4638
4639 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640 // that are scalar with predication must not be considered uniform after
4641 // vectorization, because that would create an erroneous replicating region
4642 // where only a single instance out of VF should be formed.
4643 // TODO: optimize such seldom cases if found important, see PR40816.
4644 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645 if (isOutOfScope(I)) {
4646 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n"; } } while (false)
4647 << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n"; } } while (false)
;
4648 return;
4649 }
4650 if (isScalarWithPredication(I, VF)) {
4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n"; } } while (false)
4652 << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n"; } } while (false)
;
4653 return;
4654 }
4655 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
<< *I << "\n"; } } while (false)
;
4656 Worklist.insert(I);
4657 };
4658
4659 // Start with the conditional branch. If the branch condition is an
4660 // instruction contained in the loop that is only used by the branch, it is
4661 // uniform.
4662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664 addToWorklistIfAllowed(Cmp);
4665
4666 // Return true if all lanes perform the same memory operation, and we can
4667 // thus chose to execute only one.
4668 auto isUniformMemOpUse = [&](Instruction *I) {
4669 if (!Legal->isUniformMemOp(*I))
4670 return false;
4671 if (isa<LoadInst>(I))
4672 // Loading the same address always produces the same result - at least
4673 // assuming aliasing and ordering which have already been checked.
4674 return true;
4675 // Storing the same value on every iteration.
4676 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677 };
4678
4679 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680 InstWidening WideningDecision = getWideningDecision(I, VF);
4681 assert(WideningDecision != CM_Unknown &&(static_cast <bool> (WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4682, __extension__
__PRETTY_FUNCTION__))
4682 "Widening decision should be ready at this moment")(static_cast <bool> (WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4682, __extension__
__PRETTY_FUNCTION__))
;
4683
4684 if (isUniformMemOpUse(I))
4685 return true;
4686
4687 return (WideningDecision == CM_Widen ||
4688 WideningDecision == CM_Widen_Reverse ||
4689 WideningDecision == CM_Interleave);
4690 };
4691
4692
4693 // Returns true if Ptr is the pointer operand of a memory access instruction
4694 // I, and I is known to not require scalarization.
4695 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4697 };
4698
4699 // Holds a list of values which are known to have at least one uniform use.
4700 // Note that there may be other uses which aren't uniform. A "uniform use"
4701 // here is something which only demands lane 0 of the unrolled iterations;
4702 // it does not imply that all lanes produce the same value (e.g. this is not
4703 // the usual meaning of uniform)
4704 SetVector<Value *> HasUniformUse;
4705
4706 // Scan the loop for instructions which are either a) known to have only
4707 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4708 for (auto *BB : TheLoop->blocks())
4709 for (auto &I : *BB) {
4710 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4711 switch (II->getIntrinsicID()) {
4712 case Intrinsic::sideeffect:
4713 case Intrinsic::experimental_noalias_scope_decl:
4714 case Intrinsic::assume:
4715 case Intrinsic::lifetime_start:
4716 case Intrinsic::lifetime_end:
4717 if (TheLoop->hasLoopInvariantOperands(&I))
4718 addToWorklistIfAllowed(&I);
4719 break;
4720 default:
4721 break;
4722 }
4723 }
4724
4725 // ExtractValue instructions must be uniform, because the operands are
4726 // known to be loop-invariant.
4727 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4728 assert(isOutOfScope(EVI->getAggregateOperand()) &&(static_cast <bool> (isOutOfScope(EVI->getAggregateOperand
()) && "Expected aggregate value to be loop invariant"
) ? void (0) : __assert_fail ("isOutOfScope(EVI->getAggregateOperand()) && \"Expected aggregate value to be loop invariant\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4729, __extension__
__PRETTY_FUNCTION__))
4729 "Expected aggregate value to be loop invariant")(static_cast <bool> (isOutOfScope(EVI->getAggregateOperand
()) && "Expected aggregate value to be loop invariant"
) ? void (0) : __assert_fail ("isOutOfScope(EVI->getAggregateOperand()) && \"Expected aggregate value to be loop invariant\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4729, __extension__
__PRETTY_FUNCTION__))
;
4730 addToWorklistIfAllowed(EVI);
4731 continue;
4732 }
4733
4734 // If there's no pointer operand, there's nothing to do.
4735 auto *Ptr = getLoadStorePointerOperand(&I);
4736 if (!Ptr)
4737 continue;
4738
4739 if (isUniformMemOpUse(&I))
4740 addToWorklistIfAllowed(&I);
4741
4742 if (isUniformDecision(&I, VF)) {
4743 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check")(static_cast <bool> (isVectorizedMemAccessUse(&I, Ptr
) && "consistency check") ? void (0) : __assert_fail (
"isVectorizedMemAccessUse(&I, Ptr) && \"consistency check\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4743, __extension__
__PRETTY_FUNCTION__))
;
4744 HasUniformUse.insert(Ptr);
4745 }
4746 }
4747
4748 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4749 // demanding) users. Since loops are assumed to be in LCSSA form, this
4750 // disallows uses outside the loop as well.
4751 for (auto *V : HasUniformUse) {
4752 if (isOutOfScope(V))
4753 continue;
4754 auto *I = cast<Instruction>(V);
4755 auto UsersAreMemAccesses =
4756 llvm::all_of(I->users(), [&](User *U) -> bool {
4757 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4758 });
4759 if (UsersAreMemAccesses)
4760 addToWorklistIfAllowed(I);
4761 }
4762
4763 // Expand Worklist in topological order: whenever a new instruction
4764 // is added , its users should be already inside Worklist. It ensures
4765 // a uniform instruction will only be used by uniform instructions.
4766 unsigned idx = 0;
4767 while (idx != Worklist.size()) {
4768 Instruction *I = Worklist[idx++];
4769
4770 for (auto *OV : I->operand_values()) {
4771 // isOutOfScope operands cannot be uniform instructions.
4772 if (isOutOfScope(OV))
4773 continue;
4774 // First order recurrence Phi's should typically be considered
4775 // non-uniform.
4776 auto *OP = dyn_cast<PHINode>(OV);
4777 if (OP && Legal->isFixedOrderRecurrence(OP))
4778 continue;
4779 // If all the users of the operand are uniform, then add the
4780 // operand into the uniform worklist.
4781 auto *OI = cast<Instruction>(OV);
4782 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4783 auto *J = cast<Instruction>(U);
4784 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4785 }))
4786 addToWorklistIfAllowed(OI);
4787 }
4788 }
4789
4790 // For an instruction to be added into Worklist above, all its users inside
4791 // the loop should also be in Worklist. However, this condition cannot be
4792 // true for phi nodes that form a cyclic dependence. We must process phi
4793 // nodes separately. An induction variable will remain uniform if all users
4794 // of the induction variable and induction variable update remain uniform.
4795 // The code below handles both pointer and non-pointer induction variables.
4796 for (const auto &Induction : Legal->getInductionVars()) {
4797 auto *Ind = Induction.first;
4798 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4799
4800 // Determine if all users of the induction variable are uniform after
4801 // vectorization.
4802 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4803 auto *I = cast<Instruction>(U);
4804 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4805 isVectorizedMemAccessUse(I, Ind);
4806 });
4807 if (!UniformInd)
4808 continue;
4809
4810 // Determine if all users of the induction variable update instruction are
4811 // uniform after vectorization.
4812 auto UniformIndUpdate =
4813 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4814 auto *I = cast<Instruction>(U);
4815 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4816 isVectorizedMemAccessUse(I, IndUpdate);
4817 });
4818 if (!UniformIndUpdate)
4819 continue;
4820
4821 // The induction variable and its update instruction will remain uniform.
4822 addToWorklistIfAllowed(Ind);
4823 addToWorklistIfAllowed(IndUpdate);
4824 }
4825
4826 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4827}
4828
4829bool LoopVectorizationCostModel::runtimeChecksRequired() {
4830 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Performing code size checks.\n"
; } } while (false)
;
4831
4832 if (Legal->getRuntimePointerChecking()->Need) {
4833 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4834 "runtime pointer checks needed. Enable vectorization of this "
4835 "loop with '#pragma clang loop vectorize(enable)' when "
4836 "compiling with -Os/-Oz",
4837 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4838 return true;
4839 }
4840
4841 if (!PSE.getPredicate().isAlwaysTrue()) {
4842 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4843 "runtime SCEV checks needed. Enable vectorization of this "
4844 "loop with '#pragma clang loop vectorize(enable)' when "
4845 "compiling with -Os/-Oz",
4846 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4847 return true;
4848 }
4849
4850 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4851 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4852 reportVectorizationFailure("Runtime stride check for small trip count",
4853 "runtime stride == 1 checks needed. Enable vectorization of "
4854 "this loop without such check by compiling with -Os/-Oz",
4855 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4856 return true;
4857 }
4858
4859 return false;
4860}
4861
4862ElementCount
4863LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4864 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4865 return ElementCount::getScalable(0);
4866
4867 if (Hints->isScalableVectorizationDisabled()) {
4868 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4869 "ScalableVectorizationDisabled", ORE, TheLoop);
4870 return ElementCount::getScalable(0);
4871 }
4872
4873 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalable vectorization is available\n"
; } } while (false)
;
4874
4875 auto MaxScalableVF = ElementCount::getScalable(
4876 std::numeric_limits<ElementCount::ScalarTy>::max());
4877
4878 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4879 // FIXME: While for scalable vectors this is currently sufficient, this should
4880 // be replaced by a more detailed mechanism that filters out specific VFs,
4881 // instead of invalidating vectorization for a whole set of VFs based on the
4882 // MaxVF.
4883
4884 // Disable scalable vectorization if the loop contains unsupported reductions.
4885 if (!canVectorizeReductions(MaxScalableVF)) {
4886 reportVectorizationInfo(
4887 "Scalable vectorization not supported for the reduction "
4888 "operations found in this loop.",
4889 "ScalableVFUnfeasible", ORE, TheLoop);
4890 return ElementCount::getScalable(0);
4891 }
4892
4893 // Disable scalable vectorization if the loop contains any instructions
4894 // with element types not supported for scalable vectors.
4895 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4896 return !Ty->isVoidTy() &&
4897 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4898 })) {
4899 reportVectorizationInfo("Scalable vectorization is not supported "
4900 "for all element types found in this loop.",
4901 "ScalableVFUnfeasible", ORE, TheLoop);
4902 return ElementCount::getScalable(0);
4903 }
4904
4905 if (Legal->isSafeForAnyVectorWidth())
4906 return MaxScalableVF;
4907
4908 // Limit MaxScalableVF by the maximum safe dependence distance.
4909 std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4910 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4911 MaxVScale =
4912 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4913 MaxScalableVF =
4914 ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4915 if (!MaxScalableVF)
4916 reportVectorizationInfo(
4917 "Max legal vector width too small, scalable vectorization "
4918 "unfeasible.",
4919 "ScalableVFUnfeasible", ORE, TheLoop);
4920
4921 return MaxScalableVF;
4922}
4923
4924FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4925 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4926 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4927 unsigned SmallestType, WidestType;
4928 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4929
4930 // Get the maximum safe dependence distance in bits computed by LAA.
4931 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4932 // the memory accesses that is most restrictive (involved in the smallest
4933 // dependence distance).
4934 unsigned MaxSafeElements =
4935 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4936
4937 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4938 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4939
4940 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The max safe fixed VF is: "
<< MaxSafeFixedVF << ".\n"; } } while (false)
4941 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The max safe fixed VF is: "
<< MaxSafeFixedVF << ".\n"; } } while (false)
;
4942 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The max safe scalable VF is: "
<< MaxSafeScalableVF << ".\n"; } } while (false)
4943 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The max safe scalable VF is: "
<< MaxSafeScalableVF << ".\n"; } } while (false)
;
4944
4945 // First analyze the UserVF, fall back if the UserVF should be ignored.
4946 if (UserVF) {
4947 auto MaxSafeUserVF =
4948 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4949
4950 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4951 // If `VF=vscale x N` is safe, then so is `VF=N`
4952 if (UserVF.isScalable())
4953 return FixedScalableVFPair(
4954 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4955 else
4956 return UserVF;
4957 }
4958
4959 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF))(static_cast <bool> (ElementCount::isKnownGT(UserVF, MaxSafeUserVF
)) ? void (0) : __assert_fail ("ElementCount::isKnownGT(UserVF, MaxSafeUserVF)"
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 4959, __extension__
__PRETTY_FUNCTION__))
;
4960
4961 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4962 // is better to ignore the hint and let the compiler choose a suitable VF.
4963 if (!UserVF.isScalable()) {
4964 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is unsafe, clamping to max safe VF=" <<
MaxSafeFixedVF << ".\n"; } } while (false)
4965 << " is unsafe, clamping to max safe VF="do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is unsafe, clamping to max safe VF=" <<
MaxSafeFixedVF << ".\n"; } } while (false)
4966 << MaxSafeFixedVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is unsafe, clamping to max safe VF=" <<
MaxSafeFixedVF << ".\n"; } } while (false)
;
4967 ORE->emit([&]() {
4968 return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationFactor",
4969 TheLoop->getStartLoc(),
4970 TheLoop->getHeader())
4971 << "User-specified vectorization factor "
4972 << ore::NV("UserVectorizationFactor", UserVF)
4973 << " is unsafe, clamping to maximum safe vectorization factor "
4974 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4975 });
4976 return MaxSafeFixedVF;
4977 }
4978
4979 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4980 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is ignored because scalable vectors are not "
"available.\n"; } } while (false)
4981 << " is ignored because scalable vectors are not "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is ignored because scalable vectors are not "
"available.\n"; } } while (false)
4982 "available.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is ignored because scalable vectors are not "
"available.\n"; } } while (false)
;
4983 ORE->emit([&]() {
4984 return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationFactor",
4985 TheLoop->getStartLoc(),
4986 TheLoop->getHeader())
4987 << "User-specified vectorization factor "
4988 << ore::NV("UserVectorizationFactor", UserVF)
4989 << " is ignored because the target does not support scalable "
4990 "vectors. The compiler will pick a more suitable value.";
4991 });
4992 } else {
4993 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is unsafe. Ignoring scalable UserVF.\n"; }
} while (false)
4994 << " is unsafe. Ignoring scalable UserVF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: User VF=" <<
UserVF << " is unsafe. Ignoring scalable UserVF.\n"; }
} while (false)
;
4995 ORE->emit([&]() {
4996 return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationFactor",
4997 TheLoop->getStartLoc(),
4998 TheLoop->getHeader())
4999 << "User-specified vectorization factor "
5000 << ore::NV("UserVectorizationFactor", UserVF)
5001 << " is unsafe. Ignoring the hint to let the compiler pick a "
5002 "more suitable value.";
5003 });
5004 }
5005 }
5006
5007 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestTypedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
<< SmallestType << " / " << WidestType <<
" bits.\n"; } } while (false)
5008 << " / " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
<< SmallestType << " / " << WidestType <<
" bits.\n"; } } while (false)
;
5009
5010 FixedScalableVFPair Result(ElementCount::getFixed(1),
5011 ElementCount::getScalable(0));
5012 if (auto MaxVF =
5013 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5014 MaxSafeFixedVF, FoldTailByMasking))
5015 Result.FixedVF = MaxVF;
5016
5017 if (auto MaxVF =
5018 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5019 MaxSafeScalableVF, FoldTailByMasking))
5020 if (MaxVF.isScalable()) {
5021 Result.ScalableVF = MaxVF;
5022 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found feasible scalable VF = "
<< MaxVF << "\n"; } } while (false)
5023 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found feasible scalable VF = "
<< MaxVF << "\n"; } } while (false)
;
5024 }
5025
5026 return Result;
5027}
5028
5029FixedScalableVFPair
5030LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5031 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5032 // TODO: It may by useful to do since it's still likely to be dynamically
5033 // uniform if the target can skip.
5034 reportVectorizationFailure(
5035 "Not inserting runtime ptr check for divergent target",
5036 "runtime pointer checks needed. Not enabled for divergent target",
5037 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5038 return FixedScalableVFPair::getNone();
5039 }
5040
5041 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5042 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found trip count: "
<< TC << '\n'; } } while (false)
;
5043 if (TC == 1) {
5044 reportVectorizationFailure("Single iteration (non) loop",
5045 "loop trip count is one, irrelevant for vectorization",
5046 "SingleIterationLoop", ORE, TheLoop);
5047 return FixedScalableVFPair::getNone();
5048 }
5049
5050 switch (ScalarEpilogueStatus) {
5051 case CM_ScalarEpilogueAllowed:
5052 return computeFeasibleMaxVF(TC, UserVF, false);
5053 case CM_ScalarEpilogueNotAllowedUsePredicate:
5054 [[fallthrough]];
5055 case CM_ScalarEpilogueNotNeededUsePredicate:
5056 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n"; } } while (false)
5057 dbgs() << "LV: vector predicate hint/switch found.\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n"; } } while (false)
5058 << "LV: Not allowing scalar epilogue, creating predicated "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n"; } } while (false)
5059 << "vector loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n"; } } while (false)
;
5060 break;
5061 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5062 // fallthrough as a special case of OptForSize
5063 case CM_ScalarEpilogueNotAllowedOptSize:
5064 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5065 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"
; } } while (false)
5066 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"
; } } while (false)
;
5067 else
5068 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to low trip "
<< "count.\n"; } } while (false)
5069 << "count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to low trip "
<< "count.\n"; } } while (false)
;
5070
5071 // Bail if runtime checks are required, which are not good when optimising
5072 // for size.
5073 if (runtimeChecksRequired())
5074 return FixedScalableVFPair::getNone();
5075
5076 break;
5077 }
5078
5079 // The only loops we can vectorize without a scalar epilogue, are loops with
5080 // a bottom-test and a single exiting block. We'd have to handle the fact
5081 // that not every instruction executes on the last iteration. This will
5082 // require a lane mask which varies through the vector loop body. (TODO)
5083 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5084 // If there was a tail-folding hint/switch, but we can't fold the tail by
5085 // masking, fallback to a vectorization with a scalar epilogue.
5086 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5087 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
5088 "scalar epilogue instead.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
;
5089 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5090 return computeFeasibleMaxVF(TC, UserVF, false);
5091 }
5092 return FixedScalableVFPair::getNone();
5093 }
5094
5095 // Now try the tail folding
5096
5097 // Invalidate interleave groups that require an epilogue if we can't mask
5098 // the interleave-group.
5099 if (!useMaskedInterleavedAccesses(TTI)) {
5100 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&(static_cast <bool> (WideningDecisions.empty() &&
Uniforms.empty() && Scalars.empty() && "No decisions should have been taken at this point"
) ? void (0) : __assert_fail ("WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && \"No decisions should have been taken at this point\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5101, __extension__
__PRETTY_FUNCTION__))
5101 "No decisions should have been taken at this point")(static_cast <bool> (WideningDecisions.empty() &&
Uniforms.empty() && Scalars.empty() && "No decisions should have been taken at this point"
) ? void (0) : __assert_fail ("WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && \"No decisions should have been taken at this point\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5101, __extension__
__PRETTY_FUNCTION__))
;
5102 // Note: There is no need to invalidate any cost modeling decisions here, as
5103 // non where taken so far.
5104 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5105 }
5106
5107 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5108 // Avoid tail folding if the trip count is known to be a multiple of any VF
5109 // we chose.
5110 // FIXME: The condition below pessimises the case for fixed-width vectors,
5111 // when scalable VFs are also candidates for vectorization.
5112 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5113 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5114 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&(static_cast <bool> ((UserVF.isNonZero() || isPowerOf2_32
(MaxFixedVF.getFixedValue())) && "MaxFixedVF must be a power of 2"
) ? void (0) : __assert_fail ("(UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && \"MaxFixedVF must be a power of 2\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5115, __extension__
__PRETTY_FUNCTION__))
5115 "MaxFixedVF must be a power of 2")(static_cast <bool> ((UserVF.isNonZero() || isPowerOf2_32
(MaxFixedVF.getFixedValue())) && "MaxFixedVF must be a power of 2"
) ? void (0) : __assert_fail ("(UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && \"MaxFixedVF must be a power of 2\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5115, __extension__
__PRETTY_FUNCTION__))
;
5116 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5117 : MaxFixedVF.getFixedValue();
5118 ScalarEvolution *SE = PSE.getSE();
5119 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5120 const SCEV *ExitCount = SE->getAddExpr(
5121 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5122 const SCEV *Rem = SE->getURemExpr(
5123 SE->applyLoopGuards(ExitCount, TheLoop),
5124 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5125 if (Rem->isZero()) {
5126 // Accept MaxFixedVF if we do not have a tail.
5127 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: No tail will remain for any chosen VF.\n"
; } } while (false)
;
5128 return MaxFactors;
5129 }
5130 }
5131
5132 // If we don't know the precise trip count, or if the trip count that we
5133 // found modulo the vectorization factor is not zero, try to fold the tail
5134 // by masking.
5135 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5136 if (Legal->prepareToFoldTailByMasking()) {
5137 FoldTailByMasking = true;
5138 return MaxFactors;
5139 }
5140
5141 // If there was a tail-folding hint/switch, but we can't fold the tail by
5142 // masking, fallback to a vectorization with a scalar epilogue.
5143 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5144 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
5145 "scalar epilogue instead.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n"; } } while (false)
;
5146 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5147 return MaxFactors;
5148 }
5149
5150 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5151 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"
; } } while (false)
;
5152 return FixedScalableVFPair::getNone();
5153 }
5154
5155 if (TC == 0) {
5156 reportVectorizationFailure(
5157 "Unable to calculate the loop count due to complex control flow",
5158 "unable to calculate the loop count due to complex control flow",
5159 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5160 return FixedScalableVFPair::getNone();
5161 }
5162
5163 reportVectorizationFailure(
5164 "Cannot optimize for size and vectorize at the same time.",
5165 "cannot optimize for size and vectorize at the same time. "
5166 "Enable vectorization of this loop with '#pragma clang loop "
5167 "vectorize(enable)' when compiling with -Os/-Oz",
5168 "NoTailLoopWithOptForSize", ORE, TheLoop);
5169 return FixedScalableVFPair::getNone();
5170}
5171
5172ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5173 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5174 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5175 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5176 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5177 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5178 : TargetTransformInfo::RGK_FixedWidthVector);
5179
5180 // Convenience function to return the minimum of two ElementCounts.
5181 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5182 assert((LHS.isScalable() == RHS.isScalable()) &&(static_cast <bool> ((LHS.isScalable() == RHS.isScalable
()) && "Scalable flags must match") ? void (0) : __assert_fail
("(LHS.isScalable() == RHS.isScalable()) && \"Scalable flags must match\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5183, __extension__
__PRETTY_FUNCTION__))
5183 "Scalable flags must match")(static_cast <bool> ((LHS.isScalable() == RHS.isScalable
()) && "Scalable flags must match") ? void (0) : __assert_fail
("(LHS.isScalable() == RHS.isScalable()) && \"Scalable flags must match\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5183, __extension__
__PRETTY_FUNCTION__))
;
5184 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5185 };
5186
5187 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5188 // Note that both WidestRegister and WidestType may not be a powers of 2.
5189 auto MaxVectorElementCount = ElementCount::get(
5190 PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5191 ComputeScalableMaxVF);
5192 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5193 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
<< (MaxVectorElementCount * WidestType) << " bits.\n"
; } } while (false)
5194 << (MaxVectorElementCount * WidestType) << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
<< (MaxVectorElementCount * WidestType) << " bits.\n"
; } } while (false)
;
5195
5196 if (!MaxVectorElementCount) {
5197 LLVM_DEBUG(dbgs() << "LV: The target has no "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
5198 << (ComputeScalableMaxVF ? "scalable" : "fixed")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
5199 << " vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no "
<< (ComputeScalableMaxVF ? "scalable" : "fixed") <<
" vector registers.\n"; } } while (false)
;
5200 return ElementCount::getFixed(1);
5201 }
5202
5203 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5204 if (MaxVectorElementCount.isScalable() &&
5205 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5206 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5207 auto Min = Attr.getVScaleRangeMin();
5208 WidestRegisterMinEC *= Min;
5209 }
5210 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5211 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5212 // If loop trip count (TC) is known at compile time there is no point in
5213 // choosing VF greater than TC (as done in the loop below). Select maximum
5214 // power of two which doesn't exceed TC.
5215 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5216 // when the TC is less than or equal to the known number of lanes.
5217 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5218 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
5219 "exceeding the constant trip count: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
5220 << ClampedConstTripCount << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: " << ClampedConstTripCount
<< "\n"; } } while (false)
;
5221 return ElementCount::getFixed(ClampedConstTripCount);
5222 }
5223
5224 TargetTransformInfo::RegisterKind RegKind =
5225 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5226 : TargetTransformInfo::RGK_FixedWidthVector;
5227 ElementCount MaxVF = MaxVectorElementCount;
5228 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5229 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5230 auto MaxVectorElementCountMaxBW = ElementCount::get(
5231 PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5232 ComputeScalableMaxVF);
5233 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5234
5235 // Collect all viable vectorization factors larger than the default MaxVF
5236 // (i.e. MaxVectorElementCount).
5237 SmallVector<ElementCount, 8> VFs;
5238 for (ElementCount VS = MaxVectorElementCount * 2;
5239 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5240 VFs.push_back(VS);
5241
5242 // For each VF calculate its register usage.
5243 auto RUs = calculateRegisterUsage(VFs);
5244
5245 // Select the largest VF which doesn't require more registers than existing
5246 // ones.
5247 for (int i = RUs.size() - 1; i >= 0; --i) {
5248 bool Selected = true;
5249 for (auto &pair : RUs[i].MaxLocalUsers) {
5250 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5251 if (pair.second > TargetNumRegisters)
5252 Selected = false;
5253 }
5254 if (Selected) {
5255 MaxVF = VFs[i];
5256 break;
5257 }
5258 }
5259 if (ElementCount MinVF =
5260 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5261 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5262 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
<< MaxVF << ") with target's minimum: " <<
MinVF << '\n'; } } while (false)
5263 << ") with target's minimum: " << MinVF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
<< MaxVF << ") with target's minimum: " <<
MinVF << '\n'; } } while (false)
;
5264 MaxVF = MinVF;
5265 }
5266 }
5267
5268 // Invalidate any widening decisions we might have made, in case the loop
5269 // requires prediction (decided later), but we have already made some
5270 // load/store widening decisions.
5271 invalidateCostModelingDecisions();
5272 }
5273 return MaxVF;
5274}
5275
5276std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5277 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5278 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5279 auto Min = Attr.getVScaleRangeMin();
5280 auto Max = Attr.getVScaleRangeMax();
5281 if (Max && Min == Max)
5282 return Max;
5283 }
5284
5285 return TTI.getVScaleForTuning();
5286}
5287
5288bool LoopVectorizationCostModel::isMoreProfitable(
5289 const VectorizationFactor &A, const VectorizationFactor &B) const {
5290 InstructionCost CostA = A.Cost;
5291 InstructionCost CostB = B.Cost;
5292
5293 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5294
5295 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5296 MaxTripCount) {
5297 // If we are folding the tail and the trip count is a known (possibly small)
5298 // constant, the trip count will be rounded up to an integer number of
5299 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5300 // which we compare directly. When not folding the tail, the total cost will
5301 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5302 // approximated with the per-lane cost below instead of using the tripcount
5303 // as here.
5304 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5305 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5306 return RTCostA < RTCostB;
5307 }
5308
5309 // Improve estimate for the vector width if it is scalable.
5310 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5311 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5312 if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5313 if (A.Width.isScalable())
5314 EstimatedWidthA *= *VScale;
5315 if (B.Width.isScalable())
5316 EstimatedWidthB *= *VScale;
5317 }
5318
5319 // Assume vscale may be larger than 1 (or the value being tuned for),
5320 // so that scalable vectorization is slightly favorable over fixed-width
5321 // vectorization.
5322 if (A.Width.isScalable() && !B.Width.isScalable())
5323 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5324
5325 // To avoid the need for FP division:
5326 // (CostA / A.Width) < (CostB / B.Width)
5327 // <=> (CostA * B.Width) < (CostB * A.Width)
5328 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5329}
5330
5331VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5332 const ElementCountSet &VFCandidates) {
5333 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5334 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: "
<< ExpectedCost << ".\n"; } } while (false)
;
5335 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop")(static_cast <bool> (ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"
) ? void (0) : __assert_fail ("ExpectedCost.isValid() && \"Unexpected invalid cost for scalar loop\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5335, __extension__
__PRETTY_FUNCTION__))
;
5336 assert(VFCandidates.count(ElementCount::getFixed(1)) &&(static_cast <bool> (VFCandidates.count(ElementCount::getFixed
(1)) && "Expected Scalar VF to be a candidate") ? void
(0) : __assert_fail ("VFCandidates.count(ElementCount::getFixed(1)) && \"Expected Scalar VF to be a candidate\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5337, __extension__
__PRETTY_FUNCTION__))
5337 "Expected Scalar VF to be a candidate")(static_cast <bool> (VFCandidates.count(ElementCount::getFixed
(1)) && "Expected Scalar VF to be a candidate") ? void
(0) : __assert_fail ("VFCandidates.count(ElementCount::getFixed(1)) && \"Expected Scalar VF to be a candidate\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5337, __extension__
__PRETTY_FUNCTION__))
;
5338
5339 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5340 ExpectedCost);
5341 VectorizationFactor ChosenFactor = ScalarCost;
5342
5343 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5344 if (ForceVectorization && VFCandidates.size() > 1) {
5345 // Ignore scalar width, because the user explicitly wants vectorization.
5346 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5347 // evaluation.
5348 ChosenFactor.Cost = InstructionCost::getMax();
5349 }
5350
5351 SmallVector<InstructionVFPair> InvalidCosts;
5352 for (const auto &i : VFCandidates) {
5353 // The cost for scalar VF=1 is already calculated, so ignore it.
5354 if (i.isScalar())
5355 continue;
5356
5357 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5358 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5359
5360#ifndef NDEBUG
5361 unsigned AssumedMinimumVscale = 1;
5362 if (std::optional<unsigned> VScale = getVScaleForTuning())
5363 AssumedMinimumVscale = *VScale;
5364 unsigned Width =
5365 Candidate.Width.isScalable()
5366 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5367 : Candidate.Width.getFixedValue();
5368 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
<< i << " costs: " << (Candidate.Cost / Width
); } } while (false)
5369 << " costs: " << (Candidate.Cost / Width))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
<< i << " costs: " << (Candidate.Cost / Width
); } } while (false)
;
5370 if (i.isScalable())
5371 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")"; } } while (false
)
5372 << AssumedMinimumVscale << ")")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")"; } } while (false
)
;
5373 LLVM_DEBUG(dbgs() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << ".\n"; } } while (false
)
;
5374#endif
5375
5376 if (!C.second && !ForceVectorization) {
5377 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
<< i << " because it will not generate any vector instructions.\n"
; } } while (false)
5378 dbgs() << "LV: Not considering vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
<< i << " because it will not generate any vector instructions.\n"
; } } while (false)
5379 << " because it will not generate any vector instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
<< i << " because it will not generate any vector instructions.\n"
; } } while (false)
;
5380 continue;
5381 }
5382
5383 // If profitable add it to ProfitableVF list.
5384 if (isMoreProfitable(Candidate, ScalarCost))
5385 ProfitableVFs.push_back(Candidate);
5386
5387 if (isMoreProfitable(Candidate, ChosenFactor))
5388 ChosenFactor = Candidate;
5389 }
5390
5391 // Emit a report of VFs with invalid costs in the loop.
5392 if (!InvalidCosts.empty()) {
5393 // Group the remarks per instruction, keeping the instruction order from
5394 // InvalidCosts.
5395 std::map<Instruction *, unsigned> Numbering;
5396 unsigned I = 0;
5397 for (auto &Pair : InvalidCosts)
5398 if (!Numbering.count(Pair.first))
5399 Numbering[Pair.first] = I++;
5400
5401 // Sort the list, first on instruction(number) then on VF.
5402 llvm::sort(InvalidCosts,
5403 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5404 if (Numbering[A.first] != Numbering[B.first])
5405 return Numbering[A.first] < Numbering[B.first];
5406 ElementCountComparator ECC;
5407 return ECC(A.second, B.second);
5408 });
5409
5410 // For a list of ordered instruction-vf pairs:
5411 // [(load, vf1), (load, vf2), (store, vf1)]
5412 // Group the instructions together to emit separate remarks for:
5413 // load (vf1, vf2)
5414 // store (vf1)
5415 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5416 auto Subset = ArrayRef<InstructionVFPair>();
5417 do {
5418 if (Subset.empty())
5419 Subset = Tail.take_front(1);
5420
5421 Instruction *I = Subset.front().first;
5422
5423 // If the next instruction is different, or if there are no other pairs,
5424 // emit a remark for the collated subset. e.g.
5425 // [(load, vf1), (load, vf2))]
5426 // to emit:
5427 // remark: invalid costs for 'load' at VF=(vf, vf2)
5428 if (Subset == Tail || Tail[Subset.size()].first != I) {
5429 std::string OutString;
5430 raw_string_ostream OS(OutString);
5431 assert(!Subset.empty() && "Unexpected empty range")(static_cast <bool> (!Subset.empty() && "Unexpected empty range"
) ? void (0) : __assert_fail ("!Subset.empty() && \"Unexpected empty range\""
, "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp", 5431, __extension__
__PRETTY_FUNCTION__))
;
5432 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5433 for (const auto &Pair : Subset)
5434 OS << (Pair.second == Subset.front().second ? "" : ", ")
5435 << Pair.second;
5436 OS << "):";
5437 if (auto *CI = dyn_cast<CallInst>(I))
5438 OS << " call to " << CI->getCalledFunction()->getName();
5439 else
5440 OS << " " << I->getOpcodeName();
5441 OS.flush();
5442 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5443 Tail = Tail.drop_front(Subset.size());
5444 Subset = {};
5445 } else
5446 // Grow the subset by one element
5447 Subset = Tail.take_front(Subset.size() + 1);
5448 } while (!Tail.empty());