Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8743, column 35
Potential leak of memory pointed to by 'BlockMask'
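
The flagged allocation at line 8743, column 35 is outside this excerpt. As a rough illustration of the pattern behind this class of diagnostic (a heap allocation whose pointer is lost on some path out of the function), here is a minimal, hypothetical C++ sketch; the names and structure are illustrative only and are not the code at LoopVectorize.cpp:8743:

    // Hypothetical illustration of a "potential leak" diagnostic; not LLVM code.
    #include <cstdio>

    struct Mask { unsigned Bits = 0; };

    static Mask *createBlockMask(bool Bail) {
      Mask *BlockMask = new Mask(); // allocation the analyzer starts tracking
      if (Bail)
        return nullptr;             // early return: 'BlockMask' is leaked here
      return BlockMask;             // otherwise ownership passes to the caller
    }

    int main() {
      Mask *M = createBlockMask(false);
      std::printf("%u\n", M ? M->Bits : 0u);
      delete M;                     // the caller is responsible for releasing it
    }

Typically, a leak report like this traces a path through the annotated source and ends at the statement where the last reference to the allocated memory is lost.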

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-08-28-193554-24367-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
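
The command above is the exact clang -cc1 invocation the analyzer recorded for this translation unit; the -analyze and -analyzer-checker=... options show which checkers ran. For orientation only, reports of this kind are usually produced by wrapping the normal build with scan-build. A minimal, hypothetical invocation (output path and build command are illustrative, not taken from this report) might look like:

    scan-build -o /tmp/scan-results make -C build-llvm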

/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/LLVMContext.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/PatternMatch.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/InitializePasses.h"
127#include "llvm/Pass.h"
128#include "llvm/Support/Casting.h"
129#include "llvm/Support/CommandLine.h"
130#include "llvm/Support/Compiler.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/InjectTLIMappings.h"
138#include "llvm/Transforms/Utils/LoopSimplify.h"
139#include "llvm/Transforms/Utils/LoopUtils.h"
140#include "llvm/Transforms/Utils/LoopVersioning.h"
141#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142#include "llvm/Transforms/Utils/SizeOpts.h"
143#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144#include <algorithm>
145#include <cassert>
146#include <cstdint>
147#include <cstdlib>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME"loop-vectorize" "loop-vectorize"
159#define DEBUG_TYPE"loop-vectorize" LV_NAME"loop-vectorize"
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
178static cl::opt<bool> EnableEpilogueVectorization(
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
182static cl::opt<unsigned> EpilogueVectorizationForceVF(
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
188static cl::opt<unsigned> EpilogueVectorizationMinVF(
189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
195static cl::opt<unsigned> TinyTripCountVectorThreshold(
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
201static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks with a "
204 "vectorize(enable) pragma."));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and this lists all options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
211namespace PreferPredicateTy {
212 enum Option {
213 ScalarEpilogue = 0,
214 PredicateElseScalarEpilogue,
215 PredicateOrDontVectorize
216 };
217} // namespace PreferPredicateTy
218
219static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220 "prefer-predicate-over-epilogue",
221 cl::init(PreferPredicateTy::ScalarEpilogue),
222 cl::Hidden,
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
237static cl::opt<bool> MaximizeBandwidth(
238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239 cl::desc("Maximize bandwidth when selecting vectorization factor which "
240 "will be determined by the smallest type in loop."));
241
242static cl::opt<bool> EnableInterleavedMemAccesses(
243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245
246/// An interleave-group may need masking if it resides in a block that needs
247/// predication, or in order to mask away gaps.
248static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251
252static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254 cl::desc("We don't interleave loops with a estimated constant trip count "
255 "below this number"));
256
257static cl::opt<unsigned> ForceTargetNumScalarRegs(
258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259 cl::desc("A flag that overrides the target's number of scalar registers."));
260
261static cl::opt<unsigned> ForceTargetNumVectorRegs(
262 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263 cl::desc("A flag that overrides the target's number of vector registers."));
264
265static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's max interleave factor for "
268 "scalar loops."));
269
270static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272 cl::desc("A flag that overrides the target's max interleave factor for "
273 "vectorized loops."));
274
275static cl::opt<unsigned> ForceTargetInstructionCost(
276 "force-target-instruction-cost", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's expected cost for "
278 "an instruction to a single constant value. Mostly "
279 "useful for getting consistent testing."));
280
281static cl::opt<bool> ForceTargetSupportsScalableVectors(
282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283 cl::desc(
284 "Pretend that scalable vectors are supported, even if the target does "
285 "not support them. This flag should only be used for testing."));
286
287static cl::opt<unsigned> SmallLoopCost(
288 "small-loop-cost", cl::init(20), cl::Hidden,
289 cl::desc(
290 "The cost of a loop that is considered 'small' by the interleaver."));
291
292static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294 cl::desc("Enable the use of the block frequency analysis to access PGO "
295 "heuristics minimizing code growth in cold regions and being more "
296 "aggressive in hot regions."));
297
298// Runtime interleave loops for load/store throughput.
299static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301 cl::desc(
302 "Enable runtime interleaving until load/store ports are saturated"));
303
304/// Interleave small loops with scalar reductions.
305static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307 cl::desc("Enable interleaving for loops with small iteration counts that "
308 "contain scalar reductions to expose ILP."));
309
310/// The number of stores in a loop that are allowed to need predication.
311static cl::opt<unsigned> NumberOfStoresToPredicate(
312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313 cl::desc("Max number of stores to be predicated behind an if."));
314
315static cl::opt<bool> EnableIndVarRegisterHeur(
316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317 cl::desc("Count the induction variable only once when interleaving"));
318
319static cl::opt<bool> EnableCondStoresVectorization(
320 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321 cl::desc("Enable if predication of stores during vectorization."));
322
323static cl::opt<unsigned> MaxNestedScalarReductionIC(
324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325 cl::desc("The maximum interleave count to use when interleaving a scalar "
326 "reduction in a nested loop."));
327
328static cl::opt<bool>
329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330 cl::Hidden,
331 cl::desc("Prefer in-loop vector reductions, "
332 "overriding the targets preference."));
333
334static cl::opt<bool> ForceOrderedReductions(
335 "force-ordered-reductions", cl::init(false), cl::Hidden,
336 cl::desc("Enable the vectorisation of loops with in-order (strict) "
337 "FP reductions"));
338
339static cl::opt<bool> PreferPredicatedReductionSelect(
340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341 cl::desc(
342 "Prefer predicating a reduction operation over an after loop select."));
343
344cl::opt<bool> EnableVPlanNativePath(
345 "enable-vplan-native-path", cl::init(false), cl::Hidden,
346 cl::desc("Enable VPlan-native vectorization path with "
347 "support for outer loop vectorization."));
348
349// FIXME: Remove this switch once we have divergence analysis. Currently we
350// assume divergent non-backedge branches when this switch is true.
351cl::opt<bool> EnableVPlanPredication(
352 "enable-vplan-predication", cl::init(false), cl::Hidden,
353 cl::desc("Enable VPlan-native vectorization path predicator with "
354 "support for outer loop vectorization."));
355
356// This flag enables the stress testing of the VPlan H-CFG construction in the
357// VPlan-native vectorization path. It must be used in conjunction with
358// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359// verification of the H-CFGs built.
360static cl::opt<bool> VPlanBuildStressTest(
361 "vplan-build-stress-test", cl::init(false), cl::Hidden,
362 cl::desc(
363 "Build VPlan for every supported loop nest in the function and bail "
364 "out right after the build (stress test the VPlan H-CFG construction "
365 "in the VPlan-native vectorization path)."));
366
367cl::opt<bool> llvm::EnableLoopInterleaving(
368 "interleave-loops", cl::init(true), cl::Hidden,
369 cl::desc("Enable loop interleaving in Loop vectorization passes"));
370cl::opt<bool> llvm::EnableLoopVectorization(
371 "vectorize-loops", cl::init(true), cl::Hidden,
372 cl::desc("Run the Loop vectorization passes"));
373
374cl::opt<bool> PrintVPlansInDotFormat(
375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376 cl::desc("Use dot format instead of plain text when dumping VPlans"));
377
378/// A helper function that returns true if the given type is irregular. The
379/// type is irregular if its allocated size doesn't equal the store size of an
380/// element of the corresponding vector type.
381static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382 // Determine if an array of N elements of type Ty is "bitcast compatible"
383 // with a <N x Ty> vector.
384 // This is only true if there is no padding between the array elements.
385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386}
387
388/// A helper function that returns the reciprocal of the block probability of
389/// predicated blocks. If we return X, we are assuming the predicated block
390/// will execute once for every X iterations of the loop header.
391///
392/// TODO: We should use actual block probability here, if available. Currently,
393/// we always assume predicated blocks have a 50% chance of executing.
394static unsigned getReciprocalPredBlockProb() { return 2; }
395
396/// A helper function that returns an integer or floating-point constant with
397/// value C.
398static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400 : ConstantFP::get(Ty, C);
401}
402
403/// Returns "best known" trip count for the specified loop \p L as defined by
404/// the following procedure:
405/// 1) Returns exact trip count if it is known.
406/// 2) Returns expected trip count according to profile data if any.
407/// 3) Returns upper bound estimate if it is known.
408/// 4) Returns None if all of the above failed.
409static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410 // Check if exact trip count is known.
411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412 return ExpectedTC;
413
414 // Check if there is an expected trip count available from profile data.
415 if (LoopVectorizeWithBlockFrequency)
416 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417 return EstimatedTC;
418
419 // Check if upper bound estimate is known.
420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421 return ExpectedTC;
422
423 return None;
424}
425
426// Forward declare GeneratedRTChecks.
427class GeneratedRTChecks;
428
429namespace llvm {
430
431/// InnerLoopVectorizer vectorizes loops which contain only one basic
432/// block to a specified vectorization factor (VF).
433/// This class performs the widening of scalars into vectors, or multiple
434/// scalars. This class also implements the following features:
435/// * It inserts an epilogue loop for handling loops that don't have iteration
436/// counts that are known to be a multiple of the vectorization factor.
437/// * It handles the code generation for reduction variables.
438/// * Scalarization (implementation using scalars) of un-vectorizable
439/// instructions.
440/// InnerLoopVectorizer does not perform any vectorization-legality
441/// checks, and relies on the caller to check for the different legality
442/// aspects. The InnerLoopVectorizer relies on the
443/// LoopVectorizationLegality class to provide information about the induction
444/// and reduction variables that were found to a given vectorization factor.
445class InnerLoopVectorizer {
446public:
447 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448 LoopInfo *LI, DominatorTree *DT,
449 const TargetLibraryInfo *TLI,
450 const TargetTransformInfo *TTI, AssumptionCache *AC,
451 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
453 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
454 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
455 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
456 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
457 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
458 PSI(PSI), RTChecks(RTChecks) {
459 // Query this against the original loop and save it here because the profile
460 // of the original loop header may change as the transformation happens.
461 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
462 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
463 }
464
465 virtual ~InnerLoopVectorizer() = default;
466
467 /// Create a new empty loop that will contain vectorized instructions later
468 /// on, while the old loop will be used as the scalar remainder. Control flow
469 /// is generated around the vectorized (and scalar epilogue) loops consisting
470 /// of various checks and bypasses. Return the pre-header block of the new
471 /// loop.
472 /// In the case of epilogue vectorization, this function is overridden to
473 /// handle the more complex control flow around the loops.
474 virtual BasicBlock *createVectorizedLoopSkeleton();
475
476 /// Widen a single instruction within the innermost loop.
477 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
478 VPTransformState &State);
479
480 /// Widen a single call instruction within the innermost loop.
481 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
482 VPTransformState &State);
483
484 /// Widen a single select instruction within the innermost loop.
485 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
486 bool InvariantCond, VPTransformState &State);
487
488 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
489 void fixVectorizedLoop(VPTransformState &State);
490
491 // Return true if any runtime check is added.
492 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
493
494 /// A type for vectorized values in the new loop. Each value from the
495 /// original loop, when vectorized, is represented by UF vector values in the
496 /// new unrolled loop, where UF is the unroll factor.
497 using VectorParts = SmallVector<Value *, 2>;
498
499 /// Vectorize a single GetElementPtrInst based on information gathered and
500 /// decisions taken during planning.
501 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
502 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
503 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
504
505 /// Vectorize a single first-order recurrence or pointer induction PHINode in
506 /// a block. This method handles the induction variable canonicalization. It
507 /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
508 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
509 VPTransformState &State);
510
511 /// A helper function to scalarize a single Instruction in the innermost loop.
512 /// Generates a sequence of scalar instances for each lane between \p MinLane
513 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
514 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
515 /// Instr's operands.
516 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
517 const VPIteration &Instance, bool IfPredicateInstr,
518 VPTransformState &State);
519
520 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
521 /// is provided, the integer induction variable will first be truncated to
522 /// the corresponding type.
523 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
524 VPValue *Def, VPValue *CastDef,
525 VPTransformState &State);
526
527 /// Construct the vector value of a scalarized value \p V one lane at a time.
528 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
529 VPTransformState &State);
530
531 /// Try to vectorize interleaved access group \p Group with the base address
532 /// given in \p Addr, optionally masking the vector operations if \p
533 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
534 /// values in the vectorized loop.
535 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
536 ArrayRef<VPValue *> VPDefs,
537 VPTransformState &State, VPValue *Addr,
538 ArrayRef<VPValue *> StoredValues,
539 VPValue *BlockInMask = nullptr);
540
541 /// Vectorize Load and Store instructions with the base address given in \p
542 /// Addr, optionally masking the vector operations if \p BlockInMask is
543 /// non-null. Use \p State to translate given VPValues to IR values in the
544 /// vectorized loop.
545 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
546 VPValue *Def, VPValue *Addr,
547 VPValue *StoredValue, VPValue *BlockInMask);
548
549 /// Set the debug location in the builder \p Ptr using the debug location in
550 /// \p V. If \p Ptr is None then it uses the class member's Builder.
551 void setDebugLocFromInst(const Value *V,
552 Optional<IRBuilder<> *> CustomBuilder = None);
553
554 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
555 void fixNonInductionPHIs(VPTransformState &State);
556
557 /// Returns true if the reordering of FP operations is not allowed, but we are
558 /// able to vectorize with strict in-order reductions for the given RdxDesc.
559 bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
560
561 /// Create a broadcast instruction. This method generates a broadcast
562 /// instruction (shuffle) for loop invariant values and for the induction
563 /// value. If this is the induction variable then we extend it to N, N+1, ...
564 /// this is needed because each iteration in the loop corresponds to a SIMD
565 /// element.
566 virtual Value *getBroadcastInstrs(Value *V);
567
568protected:
569 friend class LoopVectorizationPlanner;
570
571 /// A small list of PHINodes.
572 using PhiVector = SmallVector<PHINode *, 4>;
573
574 /// A type for scalarized values in the new loop. Each value from the
575 /// original loop, when scalarized, is represented by UF x VF scalar values
576 /// in the new unrolled loop, where UF is the unroll factor and VF is the
577 /// vectorization factor.
578 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
579
580 /// Set up the values of the IVs correctly when exiting the vector loop.
581 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
582 Value *CountRoundDown, Value *EndValue,
583 BasicBlock *MiddleBlock);
584
585 /// Create a new induction variable inside L.
586 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
587 Value *Step, Instruction *DL);
588
589 /// Handle all cross-iteration phis in the header.
590 void fixCrossIterationPHIs(VPTransformState &State);
591
592 /// Create the exit value of first order recurrences in the middle block and
593 /// update their users.
594 void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
595
596 /// Create code for the loop exit value of the reduction.
597 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
598
599 /// Clear NSW/NUW flags from reduction instructions if necessary.
600 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
601 VPTransformState &State);
602
603 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
604 /// means we need to add the appropriate incoming value from the middle
605 /// block as exiting edges from the scalar epilogue loop (if present) are
606 /// already in place, and we exit the vector loop exclusively to the middle
607 /// block.
608 void fixLCSSAPHIs(VPTransformState &State);
609
610 /// Iteratively sink the scalarized operands of a predicated instruction into
611 /// the block that was created for it.
612 void sinkScalarOperands(Instruction *PredInst);
613
614 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
615 /// represented as.
616 void truncateToMinimalBitwidths(VPTransformState &State);
617
618 /// This function adds
619 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
620 /// to each vector element of Val. The sequence starts at StartIndex.
621 /// \p Opcode is relevant for FP induction variable.
622 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
623 Instruction::BinaryOps Opcode =
624 Instruction::BinaryOpsEnd);
625
626 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
627 /// variable on which to base the steps, \p Step is the size of the step, and
628 /// \p EntryVal is the value from the original loop that maps to the steps.
629 /// Note that \p EntryVal doesn't have to be an induction variable - it
630 /// can also be a truncate instruction.
631 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
632 const InductionDescriptor &ID, VPValue *Def,
633 VPValue *CastDef, VPTransformState &State);
634
635 /// Create a vector induction phi node based on an existing scalar one. \p
636 /// EntryVal is the value from the original loop that maps to the vector phi
637 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
638 /// truncate instruction, instead of widening the original IV, we widen a
639 /// version of the IV truncated to \p EntryVal's type.
640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
641 Value *Step, Value *Start,
642 Instruction *EntryVal, VPValue *Def,
643 VPValue *CastDef,
644 VPTransformState &State);
645
646 /// Returns true if an instruction \p I should be scalarized instead of
647 /// vectorized for the chosen vectorization factor.
648 bool shouldScalarizeInstruction(Instruction *I) const;
649
650 /// Returns true if we should generate a scalar version of \p IV.
651 bool needsScalarInduction(Instruction *IV) const;
652
653 /// If there is a cast involved in the induction variable \p ID, which should
654 /// be ignored in the vectorized loop body, this function records the
655 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
656 /// cast. We had already proved that the casted Phi is equal to the uncasted
657 /// Phi in the vectorized loop (under a runtime guard), and therefore
658 /// there is no need to vectorize the cast - the same value can be used in the
659 /// vector loop for both the Phi and the cast.
660 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
661 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
662 ///
663 /// \p EntryVal is the value from the original loop that maps to the vector
664 /// phi node and is used to distinguish what is the IV currently being
665 /// processed - original one (if \p EntryVal is a phi corresponding to the
666 /// original IV) or the "newly-created" one based on the proof mentioned above
667 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
668 /// latter case \p EntryVal is a TruncInst and we must not record anything for
669 /// that IV, but it's error-prone to expect callers of this routine to care
670 /// about that, hence this explicit parameter.
671 void recordVectorLoopValueForInductionCast(
672 const InductionDescriptor &ID, const Instruction *EntryVal,
673 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
674 unsigned Part, unsigned Lane = UINT_MAX);
675
676 /// Generate a shuffle sequence that will reverse the vector Vec.
677 virtual Value *reverseVector(Value *Vec);
678
679 /// Returns (and creates if needed) the original loop trip count.
680 Value *getOrCreateTripCount(Loop *NewLoop);
681
682 /// Returns (and creates if needed) the trip count of the widened loop.
683 Value *getOrCreateVectorTripCount(Loop *NewLoop);
684
685 /// Returns a bitcasted value to the requested vector type.
686 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
687 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
688 const DataLayout &DL);
689
690 /// Emit a bypass check to see if the vector trip count is zero, including if
691 /// it overflows.
692 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
693
694 /// Emit a bypass check to see if all of the SCEV assumptions we've
695 /// had to make are correct. Returns the block containing the checks or
696 /// nullptr if no checks have been added.
697 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
698
699 /// Emit bypass checks to check any memory assumptions we may have made.
700 /// Returns the block containing the checks or nullptr if no checks have been
701 /// added.
702 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
703
704 /// Compute the transformed value of Index at offset StartValue using step
705 /// StepValue.
706 /// For integer induction, returns StartValue + Index * StepValue.
707 /// For pointer induction, returns StartValue[Index * StepValue].
708 /// FIXME: The newly created binary instructions should contain nsw/nuw
709 /// flags, which can be found from the original scalar operations.
710 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
711 const DataLayout &DL,
712 const InductionDescriptor &ID) const;
713
714 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
715 /// vector loop preheader, middle block and scalar preheader. Also
716 /// allocate a loop object for the new vector loop and return it.
717 Loop *createVectorLoopSkeleton(StringRef Prefix);
718
719 /// Create new phi nodes for the induction variables to resume iteration count
720 /// in the scalar epilogue, from where the vectorized loop left off (given by
721 /// \p VectorTripCount).
722 /// In cases where the loop skeleton is more complicated (eg. epilogue
723 /// vectorization) and the resume values can come from an additional bypass
724 /// block, the \p AdditionalBypass pair provides information about the bypass
725 /// block and the end value on the edge from bypass to this loop.
726 void createInductionResumeValues(
727 Loop *L, Value *VectorTripCount,
728 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
729
730 /// Complete the loop skeleton by adding debug MDs, creating appropriate
731 /// conditional branches in the middle block, preparing the builder and
732 /// running the verifier. Take in the vector loop \p L as argument, and return
733 /// the preheader of the completed vector loop.
734 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
735
736 /// Add additional metadata to \p To that was not present on \p Orig.
737 ///
738 /// Currently this is used to add the noalias annotations based on the
739 /// inserted memchecks. Use this for instructions that are *cloned* into the
740 /// vector loop.
741 void addNewMetadata(Instruction *To, const Instruction *Orig);
742
743 /// Add metadata from one instruction to another.
744 ///
745 /// This includes both the original MDs from \p From and additional ones (\see
746 /// addNewMetadata). Use this for *newly created* instructions in the vector
747 /// loop.
748 void addMetadata(Instruction *To, Instruction *From);
749
750 /// Similar to the previous function but it adds the metadata to a
751 /// vector of instructions.
752 void addMetadata(ArrayRef<Value *> To, Instruction *From);
753
754 /// Allow subclasses to override and print debug traces before/after vplan
755 /// execution, when trace information is requested.
756 virtual void printDebugTracesAtStart(){};
757 virtual void printDebugTracesAtEnd(){};
758
759 /// The original loop.
760 Loop *OrigLoop;
761
762 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
763 /// dynamic knowledge to simplify SCEV expressions and converts them to a
764 /// more usable form.
765 PredicatedScalarEvolution &PSE;
766
767 /// Loop Info.
768 LoopInfo *LI;
769
770 /// Dominator Tree.
771 DominatorTree *DT;
772
773 /// Alias Analysis.
774 AAResults *AA;
775
776 /// Target Library Info.
777 const TargetLibraryInfo *TLI;
778
779 /// Target Transform Info.
780 const TargetTransformInfo *TTI;
781
782 /// Assumption Cache.
783 AssumptionCache *AC;
784
785 /// Interface to emit optimization remarks.
786 OptimizationRemarkEmitter *ORE;
787
788 /// LoopVersioning. It's only set up (non-null) if memchecks were
789 /// used.
790 ///
791 /// This is currently only used to add no-alias metadata based on the
792 /// memchecks. The actually versioning is performed manually.
793 std::unique_ptr<LoopVersioning> LVer;
794
795 /// The vectorization SIMD factor to use. Each vector will have this many
796 /// vector elements.
797 ElementCount VF;
798
799 /// The vectorization unroll factor to use. Each scalar is vectorized to this
800 /// many different vector instructions.
801 unsigned UF;
802
803 /// The builder that we use
804 IRBuilder<> Builder;
805
806 // --- Vectorization state ---
807
808 /// The vector-loop preheader.
809 BasicBlock *LoopVectorPreHeader;
810
811 /// The scalar-loop preheader.
812 BasicBlock *LoopScalarPreHeader;
813
814 /// Middle Block between the vector and the scalar.
815 BasicBlock *LoopMiddleBlock;
816
817 /// The unique ExitBlock of the scalar loop if one exists. Note that
818 /// there can be multiple exiting edges reaching this block.
819 BasicBlock *LoopExitBlock;
820
821 /// The vector loop body.
822 BasicBlock *LoopVectorBody;
823
824 /// The scalar loop body.
825 BasicBlock *LoopScalarBody;
826
827 /// A list of all bypass blocks. The first block is the entry of the loop.
828 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
829
830 /// The new Induction variable which was added to the new block.
831 PHINode *Induction = nullptr;
832
833 /// The induction variable of the old basic block.
834 PHINode *OldInduction = nullptr;
835
836 /// Store instructions that were predicated.
837 SmallVector<Instruction *, 4> PredicatedInstructions;
838
839 /// Trip count of the original loop.
840 Value *TripCount = nullptr;
841
842 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
843 Value *VectorTripCount = nullptr;
844
845 /// The legality analysis.
846 LoopVectorizationLegality *Legal;
847
848 /// The profitability analysis.
849 LoopVectorizationCostModel *Cost;
850
851 // Record whether runtime checks are added.
852 bool AddedSafetyChecks = false;
853
854 // Holds the end values for each induction variable. We save the end values
855 // so we can later fix-up the external users of the induction variables.
856 DenseMap<PHINode *, Value *> IVEndValues;
857
858 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
859 // fixed up at the end of vector code generation.
860 SmallVector<PHINode *, 8> OrigPHIsToFix;
861
862 /// BFI and PSI are used to check for profile guided size optimizations.
863 BlockFrequencyInfo *BFI;
864 ProfileSummaryInfo *PSI;
865
866 // Whether this loop should be optimized for size based on profile guided size
867 // optimizations.
868 bool OptForSizeBasedOnProfile;
869
870 /// Structure to hold information about generated runtime checks, responsible
871 /// for cleaning the checks, if vectorization turns out unprofitable.
872 GeneratedRTChecks &RTChecks;
873};
874
875class InnerLoopUnroller : public InnerLoopVectorizer {
876public:
877 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
878 LoopInfo *LI, DominatorTree *DT,
879 const TargetLibraryInfo *TLI,
880 const TargetTransformInfo *TTI, AssumptionCache *AC,
881 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
882 LoopVectorizationLegality *LVL,
883 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
884 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
885 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
886 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
887 BFI, PSI, Check) {}
888
889private:
890 Value *getBroadcastInstrs(Value *V) override;
891 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
892 Instruction::BinaryOps Opcode =
893 Instruction::BinaryOpsEnd) override;
894 Value *reverseVector(Value *Vec) override;
895};
896
897/// Encapsulate information regarding vectorization of a loop and its epilogue.
898/// This information is meant to be updated and used across two stages of
899/// epilogue vectorization.
900struct EpilogueLoopVectorizationInfo {
901 ElementCount MainLoopVF = ElementCount::getFixed(0);
902 unsigned MainLoopUF = 0;
903 ElementCount EpilogueVF = ElementCount::getFixed(0);
904 unsigned EpilogueUF = 0;
905 BasicBlock *MainLoopIterationCountCheck = nullptr;
906 BasicBlock *EpilogueIterationCountCheck = nullptr;
907 BasicBlock *SCEVSafetyCheck = nullptr;
908 BasicBlock *MemSafetyCheck = nullptr;
909 Value *TripCount = nullptr;
910 Value *VectorTripCount = nullptr;
911
912 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
913 unsigned EUF)
914 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
915 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
916 assert(EUF == 1 &&
917 "A high UF for the epilogue loop is likely not beneficial.");
919};
920
921/// An extension of the inner loop vectorizer that creates a skeleton for a
922/// vectorized loop that has its epilogue (residual) also vectorized.
923/// The idea is to run the vplan on a given loop twice, firstly to setup the
924/// skeleton and vectorize the main loop, and secondly to complete the skeleton
925/// from the first step and vectorize the epilogue. This is achieved by
926/// deriving two concrete strategy classes from this base class and invoking
927/// them in succession from the loop vectorizer planner.
928class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
929public:
930 InnerLoopAndEpilogueVectorizer(
931 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
932 DominatorTree *DT, const TargetLibraryInfo *TLI,
933 const TargetTransformInfo *TTI, AssumptionCache *AC,
934 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
935 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
936 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
937 GeneratedRTChecks &Checks)
938 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
939 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
940 Checks),
941 EPI(EPI) {}
942
943 // Override this function to handle the more complex control flow around the
944 // three loops.
945 BasicBlock *createVectorizedLoopSkeleton() final override {
946 return createEpilogueVectorizedLoopSkeleton();
947 }
948
949 /// The interface for creating a vectorized skeleton using one of two
950 /// different strategies, each corresponding to one execution of the vplan
951 /// as described above.
952 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
953
954 /// Holds and updates state information required to vectorize the main loop
955 /// and its epilogue in two separate passes. This setup helps us avoid
956 /// regenerating and recomputing runtime safety checks. It also helps us to
957 /// shorten the iteration-count-check path length for the cases where the
958 /// iteration count of the loop is so small that the main vector loop is
959 /// completely skipped.
960 EpilogueLoopVectorizationInfo &EPI;
961};
962
963/// A specialized derived class of inner loop vectorizer that performs
964/// vectorization of *main* loops in the process of vectorizing loops and their
965/// epilogues.
966class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
967public:
968 EpilogueVectorizerMainLoop(
969 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
970 DominatorTree *DT, const TargetLibraryInfo *TLI,
971 const TargetTransformInfo *TTI, AssumptionCache *AC,
972 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
973 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
974 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
975 GeneratedRTChecks &Check)
976 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
977 EPI, LVL, CM, BFI, PSI, Check) {}
978 /// Implements the interface for creating a vectorized skeleton using the
979 /// *main loop* strategy (ie the first pass of vplan execution).
980 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
981
982protected:
983 /// Emits an iteration count bypass check once for the main loop (when \p
984 /// ForEpilogue is false) and once for the epilogue loop (when \p
985 /// ForEpilogue is true).
986 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
987 bool ForEpilogue);
988 void printDebugTracesAtStart() override;
989 void printDebugTracesAtEnd() override;
990};
991
992// A specialized derived class of inner loop vectorizer that performs
993// vectorization of *epilogue* loops in the process of vectorizing loops and
994// their epilogues.
995class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
996public:
997 EpilogueVectorizerEpilogueLoop(
998 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
999 DominatorTree *DT, const TargetLibraryInfo *TLI,
1000 const TargetTransformInfo *TTI, AssumptionCache *AC,
1001 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1002 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1003 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1004 GeneratedRTChecks &Checks)
1005 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1006 EPI, LVL, CM, BFI, PSI, Checks) {}
1007 /// Implements the interface for creating a vectorized skeleton using the
1008 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1009 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1010
1011protected:
1012 /// Emits an iteration count bypass check after the main vector loop has
1013 /// finished to see if there are any iterations left to execute by either
1014 /// the vector epilogue or the scalar epilogue.
1015 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1016 BasicBlock *Bypass,
1017 BasicBlock *Insert);
1018 void printDebugTracesAtStart() override;
1019 void printDebugTracesAtEnd() override;
1020};
1021} // end namespace llvm
1022
1023/// Look for a meaningful debug location on the instruction or its
1024/// operands.
1025static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1026 if (!I)
1027 return I;
1028
1029 DebugLoc Empty;
1030 if (I->getDebugLoc() != Empty)
1031 return I;
1032
1033 for (Use &Op : I->operands()) {
1034 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1035 if (OpInst->getDebugLoc() != Empty)
1036 return OpInst;
1037 }
1038
1039 return I;
1040}
1041
1042void InnerLoopVectorizer::setDebugLocFromInst(
1043 const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1044 IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1045 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1046 const DILocation *DIL = Inst->getDebugLoc();
1047
1048 // When a FSDiscriminator is enabled, we don't need to add the multiply
1049 // factors to the discriminators.
1050 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1051 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1052 // FIXME: For scalable vectors, assume vscale=1.
1053 auto NewDIL =
1054 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1055 if (NewDIL)
1056 B->SetCurrentDebugLocation(NewDIL.getValue());
1057 else
1058 LLVM_DEBUG(dbgs()
1059 << "Failed to create new discriminator: "
1060 << DIL->getFilename() << " Line: " << DIL->getLine());
1061 } else
1062 B->SetCurrentDebugLocation(DIL);
1063 } else
1064 B->SetCurrentDebugLocation(DebugLoc());
1065}
1066
1067/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1068/// is passed, the message relates to that particular instruction.
1069#ifndef NDEBUG
1070static void debugVectorizationMessage(const StringRef Prefix,
1071 const StringRef DebugMsg,
1072 Instruction *I) {
1073 dbgs() << "LV: " << Prefix << DebugMsg;
1074 if (I != nullptr)
1075 dbgs() << " " << *I;
1076 else
1077 dbgs() << '.';
1078 dbgs() << '\n';
1079}
1080#endif
1081
1082/// Create an analysis remark that explains why vectorization failed
1083///
1084/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1085/// RemarkName is the identifier for the remark. If \p I is passed it is an
1086/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1087/// the location of the remark. \return the remark object that can be
1088/// streamed to.
1089static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1090 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1091 Value *CodeRegion = TheLoop->getHeader();
1092 DebugLoc DL = TheLoop->getStartLoc();
1093
1094 if (I) {
1095 CodeRegion = I->getParent();
1096 // If there is no debug location attached to the instruction, revert back to
1097 // using the loop's.
1098 if (I->getDebugLoc())
1099 DL = I->getDebugLoc();
1100 }
1101
1102 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1103}
1104
1105/// Return a value for Step multiplied by VF.
1106static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1107 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1108 Constant *StepVal = ConstantInt::get(
1109 Step->getType(),
1110 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1111 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1112}
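// Editor's illustrative sketch, not part of the original LoopVectorize.cpp.
// Assuming an IRBuilder `B` and a 32-bit integer type `I32Ty`, createStepForVF
// folds a fixed VF into a plain constant and scales a scalable VF by vscale:
//
//   // Fixed VF: 2 * 4 folds to the constant i32 8.
//   Value *FixedStep = createStepForVF(B, ConstantInt::get(I32Ty, 2),
//                                      ElementCount::getFixed(4));
//   // Scalable VF: materializes 8 * vscale via B.CreateVScale().
//   Value *ScalableStep = createStepForVF(B, ConstantInt::get(I32Ty, 2),
//                                         ElementCount::getScalable(4));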
1113
1114namespace llvm {
1115
1116/// Return the runtime value for VF.
1117Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1118 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1119 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1120}
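// Editor's illustrative sketch, not part of the original LoopVectorize.cpp: in
// contrast to createStepForVF above, getRuntimeVF returns the element count
// itself, e.g. a constant 4 for ElementCount::getFixed(4) and 4 * vscale for
// ElementCount::getScalable(4). Assuming `B`, `I32Ty` and `VF` exist:
//
//   Value *RuntimeVF = getRuntimeVF(B, I32Ty, VF);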
1121
1122void reportVectorizationFailure(const StringRef DebugMsg,
1123 const StringRef OREMsg, const StringRef ORETag,
1124 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1125 Instruction *I) {
1126 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1127 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1128 ORE->emit(
1129 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1130 << "loop not vectorized: " << OREMsg);
1131}
1132
1133void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1134 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1135 Instruction *I) {
1136 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1137 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1138 ORE->emit(
1139 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1140 << Msg);
1141}
1142
1143} // end namespace llvm
1144
1145#ifndef NDEBUG
1146/// \return string containing a file name and a line # for the given loop.
1147static std::string getDebugLocString(const Loop *L) {
1148 std::string Result;
1149 if (L) {
1150 raw_string_ostream OS(Result);
1151 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1152 LoopDbgLoc.print(OS);
1153 else
1154 // Just print the module name.
1155 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1156 OS.flush();
1157 }
1158 return Result;
1159}
1160#endif
1161
1162void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1163 const Instruction *Orig) {
1164 // If the loop was versioned with memchecks, add the corresponding no-alias
1165 // metadata.
1166 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1167 LVer->annotateInstWithNoAlias(To, Orig);
1168}
1169
1170void InnerLoopVectorizer::addMetadata(Instruction *To,
1171 Instruction *From) {
1172 propagateMetadata(To, From);
1173 addNewMetadata(To, From);
1174}
1175
1176void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1177 Instruction *From) {
1178 for (Value *V : To) {
1179 if (Instruction *I = dyn_cast<Instruction>(V))
1180 addMetadata(I, From);
1181 }
1182}
1183
1184namespace llvm {
1185
1186// Loop vectorization cost-model hints how the scalar epilogue loop should be
1187// lowered.
1188enum ScalarEpilogueLowering {
1189
1190 // The default: allowing scalar epilogues.
1191 CM_ScalarEpilogueAllowed,
1192
1193 // Vectorization with OptForSize: don't allow epilogues.
1194 CM_ScalarEpilogueNotAllowedOptSize,
1195
1196 // A special case of vectorization with OptForSize: loops with a very small
1197 // trip count are considered for vectorization under OptForSize, thereby
1198 // making sure the cost of their loop body is dominant, free of runtime
1199 // guards and scalar iteration overheads.
1200 CM_ScalarEpilogueNotAllowedLowTripLoop,
1201
1202 // Loop hint predicate indicating an epilogue is undesired.
1203 CM_ScalarEpilogueNotNeededUsePredicate,
1204
1205 // Directive indicating we must either tail fold or not vectorize
1206 CM_ScalarEpilogueNotAllowedUsePredicate
1207};
1208
1209/// ElementCountComparator creates a total ordering for ElementCount
1210/// for the purposes of using it in a set structure.
1211struct ElementCountComparator {
1212 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1213 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1214 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1215 }
1216};
1217using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
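// Editor's note, not part of the original LoopVectorize.cpp: the comparator
// above orders all fixed VFs before all scalable VFs, and each group by its
// known minimum element count, e.g. 4 < 8 < vscale x 2 < vscale x 4, so an
// ElementCountSet iterates candidate VFs in a deterministic order.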
1218
1219/// LoopVectorizationCostModel - estimates the expected speedups due to
1220/// vectorization.
1221/// In many cases vectorization is not profitable. This can happen for a
1222/// number of reasons. In this class we mainly attempt to predict the
1223/// expected speedup/slowdowns due to the supported instruction set. We use the
1224/// TargetTransformInfo to query the different backends for the cost of
1225/// different operations.
1226class LoopVectorizationCostModel {
1227public:
1228 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1229 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1230 LoopVectorizationLegality *Legal,
1231 const TargetTransformInfo &TTI,
1232 const TargetLibraryInfo *TLI, DemandedBits *DB,
1233 AssumptionCache *AC,
1234 OptimizationRemarkEmitter *ORE, const Function *F,
1235 const LoopVectorizeHints *Hints,
1236 InterleavedAccessInfo &IAI)
1237 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1238 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1239 Hints(Hints), InterleaveInfo(IAI) {}
1240
1241 /// \return An upper bound for the vectorization factors (both fixed and
1242 /// scalable). If the factors are 0, vectorization and interleaving should be
1243 /// avoided up front.
1244 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1245
1246 /// \return True if runtime checks are required for vectorization, and false
1247 /// otherwise.
1248 bool runtimeChecksRequired();
1249
1250 /// \return The most profitable vectorization factor and the cost of that VF.
1251 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1252 /// then this vectorization factor will be selected if vectorization is
1253 /// possible.
1254 VectorizationFactor
1255 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1256
1257 VectorizationFactor
1258 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1259 const LoopVectorizationPlanner &LVP);
1260
1261 /// Setup cost-based decisions for user vectorization factor.
1262 /// \return true if the UserVF is a feasible VF to be chosen.
1263 bool selectUserVectorizationFactor(ElementCount UserVF) {
1264 collectUniformsAndScalars(UserVF);
1265 collectInstsToScalarize(UserVF);
1266 return expectedCost(UserVF).first.isValid();
1267 }
1268
1269 /// \return The size (in bits) of the smallest and widest types in the code
1270 /// that needs to be vectorized. We ignore values that remain scalar such as
1271 /// 64 bit loop indices.
1272 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1273
1274 /// \return The desired interleave count.
1275 /// If interleave count has been specified by metadata it will be returned.
1276 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1277 /// are the selected vectorization factor and the cost of the selected VF.
1278 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1279
1280 /// A memory access instruction may be vectorized in more than one way.
1281 /// The form of the instruction after vectorization depends on its cost.
1282 /// This function makes cost-based decisions for Load/Store instructions
1283 /// and collects them in a map. This decision map is used for building
1284 /// the lists of loop-uniform and loop-scalar instructions.
1285 /// The calculated cost is saved with the widening decision in order to
1286 /// avoid redundant calculations.
1287 void setCostBasedWideningDecision(ElementCount VF);
1288
1289 /// A struct that represents some properties of the register usage
1290 /// of a loop.
1291 struct RegisterUsage {
1292 /// Holds the number of loop invariant values that are used in the loop.
1293 /// The key is ClassID of target-provided register class.
1294 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295 /// Holds the maximum number of concurrent live intervals in the loop.
1296 /// The key is ClassID of target-provided register class.
1297 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1298 };
1299
1300 /// \return Returns information about the register usages of the loop for the
1301 /// given vectorization factors.
1302 SmallVector<RegisterUsage, 8>
1303 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1304
1305 /// Collect values we want to ignore in the cost model.
1306 void collectValuesToIgnore();
1307
1308 /// Collect all element types in the loop for which widening is needed.
1309 void collectElementTypesForWidening();
1310
1311 /// Split reductions into those that happen in the loop, and those that happen
1312 /// outside. In-loop reductions are collected into InLoopReductionChains.
1313 void collectInLoopReductions();
1314
1315 /// Returns true if we should use strict in-order reductions for the given
1316 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1317 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1318 /// of FP operations.
1319 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1320 return !Hints->allowReordering() && RdxDesc.isOrdered();
1321 }
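// Editor's illustrative example, not part of the original LoopVectorize.cpp:
// ordered reductions matter for floating-point sums that must not be
// reassociated, e.g. without fast-math flags in
//
//   for (int i = 0; i < n; ++i)
//     s += a[i];   // s and a[i] are float or double
//
// the additions have to stay in sequential order, so RdxDesc.isOrdered() is
// true and, unless the user explicitly allows reordering, a strict in-order
// reduction is generated instead of a tree of partial sums.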
1322
1323 /// \returns The smallest bitwidth each instruction can be represented with.
1324 /// The vector equivalents of these instructions should be truncated to this
1325 /// type.
1326 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1327 return MinBWs;
1328 }
1329
1330 /// \returns True if it is more profitable to scalarize instruction \p I for
1331 /// vectorization factor \p VF.
1332 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1333 assert(VF.isVector() &&
1334 "Profitable to scalarize relevant only for VF > 1.");
1335
1336 // Cost model is not run in the VPlan-native path - return conservative
1337 // result until this changes.
1338 if (EnableVPlanNativePath)
1339 return false;
1340
1341 auto Scalars = InstsToScalarize.find(VF);
1342 assert(Scalars != InstsToScalarize.end() &&
1343 "VF not yet analyzed for scalarization profitability");
1344 return Scalars->second.find(I) != Scalars->second.end();
1345 }
1346
1347 /// Returns true if \p I is known to be uniform after vectorization.
1348 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1349 if (VF.isScalar())
1350 return true;
1351
1352 // Cost model is not run in the VPlan-native path - return conservative
1353 // result until this changes.
1354 if (EnableVPlanNativePath)
1355 return false;
1356
1357 auto UniformsPerVF = Uniforms.find(VF);
1358 assert(UniformsPerVF != Uniforms.end() &&
1359 "VF not yet analyzed for uniformity");
1360 return UniformsPerVF->second.count(I);
1361 }
1362
1363 /// Returns true if \p I is known to be scalar after vectorization.
1364 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1365 if (VF.isScalar())
1366 return true;
1367
1368 // Cost model is not run in the VPlan-native path - return conservative
1369 // result until this changes.
1370 if (EnableVPlanNativePath)
1371 return false;
1372
1373 auto ScalarsPerVF = Scalars.find(VF);
1374 assert(ScalarsPerVF != Scalars.end() &&
1375 "Scalar values are not calculated for VF");
1376 return ScalarsPerVF->second.count(I);
1377 }
1378
1379 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1380 /// for vectorization factor \p VF.
1381 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1382 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1383 !isProfitableToScalarize(I, VF) &&
1384 !isScalarAfterVectorization(I, VF);
1385 }
1386
1387 /// Decision that was taken during cost calculation for memory instruction.
1388 enum InstWidening {
1389 CM_Unknown,
1390 CM_Widen, // For consecutive accesses with stride +1.
1391 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1392 CM_Interleave,
1393 CM_GatherScatter,
1394 CM_Scalarize
1395 };
1396
1397 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1398 /// instruction \p I and vector width \p VF.
1399 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1400 InstructionCost Cost) {
1401 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1401, __extension__ __PRETTY_FUNCTION__))
;
1402 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1403 }
1404
1405 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1406 /// interleaving group \p Grp and vector width \p VF.
1407 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1408 ElementCount VF, InstWidening W,
1409 InstructionCost Cost) {
1410 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1410, __extension__ __PRETTY_FUNCTION__))
;
1411 /// Broadcast this decision to all instructions inside the group.
1412 /// But the cost will be assigned to one instruction only.
1413 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1414 if (auto *I = Grp->getMember(i)) {
1415 if (Grp->getInsertPos() == I)
1416 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1417 else
1418 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1419 }
1420 }
1421 }
1422
1423 /// Return the cost model decision for the given instruction \p I and vector
1424 /// width \p VF. Return CM_Unknown if this instruction did not pass
1425 /// through the cost modeling.
1426 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1427 assert(VF.isVector() && "Expected VF to be a vector VF")(static_cast <bool> (VF.isVector() && "Expected VF to be a vector VF"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF to be a vector VF\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1427, __extension__ __PRETTY_FUNCTION__))
;
1428 // Cost model is not run in the VPlan-native path - return conservative
1429 // result until this changes.
1430 if (EnableVPlanNativePath)
1431 return CM_GatherScatter;
1432
1433 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1434 auto Itr = WideningDecisions.find(InstOnVF);
1435 if (Itr == WideningDecisions.end())
1436 return CM_Unknown;
1437 return Itr->second.first;
1438 }
1439
1440 /// Return the vectorization cost for the given instruction \p I and vector
1441 /// width \p VF.
1442 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1443 assert(VF.isVector() && "Expected VF >=2")(static_cast <bool> (VF.isVector() && "Expected VF >=2"
) ? void (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1443, __extension__ __PRETTY_FUNCTION__))
;
1444 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1445 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1446 "The cost is not calculated");
1447 return WideningDecisions[InstOnVF].second;
1448 }
1449
1450 /// Return True if instruction \p I is an optimizable truncate whose operand
1451 /// is an induction variable. Such a truncate will be removed by adding a new
1452 /// induction variable with the destination type.
1453 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1454 // If the instruction is not a truncate, return false.
1455 auto *Trunc = dyn_cast<TruncInst>(I);
1456 if (!Trunc)
1457 return false;
1458
1459 // Get the source and destination types of the truncate.
1460 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1461 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1462
1463 // If the truncate is free for the given types, return false. Replacing a
1464 // free truncate with an induction variable would add an induction variable
1465 // update instruction to each iteration of the loop. We exclude from this
1466 // check the primary induction variable since it will need an update
1467 // instruction regardless.
1468 Value *Op = Trunc->getOperand(0);
1469 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1470 return false;
1471
1472 // If the truncated value is not an induction variable, return false.
1473 return Legal->isInductionPhi(Op);
1474 }
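// Editor's illustrative example, not part of the original LoopVectorize.cpp: a
// typical optimizable IV truncate is a narrowed use of a wide induction
// variable, e.g.
//
//   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
//   %trunc = trunc i64 %iv to i32
//
// When the trunc is not free for the target, it can be removed by introducing
// a separate i32 induction variable with the destination type instead.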
1475
1476 /// Collects the instructions to scalarize for each predicated instruction in
1477 /// the loop.
1478 void collectInstsToScalarize(ElementCount VF);
1479
1480 /// Collect Uniform and Scalar values for the given \p VF.
1481 /// The sets depend on CM decision for Load/Store instructions
1482 /// that may be vectorized as interleave, gather-scatter or scalarized.
1483 void collectUniformsAndScalars(ElementCount VF) {
1484 // Do the analysis once.
1485 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1486 return;
1487 setCostBasedWideningDecision(VF);
1488 collectLoopUniforms(VF);
1489 collectLoopScalars(VF);
1490 }
1491
1492 /// Returns true if the target machine supports masked store operation
1493 /// for the given \p DataType and kind of access to \p Ptr.
1494 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1495 return Legal->isConsecutivePtr(Ptr) &&
1496 TTI.isLegalMaskedStore(DataType, Alignment);
1497 }
1498
1499 /// Returns true if the target machine supports masked load operation
1500 /// for the given \p DataType and kind of access to \p Ptr.
1501 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1502 return Legal->isConsecutivePtr(Ptr) &&
1503 TTI.isLegalMaskedLoad(DataType, Alignment);
1504 }
1505
1506 /// Returns true if the target machine can represent \p V as a masked gather
1507 /// or scatter operation.
1508 bool isLegalGatherOrScatter(Value *V) {
1509 bool LI = isa<LoadInst>(V);
1510 bool SI = isa<StoreInst>(V);
1511 if (!LI && !SI)
1512 return false;
1513 auto *Ty = getLoadStoreType(V);
1514 Align Align = getLoadStoreAlignment(V);
1515 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1516 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1517 }
1518
1519 /// Returns true if the target machine supports all of the reduction
1520 /// variables found for the given VF.
1521 bool canVectorizeReductions(ElementCount VF) const {
1522 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1523 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1524 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1525 }));
1526 }
1527
1528 /// Returns true if \p I is an instruction that will be scalarized with
1529 /// predication. Such instructions include conditional stores and
1530 /// instructions that may divide by zero.
1531 /// If a non-zero VF has been calculated, we check if I will be scalarized
1532 /// with predication for that VF.
1533 bool isScalarWithPredication(Instruction *I) const;
1534
1535 // Returns true if \p I is an instruction that will be predicated either
1536 // through scalar predication or masked load/store or masked gather/scatter.
1537 // Superset of instructions that return true for isScalarWithPredication.
1538 bool isPredicatedInst(Instruction *I) {
1539 if (!blockNeedsPredication(I->getParent()))
1540 return false;
1541 // Loads and stores that need some form of masked operation are predicated
1542 // instructions.
1543 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1544 return Legal->isMaskRequired(I);
1545 return isScalarWithPredication(I);
1546 }
1547
1548 /// Returns true if \p I is a memory instruction with consecutive memory
1549 /// access that can be widened.
1550 bool
1551 memoryInstructionCanBeWidened(Instruction *I,
1552 ElementCount VF = ElementCount::getFixed(1));
1553
1554 /// Returns true if \p I is a memory instruction in an interleaved-group
1555 /// of memory accesses that can be vectorized with wide vector loads/stores
1556 /// and shuffles.
1557 bool
1558 interleavedAccessCanBeWidened(Instruction *I,
1559 ElementCount VF = ElementCount::getFixed(1));
1560
1561 /// Check if \p Instr belongs to any interleaved access group.
1562 bool isAccessInterleaved(Instruction *Instr) {
1563 return InterleaveInfo.isInterleaved(Instr);
1564 }
1565
1566 /// Get the interleaved access group that \p Instr belongs to.
1567 const InterleaveGroup<Instruction> *
1568 getInterleavedAccessGroup(Instruction *Instr) {
1569 return InterleaveInfo.getInterleaveGroup(Instr);
1570 }
1571
1572 /// Returns true if we're required to use a scalar epilogue for at least
1573 /// the final iteration of the original loop.
1574 bool requiresScalarEpilogue(ElementCount VF) const {
1575 if (!isScalarEpilogueAllowed())
1576 return false;
1577 // If we might exit from anywhere but the latch, we must run the exiting
1578 // iteration in scalar form.
1579 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1580 return true;
1581 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1582 }
1583
1584 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1585 /// loop hint annotation.
1586 bool isScalarEpilogueAllowed() const {
1587 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1588 }
1589
1590 /// Returns true if all loop blocks should be masked to fold tail loop.
1591 bool foldTailByMasking() const { return FoldTailByMasking; }
1592
1593 bool blockNeedsPredication(BasicBlock *BB) const {
1594 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1595 }
1596
1597 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1598 /// nodes to the chain of instructions representing the reductions. Uses a
1599 /// MapVector to ensure deterministic iteration order.
1600 using ReductionChainMap =
1601 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1602
1603 /// Return the chain of instructions representing an inloop reduction.
1604 const ReductionChainMap &getInLoopReductionChains() const {
1605 return InLoopReductionChains;
1606 }
1607
1608 /// Returns true if the Phi is part of an inloop reduction.
1609 bool isInLoopReduction(PHINode *Phi) const {
1610 return InLoopReductionChains.count(Phi);
1611 }
1612
1613 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1614 /// with factor VF. Return the cost of the instruction, including
1615 /// scalarization overhead if it's needed.
1616 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1617
1618 /// Estimate cost of a call instruction CI if it were vectorized with factor
1619 /// VF. Return the cost of the instruction, including scalarization overhead
1620 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1621 /// scalarized -
1622 /// i.e. either vector version isn't available, or is too expensive.
1623 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1624 bool &NeedToScalarize) const;
1625
1626 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1627 /// that of B.
1628 bool isMoreProfitable(const VectorizationFactor &A,
1629 const VectorizationFactor &B) const;
1630
1631 /// Invalidates decisions already taken by the cost model.
1632 void invalidateCostModelingDecisions() {
1633 WideningDecisions.clear();
1634 Uniforms.clear();
1635 Scalars.clear();
1636 }
1637
1638private:
1639 unsigned NumPredStores = 0;
1640
1641 /// \return An upper bound for the vectorization factors for both
1642 /// fixed and scalable vectorization, where the minimum-known number of
1643 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1644 /// disabled or unsupported, then the scalable part will be equal to
1645 /// ElementCount::getScalable(0).
1646 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1647 ElementCount UserVF);
1648
1649 /// \return the maximized element count based on the target's vector
1650 /// registers and the loop trip count, but limited to a maximum safe VF.
1651 /// This is a helper function of computeFeasibleMaxVF.
1652 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1653 /// issue that occurred on one of the buildbots which cannot be reproduced
1654 /// without having access to the proprietary compiler (see comments on
1655 /// D98509). The issue is currently under investigation and this workaround
1656 /// will be removed as soon as possible.
1657 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1658 unsigned SmallestType,
1659 unsigned WidestType,
1660 const ElementCount &MaxSafeVF);
1661
1662 /// \return the maximum legal scalable VF, based on the safe max number
1663 /// of elements.
1664 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1665
1666 /// The vectorization cost is a combination of the cost itself and a boolean
1667 /// indicating whether any of the contributing operations will actually
1668 /// operate on vector values after type legalization in the backend. If this
1669 /// latter value is false, then all operations will be scalarized (i.e. no
1670 /// vectorization has actually taken place).
1671 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672
1673 /// Returns the expected execution cost. The unit of the cost does
1674 /// not matter because we use the 'cost' units to compare different
1675 /// vector widths. The cost that is returned is *not* normalized by
1676 /// the factor width. If \p Invalid is not nullptr, this function
1677 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678 /// each instruction that has an Invalid cost for the given VF.
1679 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1680 VectorizationCostTy
1681 expectedCost(ElementCount VF,
1682 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1683
1684 /// Returns the execution time cost of an instruction for a given vector
1685 /// width. Vector width of one means scalar.
1686 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1687
1688 /// The cost-computation logic from getInstructionCost which provides
1689 /// the vector type as an output parameter.
1690 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1691 Type *&VectorTy);
1692
1693 /// Return the cost of instructions in an inloop reduction pattern, if I is
1694 /// part of that pattern.
1695 Optional<InstructionCost>
1696 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1697 TTI::TargetCostKind CostKind);
1698
1699 /// Calculate vectorization cost of memory instruction \p I.
1700 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1701
1702 /// The cost computation for scalarized memory instruction.
1703 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1704
1705 /// The cost computation for interleaving group of memory instructions.
1706 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1707
1708 /// The cost computation for Gather/Scatter instruction.
1709 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1710
1711 /// The cost computation for widening instruction \p I with consecutive
1712 /// memory access.
1713 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1714
1715 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1716 /// Load: scalar load + broadcast.
1717 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1718 /// element)
1719 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1720
1721 /// Estimate the overhead of scalarizing an instruction. This is a
1722 /// convenience wrapper for the type-based getScalarizationOverhead API.
1723 InstructionCost getScalarizationOverhead(Instruction *I,
1724 ElementCount VF) const;
1725
1726 /// Returns whether the instruction is a load or store and will be emitted
1727 /// as a vector operation.
1728 bool isConsecutiveLoadOrStore(Instruction *I);
1729
1730 /// Returns true if an artificially high cost for emulated masked memrefs
1731 /// should be used.
1732 bool useEmulatedMaskMemRefHack(Instruction *I);
1733
1734 /// Map of scalar integer values to the smallest bitwidth they can be legally
1735 /// represented as. The vector equivalents of these values should be truncated
1736 /// to this type.
1737 MapVector<Instruction *, uint64_t> MinBWs;
1738
1739 /// A type representing the costs for instructions if they were to be
1740 /// scalarized rather than vectorized. The entries are Instruction-Cost
1741 /// pairs.
1742 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1743
1744 /// A set containing all BasicBlocks that are known to be present after
1745 /// vectorization as predicated blocks.
1746 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1747
1748 /// Records whether it is allowed to have the original scalar loop execute at
1749 /// least once. This may be needed as a fallback loop in case runtime
1750 /// aliasing/dependence checks fail, or to handle the tail/remainder
1751 /// iterations when the trip count is unknown or is not a multiple of the VF,
1752 /// or as a peel-loop to handle gaps in interleave-groups.
1753 /// Under optsize and when the trip count is very small we don't allow any
1754 /// iterations to execute in the scalar loop.
1755 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1756
1757 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1758 bool FoldTailByMasking = false;
1759
1760 /// A map holding scalar costs for different vectorization factors. The
1761 /// presence of a cost for an instruction in the mapping indicates that the
1762 /// instruction will be scalarized when vectorizing with the associated
1763 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1764 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1765
1766 /// Holds the instructions known to be uniform after vectorization.
1767 /// The data is collected per VF.
1768 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1769
1770 /// Holds the instructions known to be scalar after vectorization.
1771 /// The data is collected per VF.
1772 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1773
1774 /// Holds the instructions (address computations) that are forced to be
1775 /// scalarized.
1776 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1777
1778 /// PHINodes of the reductions that should be expanded in-loop along with
1779 /// their associated chains of reduction operations, in program order from top
1780 /// (PHI) to bottom
1781 ReductionChainMap InLoopReductionChains;
1782
1783 /// A Map of inloop reduction operations and their immediate chain operand.
1784 /// FIXME: This can be removed once reductions can be costed correctly in
1785 /// vplan. This was added to allow quick lookup to the inloop operations,
1786 /// without having to loop through InLoopReductionChains.
1787 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1788
1789 /// Returns the expected difference in cost from scalarizing the expression
1790 /// feeding a predicated instruction \p PredInst. The instructions to
1791 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1792 /// non-negative return value implies the expression will be scalarized.
1793 /// Currently, only single-use chains are considered for scalarization.
1794 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1795 ElementCount VF);
1796
1797 /// Collect the instructions that are uniform after vectorization. An
1798 /// instruction is uniform if we represent it with a single scalar value in
1799 /// the vectorized loop corresponding to each vector iteration. Examples of
1800 /// uniform instructions include pointer operands of consecutive or
1801 /// interleaved memory accesses. Note that although uniformity implies an
1802 /// instruction will be scalar, the reverse is not true. In general, a
1803 /// scalarized instruction will be represented by VF scalar values in the
1804 /// vectorized loop, each corresponding to an iteration of the original
1805 /// scalar loop.
1806 void collectLoopUniforms(ElementCount VF);
1807
1808 /// Collect the instructions that are scalar after vectorization. An
1809 /// instruction is scalar if it is known to be uniform or will be scalarized
1810 /// during vectorization. Non-uniform scalarized instructions will be
1811 /// represented by VF values in the vectorized loop, each corresponding to an
1812 /// iteration of the original scalar loop.
1813 void collectLoopScalars(ElementCount VF);
1814
1815 /// Keeps cost model vectorization decision and cost for instructions.
1816 /// Right now it is used for memory instructions only.
1817 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1818 std::pair<InstWidening, InstructionCost>>;
1819
1820 DecisionList WideningDecisions;
1821
1822 /// Returns true if \p V is expected to be vectorized and it needs to be
1823 /// extracted.
1824 bool needsExtract(Value *V, ElementCount VF) const {
1825 Instruction *I = dyn_cast<Instruction>(V);
1826 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1827 TheLoop->isLoopInvariant(I))
1828 return false;
1829
1830 // Assume we can vectorize V (and hence we need extraction) if the
1831 // scalars are not computed yet. This can happen, because it is called
1832 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1833 // the scalars are collected. That should be a safe assumption in most
1834 // cases, because we check if the operands have vectorizable types
1835 // beforehand in LoopVectorizationLegality.
1836 return Scalars.find(VF) == Scalars.end() ||
1837 !isScalarAfterVectorization(I, VF);
1838 };
1839
1840 /// Returns a range containing only operands needing to be extracted.
1841 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1842 ElementCount VF) const {
1843 return SmallVector<Value *, 4>(make_filter_range(
1844 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1845 }
1846
1847 /// Determines if we have the infrastructure to vectorize loop \p L and its
1848 /// epilogue, assuming the main loop is vectorized by \p VF.
1849 bool isCandidateForEpilogueVectorization(const Loop &L,
1850 const ElementCount VF) const;
1851
1852 /// Returns true if epilogue vectorization is considered profitable, and
1853 /// false otherwise.
1854 /// \p VF is the vectorization factor chosen for the original loop.
1855 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1856
1857public:
1858 /// The loop that we evaluate.
1859 Loop *TheLoop;
1860
1861 /// Predicated scalar evolution analysis.
1862 PredicatedScalarEvolution &PSE;
1863
1864 /// Loop Info analysis.
1865 LoopInfo *LI;
1866
1867 /// Vectorization legality.
1868 LoopVectorizationLegality *Legal;
1869
1870 /// Vector target information.
1871 const TargetTransformInfo &TTI;
1872
1873 /// Target Library Info.
1874 const TargetLibraryInfo *TLI;
1875
1876 /// Demanded bits analysis.
1877 DemandedBits *DB;
1878
1879 /// Assumption cache.
1880 AssumptionCache *AC;
1881
1882 /// Interface to emit optimization remarks.
1883 OptimizationRemarkEmitter *ORE;
1884
1885 const Function *TheFunction;
1886
1887 /// Loop Vectorize Hint.
1888 const LoopVectorizeHints *Hints;
1889
1890 /// The interleave access information contains groups of interleaved accesses
1891 /// with the same stride and close to each other.
1892 InterleavedAccessInfo &InterleaveInfo;
1893
1894 /// Values to ignore in the cost model.
1895 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1896
1897 /// Values to ignore in the cost model when VF > 1.
1898 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1899
1900 /// All element types found in the loop.
1901 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1902
1903 /// Profitable vector factors.
1904 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1905};
1906} // end namespace llvm
1907
1908/// Helper struct to manage generating runtime checks for vectorization.
1909///
1910/// The runtime checks are created up-front in temporary blocks, un-linked from
1911/// the existing IR, to allow better cost estimation. After deciding to
1912/// vectorize, the checks are moved back. If deciding not to vectorize, the
1913/// temporary blocks are completely removed.
1914class GeneratedRTChecks {
1915 /// Basic block which contains the generated SCEV checks, if any.
1916 BasicBlock *SCEVCheckBlock = nullptr;
1917
1918 /// The value representing the result of the generated SCEV checks. If it is
1919 /// nullptr, either no SCEV checks have been generated or they have been used.
1920 Value *SCEVCheckCond = nullptr;
1921
1922 /// Basic block which contains the generated memory runtime checks, if any.
1923 BasicBlock *MemCheckBlock = nullptr;
1924
1925 /// The value representing the result of the generated memory runtime checks.
1926 /// If it is nullptr, either no memory runtime checks have been generated or
1927 /// they have been used.
1928 Instruction *MemRuntimeCheckCond = nullptr;
1929
1930 DominatorTree *DT;
1931 LoopInfo *LI;
1932
1933 SCEVExpander SCEVExp;
1934 SCEVExpander MemCheckExp;
1935
1936public:
1937 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1938 const DataLayout &DL)
1939 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1940 MemCheckExp(SE, DL, "scev.check") {}
1941
1942 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1943 /// accurately estimate the cost of the runtime checks. The blocks are
1944 /// un-linked from the IR and are added back during vector code generation. If
1945 /// there is no vector code generation, the check blocks are removed
1946 /// completely.
1947 void Create(Loop *L, const LoopAccessInfo &LAI,
1948 const SCEVUnionPredicate &UnionPred) {
1949
1950 BasicBlock *LoopHeader = L->getHeader();
1951 BasicBlock *Preheader = L->getLoopPreheader();
1952
1953 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1954 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1955 // may be used by SCEVExpander. The blocks will be un-linked from their
1956 // predecessors and removed from LI & DT at the end of the function.
1957 if (!UnionPred.isAlwaysTrue()) {
1958 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1959 nullptr, "vector.scevcheck");
1960
1961 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1962 &UnionPred, SCEVCheckBlock->getTerminator());
1963 }
1964
1965 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1966 if (RtPtrChecking.Need) {
1967 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1968 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1969 "vector.memcheck");
1970
1971 std::tie(std::ignore, MemRuntimeCheckCond) =
1972 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1973 RtPtrChecking.getChecks(), MemCheckExp);
1974 assert(MemRuntimeCheckCond &&
1975 "no RT checks generated although RtPtrChecking "
1976 "claimed checks are required");
1977 }
1978
1979 if (!MemCheckBlock && !SCEVCheckBlock)
1980 return;
1981
1982 // Unhook the temporary block with the checks, update various places
1983 // accordingly.
1984 if (SCEVCheckBlock)
1985 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1986 if (MemCheckBlock)
1987 MemCheckBlock->replaceAllUsesWith(Preheader);
1988
1989 if (SCEVCheckBlock) {
1990 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1991 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1992 Preheader->getTerminator()->eraseFromParent();
1993 }
1994 if (MemCheckBlock) {
1995 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1996 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1997 Preheader->getTerminator()->eraseFromParent();
1998 }
1999
2000 DT->changeImmediateDominator(LoopHeader, Preheader);
2001 if (MemCheckBlock) {
2002 DT->eraseNode(MemCheckBlock);
2003 LI->removeBlock(MemCheckBlock);
2004 }
2005 if (SCEVCheckBlock) {
2006 DT->eraseNode(SCEVCheckBlock);
2007 LI->removeBlock(SCEVCheckBlock);
2008 }
2009 }
2010
2011 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2012 /// unused.
2013 ~GeneratedRTChecks() {
2014 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2015 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2016 if (!SCEVCheckCond)
2017 SCEVCleaner.markResultUsed();
2018
2019 if (!MemRuntimeCheckCond)
2020 MemCheckCleaner.markResultUsed();
2021
2022 if (MemRuntimeCheckCond) {
2023 auto &SE = *MemCheckExp.getSE();
2024 // Memory runtime check generation creates compares that use expanded
2025 // values. Remove them before running the SCEVExpanderCleaners.
2026 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2027 if (MemCheckExp.isInsertedInstruction(&I))
2028 continue;
2029 SE.forgetValue(&I);
2030 SE.eraseValueFromMap(&I);
2031 I.eraseFromParent();
2032 }
2033 }
2034 MemCheckCleaner.cleanup();
2035 SCEVCleaner.cleanup();
2036
2037 if (SCEVCheckCond)
2038 SCEVCheckBlock->eraseFromParent();
2039 if (MemRuntimeCheckCond)
2040 MemCheckBlock->eraseFromParent();
2041 }
2042
2043 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2044 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2045 /// depending on the generated condition.
2046 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2047 BasicBlock *LoopVectorPreHeader,
2048 BasicBlock *LoopExitBlock) {
2049 if (!SCEVCheckCond)
2050 return nullptr;
2051 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2052 if (C->isZero())
2053 return nullptr;
2054
2055 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2056
2057 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2058 // Create new preheader for vector loop.
2059 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2060 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2061
2062 SCEVCheckBlock->getTerminator()->eraseFromParent();
2063 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2064 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2065 SCEVCheckBlock);
2066
2067 DT->addNewBlock(SCEVCheckBlock, Pred);
2068 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2069
2070 ReplaceInstWithInst(
2071 SCEVCheckBlock->getTerminator(),
2072 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2073 // Mark the check as used, to prevent it from being removed during cleanup.
2074 SCEVCheckCond = nullptr;
2075 return SCEVCheckBlock;
2076 }
2077
2078 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2079 /// the branches to branch to the vector preheader or \p Bypass, depending on
2080 /// the generated condition.
2081 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2082 BasicBlock *LoopVectorPreHeader) {
2083 // Check if we generated code that checks in runtime if arrays overlap.
2084 if (!MemRuntimeCheckCond)
2085 return nullptr;
2086
2087 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2088 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2089 MemCheckBlock);
2090
2091 DT->addNewBlock(MemCheckBlock, Pred);
2092 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2093 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2094
2095 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2096 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2097
2098 ReplaceInstWithInst(
2099 MemCheckBlock->getTerminator(),
2100 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2101 MemCheckBlock->getTerminator()->setDebugLoc(
2102 Pred->getTerminator()->getDebugLoc());
2103
2104 // Mark the check as used, to prevent it from being removed during cleanup.
2105 MemRuntimeCheckCond = nullptr;
2106 return MemCheckBlock;
2107 }
2108};
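// Editor's illustrative sketch, not part of the original LoopVectorize.cpp, of
// the intended lifecycle of the class above. The names `SE`, `DT`, `LI`, `DL`,
// `L`, `LAI`, `Pred`, `Bypass`, `VecPreheader` and `Exit` are assumed to be
// available from the surrounding vectorization code:
//
//   GeneratedRTChecks Checks(SE, DT, LI, DL);
//   Checks.Create(L, LAI, Pred);        // build checks in temporary blocks
//   // ... cost modeling decides whether vectorization pays off ...
//   BasicBlock *SCEVBB = Checks.emitSCEVChecks(L, Bypass, VecPreheader, Exit);
//   BasicBlock *MemBB  = Checks.emitMemRuntimeChecks(L, Bypass, VecPreheader);
//   // If neither emit function is called, the destructor removes the unused
//   // temporary blocks and their instructions.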
2109
2110// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2111// vectorization. The loop needs to be annotated with #pragma omp simd
2112// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2113// vector length information is not provided, vectorization is not considered
2114// explicit. Interleave hints are not allowed either. These limitations will be
2115// relaxed in the future.
2116// Please note that we are currently forced to abuse the pragma 'clang
2117// vectorize' semantics. This pragma provides *auto-vectorization hints*
2118// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2119// provides *explicit vectorization hints* (LV can bypass legal checks and
2120// assume that vectorization is legal). However, both hints are implemented
2121// using the same metadata (llvm.loop.vectorize, processed by
2122// LoopVectorizeHints). This will be fixed in the future when the native IR
2123// representation for pragma 'omp simd' is introduced.
2124static bool isExplicitVecOuterLoop(Loop *OuterLp,
2125 OptimizationRemarkEmitter *ORE) {
2126 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2127 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2128
2129 // Only outer loops with an explicit vectorization hint are supported.
2130 // Unannotated outer loops are ignored.
2131 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2132 return false;
2133
2134 Function *Fn = OuterLp->getHeader()->getParent();
2135 if (!Hints.allowVectorization(Fn, OuterLp,
2136 true /*VectorizeOnlyWhenForced*/)) {
2137 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false)
;
2138 return false;
2139 }
2140
2141 if (Hints.getInterleave() > 1) {
2142 // TODO: Interleave support is future work.
2143 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
2144 "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
;
2145 Hints.emitRemarkWithHints();
2146 return false;
2147 }
2148
2149 return true;
2150}
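// Editor's illustrative example, not part of the original LoopVectorize.cpp:
// an outer loop that the check above accepts must carry an explicit width
// hint in the source program, e.g.
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)        // outer loop
//     for (int j = 0; j < m; ++j)
//       a[i][j] += b[i][j];
//
// or the equivalent "#pragma omp simd simdlen(4)". An unannotated outer loop,
// or one that also requests interleaving, is rejected.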
2151
2152static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2153 OptimizationRemarkEmitter *ORE,
2154 SmallVectorImpl<Loop *> &V) {
2155 // Collect inner loops and outer loops without irreducible control flow. For
2156 // now, only collect outer loops that have explicit vectorization hints. If we
2157 // are stress testing the VPlan H-CFG construction, we collect the outermost
2158 // loop of every loop nest.
2159 if (L.isInnermost() || VPlanBuildStressTest ||
2160 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2161 LoopBlocksRPO RPOT(&L);
2162 RPOT.perform(LI);
2163 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2164 V.push_back(&L);
2165 // TODO: Collect inner loops inside marked outer loops in case
2166 // vectorization fails for the outer loop. Do not invoke
2167 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2168 // already known to be reducible. We can use an inherited attribute for
2169 // that.
2170 return;
2171 }
2172 }
2173 for (Loop *InnerL : L)
2174 collectSupportedLoops(*InnerL, LI, ORE, V);
2175}
2176
2177namespace {
2178
2179/// The LoopVectorize Pass.
2180struct LoopVectorize : public FunctionPass {
2181 /// Pass identification, replacement for typeid
2182 static char ID;
2183
2184 LoopVectorizePass Impl;
2185
2186 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2187 bool VectorizeOnlyWhenForced = false)
2188 : FunctionPass(ID),
2189 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2190 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2191 }
2192
2193 bool runOnFunction(Function &F) override {
2194 if (skipFunction(F))
2195 return false;
2196
2197 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2198 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2199 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2200 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2201 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2202 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2203 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2204 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2205 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2206 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2207 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2208 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2209 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2210
2211 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2212 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2213
2214 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2215 GetLAA, *ORE, PSI).MadeAnyChange;
2216 }
2217
2218 void getAnalysisUsage(AnalysisUsage &AU) const override {
2219 AU.addRequired<AssumptionCacheTracker>();
2220 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2221 AU.addRequired<DominatorTreeWrapperPass>();
2222 AU.addRequired<LoopInfoWrapperPass>();
2223 AU.addRequired<ScalarEvolutionWrapperPass>();
2224 AU.addRequired<TargetTransformInfoWrapperPass>();
2225 AU.addRequired<AAResultsWrapperPass>();
2226 AU.addRequired<LoopAccessLegacyAnalysis>();
2227 AU.addRequired<DemandedBitsWrapperPass>();
2228 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2229 AU.addRequired<InjectTLIMappingsLegacy>();
2230
2231 // We currently do not preserve loopinfo/dominator analyses with outer loop
2232 // vectorization. Until this is addressed, mark these analyses as preserved
2233 // only for non-VPlan-native path.
2234 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2235 if (!EnableVPlanNativePath) {
2236 AU.addPreserved<LoopInfoWrapperPass>();
2237 AU.addPreserved<DominatorTreeWrapperPass>();
2238 }
2239
2240 AU.addPreserved<BasicAAWrapperPass>();
2241 AU.addPreserved<GlobalsAAWrapperPass>();
2242 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2243 }
2244};
2245
2246} // end anonymous namespace
2247
2248//===----------------------------------------------------------------------===//
2249// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2250// LoopVectorizationCostModel and LoopVectorizationPlanner.
2251//===----------------------------------------------------------------------===//
2252
2253Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2254 // We need to place the broadcast of invariant variables outside the loop,
2255 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2256 // inside the vector loop body.
2257 Instruction *Instr = dyn_cast<Instruction>(V);
2258 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2259 (!Instr ||
2260 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2261 // Place the code for broadcasting invariant variables in the new preheader.
2262 IRBuilder<>::InsertPointGuard Guard(Builder);
2263 if (SafeToHoist)
2264 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2265
2266 // Broadcast the scalar into all locations in the vector.
2267 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2268
2269 return Shuf;
2270}
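// Illustrative sketch (assumed values, for exposition only): with a fixed
// VF of 4 and a loop-invariant i32 %x, the splat above produces a <4 x i32>
// with %x in every lane, emitted in the vector preheader when hoisting is
// safe and inside the vector body otherwise.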
2271
2272void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2273 const InductionDescriptor &II, Value *Step, Value *Start,
2274 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2275 VPTransformState &State) {
2276 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2277 "Expected either an induction phi-node or a truncate of it!");
2278
2279 // Construct the initial value of the vector IV in the vector loop preheader
2280 auto CurrIP = Builder.saveIP();
2281 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2282 if (isa<TruncInst>(EntryVal)) {
2283 assert(Start->getType()->isIntegerTy() &&
2284 "Truncation requires an integer type");
2285 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2286 Step = Builder.CreateTrunc(Step, TruncType);
2287 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2288 }
2289 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2290 Value *SteppedStart =
2291 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2292
2293 // We create vector phi nodes for both integer and floating-point induction
2294 // variables. Here, we determine the kind of arithmetic we will perform.
2295 Instruction::BinaryOps AddOp;
2296 Instruction::BinaryOps MulOp;
2297 if (Step->getType()->isIntegerTy()) {
2298 AddOp = Instruction::Add;
2299 MulOp = Instruction::Mul;
2300 } else {
2301 AddOp = II.getInductionOpcode();
2302 MulOp = Instruction::FMul;
2303 }
2304
2305 // Multiply the vectorization factor by the step using integer or
2306 // floating-point arithmetic as appropriate.
2307 Type *StepType = Step->getType();
2308 if (Step->getType()->isFloatingPointTy())
2309 StepType = IntegerType::get(StepType->getContext(),
2310 StepType->getScalarSizeInBits());
2311 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2312 if (Step->getType()->isFloatingPointTy())
2313 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2314 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2315
2316 // Create a vector splat to use in the induction update.
2317 //
2318 // FIXME: If the step is non-constant, we create the vector splat with
2319 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2320 // handle a constant vector splat.
2321 Value *SplatVF = isa<Constant>(Mul)
2322 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2323 : Builder.CreateVectorSplat(VF, Mul);
2324 Builder.restoreIP(CurrIP);
2325
2326 // We may need to add the step a number of times, depending on the unroll
2327 // factor. The last of those goes into the PHI.
2328 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2329 &*LoopVectorBody->getFirstInsertionPt());
2330 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2331 Instruction *LastInduction = VecInd;
2332 for (unsigned Part = 0; Part < UF; ++Part) {
2333 State.set(Def, LastInduction, Part);
2334
2335 if (isa<TruncInst>(EntryVal))
2336 addMetadata(LastInduction, EntryVal);
2337 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2338 State, Part);
2339
2340 LastInduction = cast<Instruction>(
2341 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2342 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2343 }
2344
2345 // Move the last step to the end of the latch block. This ensures consistent
2346 // placement of all induction updates.
2347 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2348 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2349 auto *ICmp = cast<Instruction>(Br->getCondition());
2350 LastInduction->moveBefore(ICmp);
2351 LastInduction->setName("vec.ind.next");
2352
2353 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2354 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2355}
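// Illustrative sketch (assumed values, for exposition only): for an integer
// IV with Start = 0, Step = 1, a fixed VF of 4 and UF of 2, the code above
// gives part 0 the "vec.ind" phi <0,1,2,3>, part 1 the "step.add" value
// <4,5,6,7>, and feeds "vec.ind.next" = <8,9,10,11> back into the phi from
// the latch for the next vector iteration.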
2356
2357bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2358 return Cost->isScalarAfterVectorization(I, VF) ||
2359 Cost->isProfitableToScalarize(I, VF);
2360}
2361
2362bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2363 if (shouldScalarizeInstruction(IV))
2364 return true;
2365 auto isScalarInst = [&](User *U) -> bool {
2366 auto *I = cast<Instruction>(U);
2367 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2368 };
2369 return llvm::any_of(IV->users(), isScalarInst);
2370}
2371
2372void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2373 const InductionDescriptor &ID, const Instruction *EntryVal,
2374 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2375 unsigned Part, unsigned Lane) {
2376 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2377 "Expected either an induction phi-node or a truncate of it!");
2378
2379 // This induction variable is not the phi from the original loop but the
2380 // newly-created IV, based on the proof that the casted Phi is equal to the
2381 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2382 // re-uses the same InductionDescriptor as the original IV, but we don't
2383 // have to do any recording in this case - that is done when the original
2384 // IV is processed.
2385 if (isa<TruncInst>(EntryVal))
2386 return;
2387
2388 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2389 if (Casts.empty())
2390 return;
2391 // Only the first Cast instruction in the Casts vector is of interest.
2392 // The rest of the Casts (if they exist) have no uses outside the
2393 // induction update chain itself.
2394 if (Lane < UINT_MAX)
2395 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2396 else
2397 State.set(CastDef, VectorLoopVal, Part);
2398}
2399
2400void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2401 TruncInst *Trunc, VPValue *Def,
2402 VPValue *CastDef,
2403 VPTransformState &State) {
2404 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2405 "Primary induction variable must have an integer type");
2406
2407 auto II = Legal->getInductionVars().find(IV);
2408 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2409
2410 auto ID = II->second;
2411 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2412
2413 // The value from the original loop to which we are mapping the new induction
2414 // variable.
2415 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2416
2417 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2418
2419 // Generate code for the induction step. Note that induction steps are
2420 // required to be loop-invariant.
2421 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2422 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2423 "Induction step should be loop invariant");
2424 if (PSE.getSE()->isSCEVable(IV->getType())) {
2425 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2426 return Exp.expandCodeFor(Step, Step->getType(),
2427 LoopVectorPreHeader->getTerminator());
2428 }
2429 return cast<SCEVUnknown>(Step)->getValue();
2430 };
2431
2432 // The scalar value to broadcast. This is derived from the canonical
2433 // induction variable. If a truncation type is given, truncate the canonical
2434 // induction variable and step. Otherwise, derive these values from the
2435 // induction descriptor.
2436 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2437 Value *ScalarIV = Induction;
2438 if (IV != OldInduction) {
2439 ScalarIV = IV->getType()->isIntegerTy()
2440 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2441 : Builder.CreateCast(Instruction::SIToFP, Induction,
2442 IV->getType());
2443 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2444 ScalarIV->setName("offset.idx");
2445 }
2446 if (Trunc) {
2447 auto *TruncType = cast<IntegerType>(Trunc->getType());
2448 assert(Step->getType()->isIntegerTy() &&
2449 "Truncation requires an integer step");
2450 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2451 Step = Builder.CreateTrunc(Step, TruncType);
2452 }
2453 return ScalarIV;
2454 };
2455
2456 // Create the vector values from the scalar IV, in the absence of creating a
2457 // vector IV.
2458 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2459 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2460 for (unsigned Part = 0; Part < UF; ++Part) {
2461 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2462 Value *EntryPart =
2463 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2464 ID.getInductionOpcode());
2465 State.set(Def, EntryPart, Part);
2466 if (Trunc)
2467 addMetadata(EntryPart, Trunc);
2468 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2469 State, Part);
2470 }
2471 };
2472
2473 // Fast-math-flags propagate from the original induction instruction.
2474 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2475 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2476 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2477
2478 // Now do the actual transformations, and start with creating the step value.
2479 Value *Step = CreateStepValue(ID.getStep());
2480 if (VF.isZero() || VF.isScalar()) {
2481 Value *ScalarIV = CreateScalarIV(Step);
2482 CreateSplatIV(ScalarIV, Step);
2483 return;
2484 }
2485
2486 // Determine if we want a scalar version of the induction variable. This is
2487 // true if the induction variable itself is not widened, or if it has at
2488 // least one user in the loop that is not widened.
2489 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2490 if (!NeedsScalarIV) {
2491 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2492 State);
2493 return;
2494 }
2495
2496 // Try to create a new independent vector induction variable. If we can't
2497 // create the phi node, we will splat the scalar induction variable in each
2498 // loop iteration.
2499 if (!shouldScalarizeInstruction(EntryVal)) {
2500 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2501 State);
2502 Value *ScalarIV = CreateScalarIV(Step);
2503 // Create scalar steps that can be used by instructions we will later
2504 // scalarize. Note that the addition of the scalar steps will not increase
2505 // the number of instructions in the loop in the common case prior to
2506 // InstCombine. We will be trading one vector extract for each scalar step.
2507 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2508 return;
2509 }
2510
2511 // All IV users are scalar instructions, so only emit a scalar IV, not a
2512 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2513 // predicate used by the masked loads/stores.
2514 Value *ScalarIV = CreateScalarIV(Step);
2515 if (!Cost->isScalarEpilogueAllowed())
2516 CreateSplatIV(ScalarIV, Step);
2517 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2518}
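// Summary sketch of the cases above (assumed behaviour of the cost model):
// if no user of the IV stays scalar, only the vector IV is created; if the
// IV itself is widened but some users stay scalar, both the vector IV and
// per-lane scalar steps are emitted; if every in-loop user is scalar, only
// the scalar steps are built, plus a splat IV when the tail is folded by
// masking (i.e. a scalar epilogue is not allowed).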
2519
2520Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2521 Instruction::BinaryOps BinOp) {
2522 // Create and check the types.
2523 auto *ValVTy = cast<VectorType>(Val->getType());
2524 ElementCount VLen = ValVTy->getElementCount();
2525
2526 Type *STy = Val->getType()->getScalarType();
2527 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2528 "Induction Step must be an integer or FP");
2529 assert(Step->getType() == STy && "Step has wrong type");
2530
2531 SmallVector<Constant *, 8> Indices;
2532
2533 // Create a vector of consecutive numbers from zero to VF.
2534 VectorType *InitVecValVTy = ValVTy;
2535 Type *InitVecValSTy = STy;
2536 if (STy->isFloatingPointTy()) {
2537 InitVecValSTy =
2538 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2539 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2540 }
2541 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2542
2543 // Add on StartIdx
2544 Value *StartIdxSplat = Builder.CreateVectorSplat(
2545 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2546 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2547
2548 if (STy->isIntegerTy()) {
2549 Step = Builder.CreateVectorSplat(VLen, Step);
2550 assert(Step->getType() == Val->getType() && "Invalid step vec");
2551 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2552 // which can be found from the original scalar operations.
2553 Step = Builder.CreateMul(InitVec, Step);
2554 return Builder.CreateAdd(Val, Step, "induction");
2555 }
2556
2557 // Floating point induction.
2558 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2559 "Binary Opcode should be specified for FP induction");
2560 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2561 Step = Builder.CreateVectorSplat(VLen, Step);
2562 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2563 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2564}
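// Worked example (assumed values, fixed VF of 4): with Val = splat(%a),
// StartIdx = 4 and Step = %s, the integer path produces
// <a + 4*s, a + 5*s, a + 6*s, a + 7*s>, i.e. lane i receives
// Val[i] + (StartIdx + i) * Step; the FP path does the same with FMul and
// the requested FAdd/FSub opcode.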
2565
2566void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2567 Instruction *EntryVal,
2568 const InductionDescriptor &ID,
2569 VPValue *Def, VPValue *CastDef,
2570 VPTransformState &State) {
2571 // We shouldn't have to build scalar steps if we aren't vectorizing.
2572 assert(VF.isVector() && "VF should be greater than one");
2573 // Get the value type and ensure it and the step have the same integer type.
2574 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2575 assert(ScalarIVTy == Step->getType() &&
2576 "Val and Step should have the same type");
2577
2578 // We build scalar steps for both integer and floating-point induction
2579 // variables. Here, we determine the kind of arithmetic we will perform.
2580 Instruction::BinaryOps AddOp;
2581 Instruction::BinaryOps MulOp;
2582 if (ScalarIVTy->isIntegerTy()) {
2583 AddOp = Instruction::Add;
2584 MulOp = Instruction::Mul;
2585 } else {
2586 AddOp = ID.getInductionOpcode();
2587 MulOp = Instruction::FMul;
2588 }
2589
2590 // Determine the number of scalars we need to generate for each unroll
2591 // iteration. If EntryVal is uniform, we only need to generate the first
2592 // lane. Otherwise, we generate all VF values.
2593 bool IsUniform =
2594 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2595 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2596 // Compute the scalar steps and save the results in State.
2597 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2598 ScalarIVTy->getScalarSizeInBits());
2599 Type *VecIVTy = nullptr;
2600 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2601 if (!IsUniform && VF.isScalable()) {
2602 VecIVTy = VectorType::get(ScalarIVTy, VF);
2603 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2604 SplatStep = Builder.CreateVectorSplat(VF, Step);
2605 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2606 }
2607
2608 for (unsigned Part = 0; Part < UF; ++Part) {
2609 Value *StartIdx0 =
2610 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2611
2612 if (!IsUniform && VF.isScalable()) {
2613 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2614 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2615 if (ScalarIVTy->isFloatingPointTy())
2616 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2617 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2618 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2619 State.set(Def, Add, Part);
2620 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2621 Part);
2622 // It's also useful to record the lane values for the known minimum number
2623 // of elements, so we do that below. This improves the code quality when
2624 // extracting the first element, for example.
2625 }
2626
2627 if (ScalarIVTy->isFloatingPointTy())
2628 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2629
2630 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2631 Value *StartIdx = Builder.CreateBinOp(
2632 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2633 // The step returned by `createStepForVF` is a runtime-evaluated value
2634 // when VF is scalable. Otherwise, it should be folded into a Constant.
2635 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2636 "Expected StartIdx to be folded to a constant when VF is not "
2637 "scalable");
2638 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2639 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2640 State.set(Def, Add, VPIteration(Part, Lane));
2641 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2642 Part, Lane);
2643 }
2644 }
2645}
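// Worked example (assumed values, fixed VF of 4, UF of 2, Step = 1): lane L
// of part P receives ScalarIV + (P * 4 + L) * Step, so part 0 yields
// iv+0 .. iv+3 and part 1 yields iv+4 .. iv+7; when EntryVal is uniform
// after vectorization only lane 0 of each part is materialized.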
2646
2647void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2648 const VPIteration &Instance,
2649 VPTransformState &State) {
2650 Value *ScalarInst = State.get(Def, Instance);
2651 Value *VectorValue = State.get(Def, Instance.Part);
2652 VectorValue = Builder.CreateInsertElement(
2653 VectorValue, ScalarInst,
2654 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2655 State.set(Def, VectorValue, Instance.Part);
2656}
2657
2658Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2659 assert(Vec->getType()->isVectorTy() && "Invalid type");
2660 return Builder.CreateVectorReverse(Vec, "reverse");
2661}
2662
2663// Return whether we allow using masked interleave-groups (for dealing with
2664// strided loads/stores that reside in predicated blocks, or for dealing
2665// with gaps).
2666static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2667 // If an override option has been passed in for interleaved accesses, use it.
2668 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2669 return EnableMaskedInterleavedMemAccesses;
2670
2671 return TTI.enableMaskedInterleavedAccessVectorization();
2672}
2673
2674// Try to vectorize the interleave group that \p Instr belongs to.
2675//
2676// E.g. Translate following interleaved load group (factor = 3):
2677// for (i = 0; i < N; i+=3) {
2678// R = Pic[i]; // Member of index 0
2679// G = Pic[i+1]; // Member of index 1
2680// B = Pic[i+2]; // Member of index 2
2681// ... // do something to R, G, B
2682// }
2683// To:
2684// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2685// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2686// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2687// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2688//
2689// Or translate following interleaved store group (factor = 3):
2690// for (i = 0; i < N; i+=3) {
2691// ... do something to R, G, B
2692// Pic[i] = R; // Member of index 0
2693// Pic[i+1] = G; // Member of index 1
2694// Pic[i+2] = B; // Member of index 2
2695// }
2696// To:
2697// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2698// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2699// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2700// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2701// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2702void InnerLoopVectorizer::vectorizeInterleaveGroup(
2703 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2704 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2705 VPValue *BlockInMask) {
2706 Instruction *Instr = Group->getInsertPos();
2707 const DataLayout &DL = Instr->getModule()->getDataLayout();
2708
2709 // Prepare for the vector type of the interleaved load/store.
2710 Type *ScalarTy = getLoadStoreType(Instr);
2711 unsigned InterleaveFactor = Group->getFactor();
2712 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2713 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2714
2715 // Prepare for the new pointers.
2716 SmallVector<Value *, 2> AddrParts;
2717 unsigned Index = Group->getIndex(Instr);
2718
2719 // TODO: extend the masked interleaved-group support to reversed access.
2720 assert((!BlockInMask || !Group->isReverse()) &&
2721 "Reversed masked interleave-group not supported.");
2722
2723 // If the group is reverse, adjust the index to refer to the last vector lane
2724 // instead of the first. We adjust the index from the first vector lane,
2725 // rather than directly getting the pointer for lane VF - 1, because the
2726 // pointer operand of the interleaved access is supposed to be uniform. For
2727 // uniform instructions, we're only required to generate a value for the
2728 // first vector lane in each unroll iteration.
2729 if (Group->isReverse())
2730 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2731
2732 for (unsigned Part = 0; Part < UF; Part++) {
2733 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2734 setDebugLocFromInst(AddrPart);
2735
2736 // Note that the current instruction could be at any index in the group, so
2737 // we need to adjust the address to the member of index 0.
2738 //
2739 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2740 // b = A[i]; // Member of index 0
2741 // The current pointer points to A[i+1]; adjust it to A[i].
2742 //
2743 // E.g. A[i+1] = a; // Member of index 1
2744 // A[i] = b; // Member of index 0
2745 // A[i+2] = c; // Member of index 2 (Current instruction)
2746 // The current pointer points to A[i+2]; adjust it to A[i].
2747
2748 bool InBounds = false;
2749 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2750 InBounds = gep->isInBounds();
2751 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2752 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2753
2754 // Cast to the vector pointer type.
2755 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2756 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2757 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2758 }
2759
2760 setDebugLocFromInst(Instr);
2761 Value *PoisonVec = PoisonValue::get(VecTy);
2762
2763 Value *MaskForGaps = nullptr;
2764 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2765 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2766 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2767 }
2768
2769 // Vectorize the interleaved load group.
2770 if (isa<LoadInst>(Instr)) {
2771 // For each unroll part, create a wide load for the group.
2772 SmallVector<Value *, 2> NewLoads;
2773 for (unsigned Part = 0; Part < UF; Part++) {
2774 Instruction *NewLoad;
2775 if (BlockInMask || MaskForGaps) {
2776 assert(useMaskedInterleavedAccesses(*TTI) &&
2777 "masked interleaved groups are not allowed.");
2778 Value *GroupMask = MaskForGaps;
2779 if (BlockInMask) {
2780 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2781 Value *ShuffledMask = Builder.CreateShuffleVector(
2782 BlockInMaskPart,
2783 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2784 "interleaved.mask");
2785 GroupMask = MaskForGaps
2786 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2787 MaskForGaps)
2788 : ShuffledMask;
2789 }
2790 NewLoad =
2791 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2792 GroupMask, PoisonVec, "wide.masked.vec");
2793 }
2794 else
2795 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2796 Group->getAlign(), "wide.vec");
2797 Group->addMetadata(NewLoad);
2798 NewLoads.push_back(NewLoad);
2799 }
2800
2801 // For each member in the group, shuffle out the appropriate data from the
2802 // wide loads.
2803 unsigned J = 0;
2804 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2805 Instruction *Member = Group->getMember(I);
2806
2807 // Skip the gaps in the group.
2808 if (!Member)
2809 continue;
2810
2811 auto StrideMask =
2812 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2813 for (unsigned Part = 0; Part < UF; Part++) {
2814 Value *StridedVec = Builder.CreateShuffleVector(
2815 NewLoads[Part], StrideMask, "strided.vec");
2816
2817 // If this member has a different type, cast the result to that type.
2818 if (Member->getType() != ScalarTy) {
2819 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2820 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2821 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2822 }
2823
2824 if (Group->isReverse())
2825 StridedVec = reverseVector(StridedVec);
2826
2827 State.set(VPDefs[J], StridedVec, Part);
2828 }
2829 ++J;
2830 }
2831 return;
2832 }
2833
2834 // The sub vector type for the current instruction.
2835 auto *SubVT = VectorType::get(ScalarTy, VF);
2836
2837 // Vectorize the interleaved store group.
2838 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2839 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2840 "masked interleaved groups are not allowed.");
2841 assert((!MaskForGaps || !VF.isScalable()) &&
2842 "masking gaps for scalable vectors is not yet supported.");
2843 for (unsigned Part = 0; Part < UF; Part++) {
2844 // Collect the stored vector from each member.
2845 SmallVector<Value *, 4> StoredVecs;
2846 for (unsigned i = 0; i < InterleaveFactor; i++) {
2847 assert((Group->getMember(i) || MaskForGaps) &&
2848 "Fail to get a member from an interleaved store group");
2849 Instruction *Member = Group->getMember(i);
2850
2851 // Skip the gaps in the group.
2852 if (!Member) {
2853 Value *Undef = PoisonValue::get(SubVT);
2854 StoredVecs.push_back(Undef);
2855 continue;
2856 }
2857
2858 Value *StoredVec = State.get(StoredValues[i], Part);
2859
2860 if (Group->isReverse())
2861 StoredVec = reverseVector(StoredVec);
2862
2863 // If this member has a different type, cast it to the unified type.
2864
2865 if (StoredVec->getType() != SubVT)
2866 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2867
2868 StoredVecs.push_back(StoredVec);
2869 }
2870
2871 // Concatenate all vectors into a wide vector.
2872 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2873
2874 // Interleave the elements in the wide vector.
2875 Value *IVec = Builder.CreateShuffleVector(
2876 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2877 "interleaved.vec");
2878
2879 Instruction *NewStoreInstr;
2880 if (BlockInMask || MaskForGaps) {
2881 Value *GroupMask = MaskForGaps;
2882 if (BlockInMask) {
2883 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2884 Value *ShuffledMask = Builder.CreateShuffleVector(
2885 BlockInMaskPart,
2886 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2887 "interleaved.mask");
2888 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2889 ShuffledMask, MaskForGaps)
2890 : ShuffledMask;
2891 }
2892 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2893 Group->getAlign(), GroupMask);
2894 } else
2895 NewStoreInstr =
2896 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2897
2898 Group->addMetadata(NewStoreInstr);
2899 }
2900}
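// Illustrative note on the masked path (assumed values): for an interleave
// factor of 3 and a fixed VF of 4, a block mask <m0,m1,m2,m3> is shuffled by
// createReplicatedMask(3, 4) into the 12-lane "interleaved.mask"
// <m0,m0,m0, m1,m1,m1, m2,m2,m2, m3,m3,m3>, and is additionally ANDed with
// MaskForGaps when some group members are missing.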
2901
2902void InnerLoopVectorizer::vectorizeMemoryInstruction(
2903 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2904 VPValue *StoredValue, VPValue *BlockInMask) {
2905 // Attempt to issue a wide load.
2906 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2907 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2908
2909 assert((LI || SI) && "Invalid Load/Store instruction");
2910 assert((!SI || StoredValue) && "No stored value provided for widened store");
2911 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2912
2913 LoopVectorizationCostModel::InstWidening Decision =
2914 Cost->getWideningDecision(Instr, VF);
2915 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2916 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2917 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2918 "CM decision is not to widen the memory instruction");
2919
2920 Type *ScalarDataTy = getLoadStoreType(Instr);
2921
2922 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2923 const Align Alignment = getLoadStoreAlignment(Instr);
2924
2925 // Determine if the pointer operand of the access is either consecutive or
2926 // reverse consecutive.
2927 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2928 bool ConsecutiveStride =
2929 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2930 bool CreateGatherScatter =
2931 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2932
2933 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2934 // gather/scatter. Otherwise Decision should have been to Scalarize.
2935 assert((ConsecutiveStride || CreateGatherScatter) &&
2936 "The instruction should be scalarized");
2937 (void)ConsecutiveStride;
2938
2939 VectorParts BlockInMaskParts(UF);
2940 bool isMaskRequired = BlockInMask;
2941 if (isMaskRequired)
2942 for (unsigned Part = 0; Part < UF; ++Part)
2943 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2944
2945 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2946 // Calculate the pointer for the specific unroll-part.
2947 GetElementPtrInst *PartPtr = nullptr;
2948
2949 bool InBounds = false;
2950 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2951 InBounds = gep->isInBounds();
2952 if (Reverse) {
2953 // If the address is consecutive but reversed, then the
2954 // wide store needs to start at the last vector element.
2955 // RunTimeVF = VScale * VF.getKnownMinValue()
2956 // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
2957 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2958 // NumElt = -Part * RunTimeVF
2959 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2960 // LastLane = 1 - RunTimeVF
2961 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2962 PartPtr =
2963 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2964 PartPtr->setIsInBounds(InBounds);
2965 PartPtr = cast<GetElementPtrInst>(
2966 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2967 PartPtr->setIsInBounds(InBounds);
2968 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2969 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2970 } else {
2971 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2972 PartPtr = cast<GetElementPtrInst>(
2973 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2974 PartPtr->setIsInBounds(InBounds);
2975 }
2976
2977 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2978 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2979 };
2980
2981 // Handle Stores:
2982 if (SI) {
2983 setDebugLocFromInst(SI);
2984
2985 for (unsigned Part = 0; Part < UF; ++Part) {
2986 Instruction *NewSI = nullptr;
2987 Value *StoredVal = State.get(StoredValue, Part);
2988 if (CreateGatherScatter) {
2989 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2990 Value *VectorGep = State.get(Addr, Part);
2991 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2992 MaskPart);
2993 } else {
2994 if (Reverse) {
2995 // If we store to reverse consecutive memory locations, then we need
2996 // to reverse the order of elements in the stored value.
2997 StoredVal = reverseVector(StoredVal);
2998 // We don't want to update the value in the map as it might be used in
2999 // another expression. So don't call resetVectorValue(StoredVal).
3000 }
3001 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3002 if (isMaskRequired)
3003 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3004 BlockInMaskParts[Part]);
3005 else
3006 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3007 }
3008 addMetadata(NewSI, SI);
3009 }
3010 return;
3011 }
3012
3013 // Handle loads.
3014 assert(LI && "Must have a load instruction");
3015 setDebugLocFromInst(LI);
3016 for (unsigned Part = 0; Part < UF; ++Part) {
3017 Value *NewLI;
3018 if (CreateGatherScatter) {
3019 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
3020 Value *VectorGep = State.get(Addr, Part);
3021 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
3022 nullptr, "wide.masked.gather");
3023 addMetadata(NewLI, LI);
3024 } else {
3025 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3026 if (isMaskRequired)
3027 NewLI = Builder.CreateMaskedLoad(
3028 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
3029 PoisonValue::get(DataTy), "wide.masked.load");
3030 else
3031 NewLI =
3032 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3033
3034 // Add metadata to the load, but setVectorValue to the reverse shuffle.
3035 addMetadata(NewLI, LI);
3036 if (Reverse)
3037 NewLI = reverseVector(NewLI);
3038 }
3039
3040 State.set(Def, NewLI, Part);
3041 }
3042}
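// Illustrative note on the reverse case (assumed values, fixed VF of 4):
// CreateVecPtr(Part, Ptr) offsets Ptr by -Part * 4 and then by 1 - 4, giving
// Ptr - 3 for part 0 and Ptr - 7 for part 1, so each wide access starts at
// the lowest address of its group of 4 elements; the mask and the loaded or
// stored value are then reversed to match the original element order.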
3043
3044void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3045 VPUser &User,
3046 const VPIteration &Instance,
3047 bool IfPredicateInstr,
3048 VPTransformState &State) {
3049 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3050
3051 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3052 // the first lane and part.
3053 if (isa<NoAliasScopeDeclInst>(Instr))
3054 if (!Instance.isFirstIteration())
3055 return;
3056
3057 setDebugLocFromInst(Instr);
3058
3059 // Does this instruction return a value?
3060 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3061
3062 Instruction *Cloned = Instr->clone();
3063 if (!IsVoidRetTy)
3064 Cloned->setName(Instr->getName() + ".cloned");
3065
3066 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3067 Builder.GetInsertPoint());
3068 // Replace the operands of the cloned instructions with their scalar
3069 // equivalents in the new loop.
3070 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3071 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3072 auto InputInstance = Instance;
3073 if (!Operand || !OrigLoop->contains(Operand) ||
3074 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3075 InputInstance.Lane = VPLane::getFirstLane();
3076 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3077 Cloned->setOperand(op, NewOp);
3078 }
3079 addNewMetadata(Cloned, Instr);
3080
3081 // Place the cloned scalar in the new loop.
3082 Builder.Insert(Cloned);
3083
3084 State.set(Def, Cloned, Instance);
3085
3086 // If we just cloned a new assumption, add it to the assumption cache.
3087 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3088 AC->registerAssumption(II);
3089
3090 // End if-block.
3091 if (IfPredicateInstr)
3092 PredicatedInstructions.push_back(Cloned);
3093}
3094
3095PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3096 Value *End, Value *Step,
3097 Instruction *DL) {
3098 BasicBlock *Header = L->getHeader();
3099 BasicBlock *Latch = L->getLoopLatch();
3100 // As we're just creating this loop, it's possible no latch exists
3101 // yet. If so, use the header as this will be a single block loop.
3102 if (!Latch)
3103 Latch = Header;
3104
3105 IRBuilder<> B(&*Header->getFirstInsertionPt());
3106 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3107 setDebugLocFromInst(OldInst, &B);
3108 auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3109
3110 B.SetInsertPoint(Latch->getTerminator());
3111 setDebugLocFromInst(OldInst, &B);
3112
3113 // Create i+1 and fill the PHINode.
3114 //
3115 // If the tail is not folded, we know that End - Start >= Step (either
3116 // statically or through the minimum iteration checks). We also know that both
3117 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3118 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3119 // overflows and we can mark the induction increment as NUW.
3120 Value *Next = B.CreateAdd(Induction, Step, "index.next",
3121 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3122 Induction->addIncoming(Start, L->getLoopPreheader());
3123 Induction->addIncoming(Next, Latch);
3124 // Create the compare.
3125 Value *ICmp = B.CreateICmpEQ(Next, End);
3126 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3127
3128 // Now we have two terminators. Remove the old one from the block.
3129 Latch->getTerminator()->eraseFromParent();
3130
3131 return Induction;
3132}
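
In scalar terms, the induction variable and exit test built above behave roughly like the following sketch; it is illustrative only and assumes the tail is not folded, so the increment cannot wrap before the exit test fires:

#include <cstdint>
#include <functional>

// Sketch only: scalar shape of the canonical vector-loop induction built
// above. Start, End and Step correspond to the same-named values; the Body
// callback stands in for the vectorized loop body.
inline void vectorLoopShape(uint64_t Start, uint64_t End, uint64_t Step,
                            const std::function<void(uint64_t)> &Body) {
  uint64_t Index = Start;   // %index = phi [ Start, preheader ], [ %index.next, latch ]
  while (true) {
    Body(Index);
    Index += Step;          // %index.next (NUW when the tail is not folded)
    if (Index == End)       // br i1 (%index.next == %End), exit, header
      break;
  }
}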
3133
3134Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3135 if (TripCount)
3136 return TripCount;
3137
3138 assert(L && "Create Trip Count for null loop.");
3139 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3140 // Find the loop boundaries.
3141 ScalarEvolution *SE = PSE.getSE();
3142 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3143 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3144 "Invalid loop count");
3145
3146 Type *IdxTy = Legal->getWidestInductionType();
3147 assert(IdxTy && "No type for induction");
3148
3149 // The exit count might have the type of i64 while the phi is i32. This can
3150 // happen if we have an induction variable that is sign extended before the
3151 // compare. The only way that we get a backedge taken count is that the
3152 // induction variable was signed and as such will not overflow. In such a case
3153 // truncation is legal.
3154 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3155 IdxTy->getPrimitiveSizeInBits())
3156 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3157 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3158
3159 // Get the total trip count from the count by adding 1.
3160 const SCEV *ExitCount = SE->getAddExpr(
3161 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3162
3163 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3164
3165 // Expand the trip count and place the new instructions in the preheader.
3166 // Notice that the pre-header does not change, only the loop body.
3167 SCEVExpander Exp(*SE, DL, "induction");
3168
3169 // Count holds the overall loop count (N).
3170 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3171 L->getLoopPreheader()->getTerminator());
3172
3173 if (TripCount->getType()->isPointerTy())
3174 TripCount =
3175 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3176 L->getLoopPreheader()->getTerminator());
3177
3178 return TripCount;
3179}
3180
3181Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3182 if (VectorTripCount)
3183 return VectorTripCount;
3184
3185 Value *TC = getOrCreateTripCount(L);
3186 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3187
3188 Type *Ty = TC->getType();
3189 // This is where we can make the step a runtime constant.
3190 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3191
3192 // If the tail is to be folded by masking, round the number of iterations N
3193 // up to a multiple of Step instead of rounding down. This is done by first
3194 // adding Step-1 and then rounding down. Note that it's ok if this addition
3195 // overflows: the vector induction variable will eventually wrap to zero given
3196 // that it starts at zero and its Step is a power of two; the loop will then
3197 // exit, with the last early-exit vector comparison also producing all-true.
3198 if (Cost->foldTailByMasking()) {
3199 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3200 "VF*UF must be a power of 2 when folding tail by masking");
3201 assert(!VF.isScalable() &&
3202 "Tail folding not yet supported for scalable vectors");
3203 TC = Builder.CreateAdd(
3204 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3205 }
3206
3207 // Now we need to generate the expression for the part of the loop that the
3208 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3209 // iterations are not required for correctness, or N - Step, otherwise. Step
3210 // is equal to the vectorization factor (number of SIMD elements) times the
3211 // unroll factor (number of SIMD instructions).
3212 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3213
3214 // There are cases where we *must* run at least one iteration in the remainder
3215 // loop. See the cost model for when this can happen. If the step evenly
3216 // divides the trip count, we set the remainder to be equal to the step. If
3217 // the step does not evenly divide the trip count, no adjustment is necessary
3218 // since there will already be scalar iterations. Note that the minimum
3219 // iterations check ensures that N >= Step.
3220 if (Cost->requiresScalarEpilogue(VF)) {
3221 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3222 R = Builder.CreateSelect(IsZero, Step, R);
3223 }
3224
3225 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3226
3227 return VectorTripCount;
3228}
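
The IR emitted above computes, in effect, the following scalar arithmetic; this is a minimal sketch assuming a fixed-width VF, and the function name is hypothetical:

#include <cassert>
#include <cstdint>

// Sketch only: scalar equivalent of the vector trip count computation above.
inline uint64_t vectorTripCount(uint64_t TC, uint64_t VF, uint64_t UF,
                                bool FoldTailByMasking,
                                bool RequiresScalarEpilogue) {
  uint64_t Step = VF * UF;
  if (FoldTailByMasking) {
    assert((Step & (Step - 1)) == 0 && "VF*UF must be a power of 2");
    TC += Step - 1;                     // n.rnd.up (wrap-around is benign, see above)
  }
  uint64_t R = TC % Step;               // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                           // force at least one scalar iteration
  return TC - R;                        // n.vec
}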
3229
3230Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3231 const DataLayout &DL) {
3232 // Verify that V is a vector type with same number of elements as DstVTy.
3233 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3234 unsigned VF = DstFVTy->getNumElements();
3235 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3236 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3237 Type *SrcElemTy = SrcVecTy->getElementType();
3238 Type *DstElemTy = DstFVTy->getElementType();
3239 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3240 "Vector elements must have same size");
3241
3242 // Do a direct cast if element types are castable.
3243 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3244 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3245 }
3246 // V cannot be directly cast to the desired vector type.
3247 // This may happen when V is a floating-point vector but DstVTy is a vector of
3248 // pointers, or vice versa. Handle it with a two-step bitcast through an
3249 // intermediate integer type, i.e. Ptr <-> Int <-> Float.
3250 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3251 "Only one type should be a pointer type");
3252 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3253 "Only one type should be a floating point type");
3254 Type *IntTy =
3255 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3256 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3257 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3258 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3259}
3260
3261void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3262 BasicBlock *Bypass) {
3263 Value *Count = getOrCreateTripCount(L);
3264 // Reuse existing vector loop preheader for TC checks.
3265 // Note that new preheader block is generated for vector loop.
3266 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3267 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3268
3269 // Generate code to check if the loop's trip count is less than VF * UF, or
3270 // equal to it in case a scalar epilogue is required; this implies that the
3271 // vector trip count is zero. This check also covers the case where adding one
3272 // to the backedge-taken count overflowed leading to an incorrect trip count
3273 // of zero. In this case we will also jump to the scalar loop.
3274 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3275 : ICmpInst::ICMP_ULT;
3276
3277 // If tail is to be folded, vector loop takes care of all iterations.
3278 Value *CheckMinIters = Builder.getFalse();
3279 if (!Cost->foldTailByMasking()) {
3280 Value *Step =
3281 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3282 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3283 }
3284 // Create new preheader for vector loop.
3285 LoopVectorPreHeader =
3286 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3287 "vector.ph");
3288
3289 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3290 DT->getNode(Bypass)->getIDom()) &&
3291 "TC check is expected to dominate Bypass");
3292
3293 // Update dominator for Bypass & LoopExit (if needed).
3294 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3295 if (!Cost->requiresScalarEpilogue(VF))
3296 // If there is an epilogue which must run, there's no edge from the
3297 // middle block to exit blocks and thus no need to update the immediate
3298 // dominator of the exit blocks.
3299 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3300
3301 ReplaceInstWithInst(
3302 TCCheckBlock->getTerminator(),
3303 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3304 LoopBypassBlocks.push_back(TCCheckBlock);
3305}
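
The branch condition materialized above reduces to the following check, shown here as a sketch under the assumption of a fixed-width VF; a result of true means "skip the vector loop and fall through to the scalar loop":

#include <cstdint>

// Sketch only: scalar form of the minimum-iteration check built above.
inline bool bypassVectorLoop(uint64_t Count, uint64_t VF, uint64_t UF,
                             bool FoldTailByMasking,
                             bool RequiresScalarEpilogue) {
  if (FoldTailByMasking)
    return false;                                 // the masked vector loop handles all iterations
  uint64_t Step = VF * UF;
  return RequiresScalarEpilogue ? Count <= Step   // ICMP_ULE
                                : Count < Step;   // ICMP_ULT
}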
3306
3307BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3308
3309 BasicBlock *const SCEVCheckBlock =
3310 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3311 if (!SCEVCheckBlock)
3312 return nullptr;
3313
3314 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3315 (OptForSizeBasedOnProfile &&
3316 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3317 "Cannot SCEV check stride or overflow when optimizing for size");
3318
3319
3320 // Update dominator only if this is the first RT check.
3321 if (LoopBypassBlocks.empty()) {
3322 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3323 if (!Cost->requiresScalarEpilogue(VF))
3324 // If there is an epilogue which must run, there's no edge from the
3325 // middle block to exit blocks and thus no need to update the immediate
3326 // dominator of the exit blocks.
3327 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3328 }
3329
3330 LoopBypassBlocks.push_back(SCEVCheckBlock);
3331 AddedSafetyChecks = true;
3332 return SCEVCheckBlock;
3333}
3334
3335BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3336 BasicBlock *Bypass) {
3337 // VPlan-native path does not do any analysis for runtime checks currently.
3338 if (EnableVPlanNativePath)
3339 return nullptr;
3340
3341 BasicBlock *const MemCheckBlock =
3342 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3343
3344 // Check if we generated code that checks at runtime whether arrays overlap.
3345 // We put the checks into a separate block to make the more common case of few
3346 // elements faster.
3347 if (!MemCheckBlock)
3348 return nullptr;
3349
3350 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3351 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3352 "Cannot emit memory checks when optimizing for size, unless forced "
3353 "to vectorize.");
3354 ORE->emit([&]() {
3355 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3356 L->getStartLoc(), L->getHeader())
3357 << "Code-size may be reduced by not forcing "
3358 "vectorization, or by source-code modifications "
3359 "eliminating the need for runtime checks "
3360 "(e.g., adding 'restrict').";
3361 });
3362 }
3363
3364 LoopBypassBlocks.push_back(MemCheckBlock);
3365
3366 AddedSafetyChecks = true;
3367
3368 // We currently don't use LoopVersioning for the actual loop cloning but we
3369 // still use it to add the noalias metadata.
3370 LVer = std::make_unique<LoopVersioning>(
3371 *Legal->getLAI(),
3372 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3373 DT, PSE.getSE());
3374 LVer->prepareNoAliasMetadata();
3375 return MemCheckBlock;
3376}
3377
3378Value *InnerLoopVectorizer::emitTransformedIndex(
3379 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3380 const InductionDescriptor &ID) const {
3381
3382 SCEVExpander Exp(*SE, DL, "induction");
3383 auto Step = ID.getStep();
3384 auto StartValue = ID.getStartValue();
3385 assert(Index->getType()->getScalarType() == Step->getType() &&
3386 "Index scalar type does not match StepValue type");
3387
3388 // Note: the IR at this point is broken. We cannot use SE to create any new
3389 // SCEV and then expand it, hoping that SCEV's simplification will give us
3390 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3391 // lead to various SCEV crashes. So all we can do is use the builder and rely
3392 // on InstCombine for future simplifications. Here we handle only some trivial
3393 // cases.
3394 auto CreateAdd = [&B](Value *X, Value *Y) {
3395 assert(X->getType() == Y->getType() && "Types don't match!");
3396 if (auto *CX = dyn_cast<ConstantInt>(X))
3397 if (CX->isZero())
3398 return Y;
3399 if (auto *CY = dyn_cast<ConstantInt>(Y))
3400 if (CY->isZero())
3401 return X;
3402 return B.CreateAdd(X, Y);
3403 };
3404
3405 // We allow X to be a vector type, in which case Y will potentially be
3406 // splatted into a vector with the same element count.
3407 auto CreateMul = [&B](Value *X, Value *Y) {
3408 assert(X->getType()->getScalarType() == Y->getType() &&
3409 "Types don't match!");
3410 if (auto *CX = dyn_cast<ConstantInt>(X))
3411 if (CX->isOne())
3412 return Y;
3413 if (auto *CY = dyn_cast<ConstantInt>(Y))
3414 if (CY->isOne())
3415 return X;
3416 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3417 if (XVTy && !isa<VectorType>(Y->getType()))
3418 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3419 return B.CreateMul(X, Y);
3420 };
3421
3422 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3423 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3424 // the DomTree is not kept up-to-date for additional blocks generated in the
3425 // vector loop. By using the header as insertion point, we guarantee that the
3426 // expanded instructions dominate all their uses.
3427 auto GetInsertPoint = [this, &B]() {
3428 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3429 if (InsertBB != LoopVectorBody &&
3430 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3431 return LoopVectorBody->getTerminator();
3432 return &*B.GetInsertPoint();
3433 };
3434
3435 switch (ID.getKind()) {
3436 case InductionDescriptor::IK_IntInduction: {
3437 assert(!isa<VectorType>(Index->getType()) &&
3438 "Vector indices not supported for integer inductions yet");
3439 assert(Index->getType() == StartValue->getType() &&
3440 "Index type does not match StartValue type");
3441 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3442 return B.CreateSub(StartValue, Index);
3443 auto *Offset = CreateMul(
3444 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3445 return CreateAdd(StartValue, Offset);
3446 }
3447 case InductionDescriptor::IK_PtrInduction: {
3448 assert(isa<SCEVConstant>(Step) &&
3449 "Expected constant step for pointer induction");
3450 return B.CreateGEP(
3451 StartValue->getType()->getPointerElementType(), StartValue,
3452 CreateMul(Index,
3453 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3454 GetInsertPoint())));
3455 }
3456 case InductionDescriptor::IK_FpInduction: {
3457 assert(!isa<VectorType>(Index->getType()) &&
3458 "Vector indices not supported for FP inductions yet");
3459 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3460 auto InductionBinOp = ID.getInductionBinOp();
3461 assert(InductionBinOp &&
3462 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3463 InductionBinOp->getOpcode() == Instruction::FSub) &&
3464 "Original bin op should be defined for FP induction");
3465
3466 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3467 Value *MulExp = B.CreateFMul(StepValue, Index);
3468 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3469 "induction");
3470 }
3471 case InductionDescriptor::IK_NoInduction:
3472 return nullptr;
3473 }
3474 llvm_unreachable("invalid enum");
3475}
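
Stripped of the SCEV expansion and the trivial-constant folds, the three induction kinds above compute the following values; this is an illustrative sketch with hypothetical names, and pointer inductions are shown as element-index arithmetic on the start pointer:

#include <cstdint>

// Sketch only: the value produced by emitTransformedIndex per induction kind.
inline int64_t transformedIntIndex(int64_t Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;                          // IK_IntInduction
}

inline double transformedFpIndex(double Start, double Index, double Step,
                                 bool StepIsFSub) {
  double Offset = Step * Index;                         // IK_FpInduction
  return StepIsFSub ? Start - Offset : Start + Offset;  // FAdd or FSub with StartValue
}

inline int32_t *transformedPtrIndex(int32_t *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;                          // IK_PtrInduction (GEP on StartValue)
}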
3476
3477Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3478 LoopScalarBody = OrigLoop->getHeader();
3479 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3480 assert(LoopVectorPreHeader && "Invalid loop structure");
3481 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3482 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3483 "multiple exit loop without required epilogue?");
3484
3485 LoopMiddleBlock =
3486 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3487 LI, nullptr, Twine(Prefix) + "middle.block");
3488 LoopScalarPreHeader =
3489 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3490 nullptr, Twine(Prefix) + "scalar.ph");
3491
3492 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3493
3494 // Set up the middle block terminator. Two cases:
3495 // 1) If we know that we must execute the scalar epilogue, emit an
3496 // unconditional branch.
3497 // 2) Otherwise, we must have a single unique exit block (due to how we
3498 // implement the multiple exit case). In this case, set up a conditional
3499 // branch from the middle block to the loop scalar preheader, and the
3500 // exit block. completeLoopSkeleton will update the condition to use an
3501 // iteration check, if required to decide whether to execute the remainder.
3502 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3503 BranchInst::Create(LoopScalarPreHeader) :
3504 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3505 Builder.getTrue());
3506 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3507 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3508
3509 // We intentionally don't let SplitBlock update LoopInfo, since
3510 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3511 // LoopVectorBody is explicitly added to the correct place a few lines later.
3512 LoopVectorBody =
3513 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3514 nullptr, nullptr, Twine(Prefix) + "vector.body");
3515
3516 // Update dominator for loop exit.
3517 if (!Cost->requiresScalarEpilogue(VF))
3518 // If there is an epilogue which must run, there's no edge from the
3519 // middle block to exit blocks and thus no need to update the immediate
3520 // dominator of the exit blocks.
3521 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3522
3523 // Create and register the new vector loop.
3524 Loop *Lp = LI->AllocateLoop();
3525 Loop *ParentLoop = OrigLoop->getParentLoop();
3526
3527 // Insert the new loop into the loop nest and register the new basic blocks
3528 // before calling any utilities such as SCEV that require valid LoopInfo.
3529 if (ParentLoop) {
3530 ParentLoop->addChildLoop(Lp);
3531 } else {
3532 LI->addTopLevelLoop(Lp);
3533 }
3534 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3535 return Lp;
3536}
3537
3538void InnerLoopVectorizer::createInductionResumeValues(
3539 Loop *L, Value *VectorTripCount,
3540 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3541 assert(VectorTripCount && L && "Expected valid arguments");
3542 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3543 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3544 "Inconsistent information about additional bypass.");
3545 // We are going to resume the execution of the scalar loop.
3546 // Go over all of the induction variables that we found and fix the
3547 // PHIs that are left in the scalar version of the loop.
3548 // The starting values of PHI nodes depend on the counter of the last
3549 // iteration in the vectorized loop.
3550 // If we come from a bypass edge then we need to start from the original
3551 // start value.
3552 for (auto &InductionEntry : Legal->getInductionVars()) {
3553 PHINode *OrigPhi = InductionEntry.first;
3554 InductionDescriptor II = InductionEntry.second;
3555
3556 // Create phi nodes to merge from the backedge-taken check block.
3557 PHINode *BCResumeVal =
3558 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3559 LoopScalarPreHeader->getTerminator());
3560 // Copy original phi DL over to the new one.
3561 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3562 Value *&EndValue = IVEndValues[OrigPhi];
3563 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3564 if (OrigPhi == OldInduction) {
3565 // We know what the end value is.
3566 EndValue = VectorTripCount;
3567 } else {
3568 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3569
3570 // Fast-math-flags propagate from the original induction instruction.
3571 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3572 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3573
3574 Type *StepType = II.getStep()->getType();
3575 Instruction::CastOps CastOp =
3576 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3577 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3578 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3579 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3580 EndValue->setName("ind.end");
3581
3582 // Compute the end value for the additional bypass (if applicable).
3583 if (AdditionalBypass.first) {
3584 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3585 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3586 StepType, true);
3587 CRD =
3588 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3589 EndValueFromAdditionalBypass =
3590 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3591 EndValueFromAdditionalBypass->setName("ind.end");
3592 }
3593 }
3594 // The new PHI merges the original incoming value, in case of a bypass,
3595 // or the value at the end of the vectorized loop.
3596 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3597
3598 // Fix the scalar body counter (PHI node).
3599 // The old induction's phi node in the scalar body needs the truncated
3600 // value.
3601 for (BasicBlock *BB : LoopBypassBlocks)
3602 BCResumeVal->addIncoming(II.getStartValue(), BB);
3603
3604 if (AdditionalBypass.first)
3605 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3606 EndValueFromAdditionalBypass);
3607
3608 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3609 }
3610}
3611
3612BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3613 MDNode *OrigLoopID) {
3614 assert(L && "Expected valid loop.");
3615
3616 // The trip counts should be cached by now.
3617 Value *Count = getOrCreateTripCount(L);
3618 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3619
3620 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3621
3622 // Add a check in the middle block to see if we have completed
3623 // all of the iterations in the first vector loop. Three cases:
3624 // 1) If we require a scalar epilogue, there is no conditional branch as
3625 // we unconditionally branch to the scalar preheader. Do nothing.
3626 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3627 // Thus if tail is to be folded, we know we don't need to run the
3628 // remainder and we can use the previous value for the condition (true).
3629 // 3) Otherwise, construct a runtime check.
3630 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3631 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3632 Count, VectorTripCount, "cmp.n",
3633 LoopMiddleBlock->getTerminator());
3634
3635 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3636 // of the corresponding compare because they may have ended up with
3637 // different line numbers and we want to avoid awkward line stepping while
3638 // debugging, e.g. if the compare has a line number inside the loop.
3639 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3640 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3641 }
3642
3643 // Get ready to start creating new instructions into the vectorized body.
3644 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3645 "Inconsistent vector loop preheader");
3646 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3647
3648 Optional<MDNode *> VectorizedLoopID =
3649 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3650 LLVMLoopVectorizeFollowupVectorized});
3651 if (VectorizedLoopID.hasValue()) {
3652 L->setLoopID(VectorizedLoopID.getValue());
3653
3654 // Do not setAlreadyVectorized if loop attributes have been defined
3655 // explicitly.
3656 return LoopVectorPreHeader;
3657 }
3658
3659 // Keep all loop hints from the original loop on the vector loop (we'll
3660 // replace the vectorizer-specific hints below).
3661 if (MDNode *LID = OrigLoop->getLoopID())
3662 L->setLoopID(LID);
3663
3664 LoopVectorizeHints Hints(L, true, *ORE);
3665 Hints.setAlreadyVectorized();
3666
3667#ifdef EXPENSIVE_CHECKS
3668 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3669 LI->verify(*DT);
3670#endif
3671
3672 return LoopVectorPreHeader;
3673}
3674
3675BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3676 /*
3677 In this function we generate a new loop. The new loop will contain
3678 the vectorized instructions while the old loop will continue to run the
3679 scalar remainder.
3680
3681 [ ] <-- loop iteration number check.
3682 / |
3683 / v
3684 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3685 | / |
3686 | / v
3687 || [ ] <-- vector pre header.
3688 |/ |
3689 | v
3690 | [ ] \
3691 | [ ]_| <-- vector loop.
3692 | |
3693 | v
3694 \ -[ ] <--- middle-block.
3695 \/ |
3696 /\ v
3697 | ->[ ] <--- new preheader.
3698 | |
3699 (opt) v <-- edge from middle to exit iff epilogue is not required.
3700 | [ ] \
3701 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3702 \ |
3703 \ v
3704 >[ ] <-- exit block(s).
3705 ...
3706 */
3707
3708 // Get the metadata of the original loop before it gets modified.
3709 MDNode *OrigLoopID = OrigLoop->getLoopID();
3710
3711 // Workaround! Compute the trip count of the original loop and cache it
3712 // before we start modifying the CFG. This code has a systemic problem
3713 // wherein it tries to run analysis over partially constructed IR; this is
3714 // wrong, and not simply for SCEV. The trip count of the original loop
3715 // simply happens to be prone to hitting this in practice. In theory, we
3716 // can hit the same issue for any SCEV, or ValueTracking query done during
3717 // mutation. See PR49900.
3718 getOrCreateTripCount(OrigLoop);
3719
3720 // Create an empty vector loop, and prepare basic blocks for the runtime
3721 // checks.
3722 Loop *Lp = createVectorLoopSkeleton("");
3723
3724 // Now, compare the new count to zero. If it is zero skip the vector loop and
3725 // jump to the scalar loop. This check also covers the case where the
3726 // backedge-taken count is uint##_max: adding one to it will overflow leading
3727 // to an incorrect trip count of zero. In this (rare) case we will also jump
3728 // to the scalar loop.
3729 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3730
3731 // Generate the code to check any assumptions that we've made for SCEV
3732 // expressions.
3733 emitSCEVChecks(Lp, LoopScalarPreHeader);
3734
3735 // Generate the code that checks at runtime whether arrays overlap. We put the
3736 // checks into a separate block to make the more common case of few elements
3737 // faster.
3738 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3739
3740 // Some loops have a single integer induction variable, while other loops
3741 // don't. One example is C++ iterators, which often have multiple pointer
3742 // induction variables. In the code below we also support a case where we
3743 // don't have a single induction variable.
3744 //
3745 // We try to obtain an induction variable from the original loop as hard
3746 // as possible. However if we don't find one that:
3747 // - is an integer
3748 // - counts from zero, stepping by one
3749 // - is the size of the widest induction variable type
3750 // then we create a new one.
3751 OldInduction = Legal->getPrimaryInduction();
3752 Type *IdxTy = Legal->getWidestInductionType();
3753 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3754 // The loop step is equal to the vectorization factor (num of SIMD elements)
3755 // times the unroll factor (num of SIMD instructions).
3756 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3757 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3758 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3759 Induction =
3760 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3761 getDebugLocFromInstOrOperands(OldInduction));
3762
3763 // Emit phis for the new starting index of the scalar loop.
3764 createInductionResumeValues(Lp, CountRoundDown);
3765
3766 return completeLoopSkeleton(Lp, OrigLoopID);
3767}
3768
3769// Fix up external users of the induction variable. At this point, we are
3770// in LCSSA form, with all external PHIs that use the IV having one input value,
3771// coming from the remainder loop. We need those PHIs to also have a correct
3772// value for the IV when arriving directly from the middle block.
3773void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3774 const InductionDescriptor &II,
3775 Value *CountRoundDown, Value *EndValue,
3776 BasicBlock *MiddleBlock) {
3777 // There are two kinds of external IV usages - those that use the value
3778 // computed in the last iteration (the PHI) and those that use the penultimate
3779 // value (the value that feeds into the phi from the loop latch).
3780 // We allow both, but they, obviously, have different values.
3781
3782 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3783
3784 DenseMap<Value *, Value *> MissingVals;
3785
3786 // An external user of the last iteration's value should see the value that
3787 // the remainder loop uses to initialize its own IV.
3788 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3789 for (User *U : PostInc->users()) {
3790 Instruction *UI = cast<Instruction>(U);
3791 if (!OrigLoop->contains(UI)) {
3792 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3793 MissingVals[UI] = EndValue;
3794 }
3795 }
3796
3797 // An external user of the penultimate value needs to see EndValue - Step.
3798 // The simplest way to get this is to recompute it from the constituent SCEVs,
3799 // that is Start + (Step * (CRD - 1)).
3800 for (User *U : OrigPhi->users()) {
3801 auto *UI = cast<Instruction>(U);
3802 if (!OrigLoop->contains(UI)) {
3803 const DataLayout &DL =
3804 OrigLoop->getHeader()->getModule()->getDataLayout();
3805 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3806
3807 IRBuilder<> B(MiddleBlock->getTerminator());
3808
3809 // Fast-math-flags propagate from the original induction instruction.
3810 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3811 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3812
3813 Value *CountMinusOne = B.CreateSub(
3814 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3815 Value *CMO =
3816 !II.getStep()->getType()->isIntegerTy()
3817 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3818 II.getStep()->getType())
3819 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3820 CMO->setName("cast.cmo");
3821 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3822 Escape->setName("ind.escape");
3823 MissingVals[UI] = Escape;
3824 }
3825 }
3826
3827 for (auto &I : MissingVals) {
3828 PHINode *PHI = cast<PHINode>(I.first);
3829 // One corner case we have to handle is two IVs "chasing" each other,
3830 // that is %IV2 = phi [...], [ %IV1, %latch ]
3831 // In this case, if IV1 has an external use, we need to avoid adding both
3832 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3833 // don't already have an incoming value for the middle block.
3834 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3835 PHI->addIncoming(I.second, MiddleBlock);
3836 }
3837}
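The patching above is easiest to see on a small example. A minimal sketch (hypothetical IR, not taken from this file) of the two kinds of LCSSA phis fixupIVUsers completes when the middle block branches directly to the exit:

  // loop:
  //   %iv      = phi i64 [ %start, %ph ], [ %iv.next, %loop ]
  //   %iv.next = add i64 %iv, %step
  // exit:
  //   %use.last   = phi i64 [ %iv.next, %loop ]  ; gets EndValue from middle.block
  //   %use.penult = phi i64 [ %iv, %loop ]       ; gets Start + Step * (CRD - 1)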
3838
3839namespace {
3840
3841struct CSEDenseMapInfo {
3842 static bool canHandle(const Instruction *I) {
3843 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3844 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3845 }
3846
3847 static inline Instruction *getEmptyKey() {
3848 return DenseMapInfo<Instruction *>::getEmptyKey();
3849 }
3850
3851 static inline Instruction *getTombstoneKey() {
3852 return DenseMapInfo<Instruction *>::getTombstoneKey();
3853 }
3854
3855 static unsigned getHashValue(const Instruction *I) {
3856 assert(canHandle(I) && "Unknown instruction!");
3857 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3858 I->value_op_end()));
3859 }
3860
3861 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3862 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3863 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3864 return LHS == RHS;
3865 return LHS->isIdenticalTo(RHS);
3866 }
3867};
3868
3869} // end anonymous namespace
3870
3871 /// Perform CSE of induction variable instructions.
3872 static void cse(BasicBlock *BB) {
3873 // Perform simple CSE.
3874 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3875 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3876 Instruction *In = &*I++;
3877
3878 if (!CSEDenseMapInfo::canHandle(In))
3879 continue;
3880
3881 // Check if we can replace this instruction with any of the
3882 // visited instructions.
3883 if (Instruction *V = CSEMap.lookup(In)) {
3884 In->replaceAllUsesWith(V);
3885 In->eraseFromParent();
3886 continue;
3887 }
3888
3889 CSEMap[In] = In;
3890 }
3891}
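As a hedged illustration (hypothetical IR), the map-based lookup above folds structurally identical element or address computations left over from widening:

  //   %e0 = extractelement <4 x i32> %v, i32 0
  //   %e1 = extractelement <4 x i32> %v, i32 0   ; replaced by %e0 and erased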
3892
3893InstructionCost
3894LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3895 bool &NeedToScalarize) const {
3896 Function *F = CI->getCalledFunction();
3897 Type *ScalarRetTy = CI->getType();
3898 SmallVector<Type *, 4> Tys, ScalarTys;
3899 for (auto &ArgOp : CI->arg_operands())
3900 ScalarTys.push_back(ArgOp->getType());
3901
3902 // Estimate cost of scalarized vector call. The source operands are assumed
3903 // to be vectors, so we need to extract individual elements from there,
3904 // execute VF scalar calls, and then gather the result into the vector return
3905 // value.
3906 InstructionCost ScalarCallCost =
3907 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3908 if (VF.isScalar())
3909 return ScalarCallCost;
3910
3911 // Compute corresponding vector type for return value and arguments.
3912 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3913 for (Type *ScalarTy : ScalarTys)
3914 Tys.push_back(ToVectorTy(ScalarTy, VF));
3915
3916 // Compute costs of unpacking argument values for the scalar calls and
3917 // packing the return values to a vector.
3918 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3919
3920 InstructionCost Cost =
3921 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3922
3923 // If we can't emit a vector call for this function, then the currently found
3924 // cost is the cost we need to return.
3925 NeedToScalarize = true;
3926 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3927 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3928
3929 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3930 return Cost;
3931
3932 // If the corresponding vector cost is cheaper, return its cost.
3933 InstructionCost VectorCallCost =
3934 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3935 if (VectorCallCost < Cost) {
3936 NeedToScalarize = false;
3937 Cost = VectorCallCost;
3938 }
3939 return Cost;
3940}
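A worked illustration of the comparison above, using made-up numbers (these costs are hypothetical, not from any real target model):

  // VF = 4, ScalarCallCost = 10, ScalarizationCost = 6
  //   Cost = 10 * 4 + 6 = 46
  // If the VFDatabase offers a vector variant with VectorCallCost = 20, it is
  // cheaper, so NeedToScalarize is cleared and 20 is returned; otherwise 46 stands.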
3941
3942static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3943 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3944 return Elt;
3945 return VectorType::get(Elt, VF);
3946}
3947
3948InstructionCost
3949LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3950 ElementCount VF) const {
3951 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3952 assert(ID && "Expected intrinsic call!");
3953 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3954 FastMathFlags FMF;
3955 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3956 FMF = FPMO->getFastMathFlags();
3957
3958 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3959 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3960 SmallVector<Type *> ParamTys;
3961 std::transform(FTy->param_begin(), FTy->param_end(),
3962 std::back_inserter(ParamTys),
3963 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3964
3965 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3966 dyn_cast<IntrinsicInst>(CI));
3967 return TTI.getIntrinsicInstrCost(CostAttrs,
3968 TargetTransformInfo::TCK_RecipThroughput);
3969}
3970
3971static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3972 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3973 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3974 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3975}
3976
3977static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3978 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3979 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3980 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3981}
3982
3983void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3984 // For every instruction `I` in MinBWs, truncate the operands, create a
3985 // truncated version of `I` and reextend its result. InstCombine runs
3986 // later and will remove any ext/trunc pairs.
3987 SmallPtrSet<Value *, 4> Erased;
3988 for (const auto &KV : Cost->getMinimalBitwidths()) {
3989 // If the value wasn't vectorized, we must maintain the original scalar
3990 // type. The absence of the value from State indicates that it
3991 // wasn't vectorized.
3992 VPValue *Def = State.Plan->getVPValue(KV.first);
3993 if (!State.hasAnyVectorValue(Def))
3994 continue;
3995 for (unsigned Part = 0; Part < UF; ++Part) {
3996 Value *I = State.get(Def, Part);
3997 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3998 continue;
3999 Type *OriginalTy = I->getType();
4000 Type *ScalarTruncatedTy =
4001 IntegerType::get(OriginalTy->getContext(), KV.second);
4002 auto *TruncatedTy = VectorType::get(
4003 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
4004 if (TruncatedTy == OriginalTy)
4005 continue;
4006
4007 IRBuilder<> B(cast<Instruction>(I));
4008 auto ShrinkOperand = [&](Value *V) -> Value * {
4009 if (auto *ZI = dyn_cast<ZExtInst>(V))
4010 if (ZI->getSrcTy() == TruncatedTy)
4011 return ZI->getOperand(0);
4012 return B.CreateZExtOrTrunc(V, TruncatedTy);
4013 };
4014
4015 // The actual instruction modification depends on the instruction type,
4016 // unfortunately.
4017 Value *NewI = nullptr;
4018 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
4019 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
4020 ShrinkOperand(BO->getOperand(1)));
4021
4022 // Any wrapping introduced by shrinking this operation shouldn't be
4023 // considered undefined behavior. So, we can't unconditionally copy
4024 // arithmetic wrapping flags to NewI.
4025 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
4026 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
4027 NewI =
4028 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
4029 ShrinkOperand(CI->getOperand(1)));
4030 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
4031 NewI = B.CreateSelect(SI->getCondition(),
4032 ShrinkOperand(SI->getTrueValue()),
4033 ShrinkOperand(SI->getFalseValue()));
4034 } else if (auto *CI = dyn_cast<CastInst>(I)) {
4035 switch (CI->getOpcode()) {
4036 default:
4037 llvm_unreachable("Unhandled cast!");
4038 case Instruction::Trunc:
4039 NewI = ShrinkOperand(CI->getOperand(0));
4040 break;
4041 case Instruction::SExt:
4042 NewI = B.CreateSExtOrTrunc(
4043 CI->getOperand(0),
4044 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4045 break;
4046 case Instruction::ZExt:
4047 NewI = B.CreateZExtOrTrunc(
4048 CI->getOperand(0),
4049 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4050 break;
4051 }
4052 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4053 auto Elements0 =
4054 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
4055 auto *O0 = B.CreateZExtOrTrunc(
4056 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4057 auto Elements1 =
4058 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
4059 auto *O1 = B.CreateZExtOrTrunc(
4060 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4061
4062 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4063 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4064 // Don't do anything with the operands, just extend the result.
4065 continue;
4066 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4067 auto Elements =
4068 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
4069 auto *O0 = B.CreateZExtOrTrunc(
4070 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4071 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4072 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4073 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4074 auto Elements =
4075 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
4076 auto *O0 = B.CreateZExtOrTrunc(
4077 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4078 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4079 } else {
4080 // If we don't know what to do, be conservative and don't do anything.
4081 continue;
4082 }
4083
4084 // Lastly, extend the result.
4085 NewI->takeName(cast<Instruction>(I));
4086 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4087 I->replaceAllUsesWith(Res);
4088 cast<Instruction>(I)->eraseFromParent();
4089 Erased.insert(I);
4090 State.reset(Def, Res, Part);
4091 }
4092 }
4093
4094 // We'll have created a bunch of ZExts that are now parentless. Clean up.
4095 for (const auto &KV : Cost->getMinimalBitwidths()) {
4096 // If the value wasn't vectorized, we must maintain the original scalar
4097 // type. The absence of the value from State indicates that it
4098 // wasn't vectorized.
4099 VPValue *Def = State.Plan->getVPValue(KV.first);
4100 if (!State.hasAnyVectorValue(Def))
4101 continue;
4102 for (unsigned Part = 0; Part < UF; ++Part) {
4103 Value *I = State.get(Def, Part);
4104 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4105 if (Inst && Inst->use_empty()) {
4106 Value *NewI = Inst->getOperand(0);
4107 Inst->eraseFromParent();
4108 State.reset(Def, NewI, Part);
4109 }
4110 }
4111 }
4112}
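A minimal sketch (hypothetical IR) of the hint pattern this routine emits for a value whose minimal bitwidth is 8: operands are shrunk, the operation is redone in the narrow type, and the result is re-extended, leaving InstCombine to fold the ext/trunc pairs later:

  //   %a.tr  = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr  = trunc <4 x i32> %b to <4 x i8>
  //   %sum   = add <4 x i8> %a.tr, %b.tr
  //   %sum.x = zext <4 x i8> %sum to <4 x i32>   ; replaces the original add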
4113
4114void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4115 // Insert truncates and extends for any truncated instructions as hints to
4116 // InstCombine.
4117 if (VF.isVector())
4118 truncateToMinimalBitwidths(State);
4119
4120 // Fix widened non-induction PHIs by setting up the PHI operands.
4121 if (OrigPHIsToFix.size()) {
4122 assert(EnableVPlanNativePath &&
4123 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4124 fixNonInductionPHIs(State);
4125 }
4126
4127 // At this point every instruction in the original loop is widened to a
4128 // vector form. Now we need to fix the recurrences in the loop. These PHI
4129 // nodes are currently empty because we did not want to introduce cycles.
4130 // This is the second stage of vectorizing recurrences.
4131 fixCrossIterationPHIs(State);
4132
4133 // Forget the original basic block.
4134 PSE.getSE()->forgetLoop(OrigLoop);
4135
4136 // If we inserted an edge from the middle block to the unique exit block,
4137 // update uses outside the loop (phis) to account for the newly inserted
4138 // edge.
4139 if (!Cost->requiresScalarEpilogue(VF)) {
4140 // Fix-up external users of the induction variables.
4141 for (auto &Entry : Legal->getInductionVars())
4142 fixupIVUsers(Entry.first, Entry.second,
4143 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4144 IVEndValues[Entry.first], LoopMiddleBlock);
4145
4146 fixLCSSAPHIs(State);
4147 }
4148
4149 for (Instruction *PI : PredicatedInstructions)
4150 sinkScalarOperands(&*PI);
4151
4152 // Remove redundant induction instructions.
4153 cse(LoopVectorBody);
4154
4155 // Set/update profile weights for the vector and remainder loops as original
4156 // loop iterations are now distributed among them. Note that original loop
4157 // represented by LoopScalarBody becomes remainder loop after vectorization.
4158 //
4159 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4160 // end up getting a slightly less accurate result, but that should be OK since
4161 // the profile is not inherently precise anyway. Note also that a possible bypass of
4162 // vector code caused by legality checks is ignored, assigning all the weight
4163 // to the vector loop, optimistically.
4164 //
4165 // For scalable vectorization we can't know at compile time how many iterations
4166 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4167 // vscale of '1'.
4168 setProfileInfoAfterUnrolling(
4169 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4170 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4171}
4172
4173void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4174 // In order to support recurrences we need to be able to vectorize Phi nodes.
4175 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4176 // stage #2: We now need to fix the recurrences by adding incoming edges to
4177 // the currently empty PHI nodes. At this point every instruction in the
4178 // original loop is widened to a vector form so we can use them to construct
4179 // the incoming edges.
4180 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4181 for (VPRecipeBase &R : Header->phis()) {
4182 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4183 fixReduction(ReductionPhi, State);
4184 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4185 fixFirstOrderRecurrence(FOR, State);
4186 }
4187}
4188
4189void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4190 VPTransformState &State) {
4191 // This is the second phase of vectorizing first-order recurrences. An
4192 // overview of the transformation is described below. Suppose we have the
4193 // following loop.
4194 //
4195 // for (int i = 0; i < n; ++i)
4196 // b[i] = a[i] - a[i - 1];
4197 //
4198 // There is a first-order recurrence on "a". For this loop, the shorthand
4199 // scalar IR looks like:
4200 //
4201 // scalar.ph:
4202 // s_init = a[-1]
4203 // br scalar.body
4204 //
4205 // scalar.body:
4206 // i = phi [0, scalar.ph], [i+1, scalar.body]
4207 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4208 // s2 = a[i]
4209 // b[i] = s2 - s1
4210 // br cond, scalar.body, ...
4211 //
4212 // In this example, s1 is a recurrence because its value depends on the
4213 // previous iteration. In the first phase of vectorization, we created a
4214 // vector phi v1 for s1. We now complete the vectorization and produce the
4215 // shorthand vector IR shown below (for VF = 4, UF = 1).
4216 //
4217 // vector.ph:
4218 // v_init = vector(..., ..., ..., a[-1])
4219 // br vector.body
4220 //
4221 // vector.body
4222 // i = phi [0, vector.ph], [i+4, vector.body]
4223 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4224 // v2 = a[i, i+1, i+2, i+3];
4225 // v3 = vector(v1(3), v2(0, 1, 2))
4226 // b[i, i+1, i+2, i+3] = v2 - v3
4227 // br cond, vector.body, middle.block
4228 //
4229 // middle.block:
4230 // x = v2(3)
4231 // br scalar.ph
4232 //
4233 // scalar.ph:
4234 // s_init = phi [x, middle.block], [a[-1], otherwise]
4235 // br scalar.body
4236 //
4237 // After execution completes the vector loop, we extract the next value of
4238 // the recurrence (x) to use as the initial value in the scalar loop.
4239
4240 // Extract the last vector element in the middle block. This will be the
4241 // initial value for the recurrence when jumping to the scalar loop.
4242 VPValue *PreviousDef = PhiR->getBackedgeValue();
4243 Value *Incoming = State.get(PreviousDef, UF - 1);
4244 auto *ExtractForScalar = Incoming;
4245 auto *IdxTy = Builder.getInt32Ty();
4246 if (VF.isVector()) {
4247 auto *One = ConstantInt::get(IdxTy, 1);
4248 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4249 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4250 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4251 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4252 "vector.recur.extract");
4253 }
4254 // Extract the second-to-last element in the middle block if the
4255 // Phi is used outside the loop. We need to extract the phi itself
4256 // and not the last element (the phi update in the current iteration). This
4257 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4258 // when the scalar loop is not run at all.
4259 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4260 if (VF.isVector()) {
4261 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4262 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4263 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4264 Incoming, Idx, "vector.recur.extract.for.phi");
4265 } else if (UF > 1)
4266 // When the loop is unrolled without vectorizing, initialize
4267 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
4268 // of `Incoming`. This is analogous to the vectorized case above: extracting
4269 // the second-to-last element when VF > 1.
4270 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4271
4272 // Fix the initial value of the original recurrence in the scalar loop.
4273 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4274 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4275 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4276 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4277 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4278 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4279 Start->addIncoming(Incoming, BB);
4280 }
4281
4282 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4283 Phi->setName("scalar.recur");
4284
4285 // Finally, fix users of the recurrence outside the loop. The users will need
4286 // either the last value of the scalar recurrence or the last value of the
4287 // vector recurrence we extracted in the middle block. Since the loop is in
4288 // LCSSA form, we just need to find all the phi nodes for the original scalar
4289 // recurrence in the exit block, and then add an edge for the middle block.
4290 // Note that LCSSA does not imply single entry when the original scalar loop
4291 // had multiple exiting edges (as we always run the last iteration in the
4292 // scalar epilogue); in that case, there is no edge from middle to exit,
4293 // and thus no phis need to be updated.
4294 if (!Cost->requiresScalarEpilogue(VF))
4295 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4296 if (any_of(LCSSAPhi.incoming_values(),
4297 [Phi](Value *V) { return V == Phi; }))
4298 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4299}
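For a fixed VF of 4, the two extracts created above in the middle block would look roughly like this (hypothetical IR; with scalable vectors the indices are computed from the runtime VF instead):

  //   %vector.recur.extract         = extractelement <4 x i32> %v2, i32 3
  //   %vector.recur.extract.for.phi = extractelement <4 x i32> %v2, i32 2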
4300
4301void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4302 VPTransformState &State) {
4303 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4304 // Get its reduction variable descriptor.
4305 assert(Legal->isReductionVariable(OrigPhi) &&
4306 "Unable to find the reduction variable");
4307 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4308
4309 RecurKind RK = RdxDesc.getRecurrenceKind();
4310 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4311 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4312 setDebugLocFromInst(ReductionStartValue);
4313
4314 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4315 // This is the vector-clone of the value that leaves the loop.
4316 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4317
4318 // Wrap flags are in general invalid after vectorization, clear them.
4319 clearReductionWrapFlags(RdxDesc, State);
4320
4321 // Before each round, move the insertion point right between
4322 // the PHIs and the values we are going to write.
4323 // This allows us to write both PHINodes and the extractelement
4324 // instructions.
4325 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4326
4327 setDebugLocFromInst(LoopExitInst);
4328
4329 Type *PhiTy = OrigPhi->getType();
4330 // If tail is folded by masking, the vector value to leave the loop should be
4331 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4332 // instead of the former. For an inloop reduction the reduction will already
4333 // be predicated, and does not need to be handled here.
4334 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4335 for (unsigned Part = 0; Part < UF; ++Part) {
4336 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4337 Value *Sel = nullptr;
4338 for (User *U : VecLoopExitInst->users()) {
4339 if (isa<SelectInst>(U)) {
4340 assert(!Sel && "Reduction exit feeding two selects");
4341 Sel = U;
4342 } else
4343 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4344 }
4345 assert(Sel && "Reduction exit feeds no select");
4346 State.reset(LoopExitInstDef, Sel, Part);
4347
4348 // If the target can create a predicated operator for the reduction at no
4349 // extra cost in the loop (for example a predicated vadd), it can be
4350 // cheaper for the select to remain in the loop than be sunk out of it,
4351 // and so use the select value for the phi instead of the old
4352 // LoopExitValue.
4353 if (PreferPredicatedReductionSelect ||
4354 TTI->preferPredicatedReductionSelect(
4355 RdxDesc.getOpcode(), PhiTy,
4356 TargetTransformInfo::ReductionFlags())) {
4357 auto *VecRdxPhi =
4358 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4359 VecRdxPhi->setIncomingValueForBlock(
4360 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4361 }
4362 }
4363 }
4364
4365 // If the vector reduction can be performed in a smaller type, we truncate
4366 // then extend the loop exit value to enable InstCombine to evaluate the
4367 // entire expression in the smaller type.
4368 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4369 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4370 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4371 Builder.SetInsertPoint(
4372 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4373 VectorParts RdxParts(UF);
4374 for (unsigned Part = 0; Part < UF; ++Part) {
4375 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4376 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4377 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4378 : Builder.CreateZExt(Trunc, VecTy);
4379 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4380 UI != RdxParts[Part]->user_end();)
4381 if (*UI != Trunc) {
4382 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4383 RdxParts[Part] = Extnd;
4384 } else {
4385 ++UI;
4386 }
4387 }
4388 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4389 for (unsigned Part = 0; Part < UF; ++Part) {
4390 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4391 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4392 }
4393 }
4394
4395 // Reduce all of the unrolled parts into a single vector.
4396 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4397 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4398
4399 // The middle block terminator has already been assigned a DebugLoc here (the
4400 // OrigLoop's single latch terminator). We want the whole middle block to
4401 // appear to execute on this line because: (a) it is all compiler generated,
4402 // (b) these instructions are always executed after evaluating the latch
4403 // conditional branch, and (c) other passes may add new predecessors which
4404 // terminate on this line. This is the easiest way to ensure we don't
4405 // accidentally cause an extra step back into the loop while debugging.
4406 setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4407 if (PhiR->isOrdered())
4408 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4409 else {
4410 // Floating-point operations should have some FMF to enable the reduction.
4411 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4412 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4413 for (unsigned Part = 1; Part < UF; ++Part) {
4414 Value *RdxPart = State.get(LoopExitInstDef, Part);
4415 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4416 ReducedPartRdx = Builder.CreateBinOp(
4417 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4418 } else {
4419 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4420 }
4421 }
4422 }
4423
4424 // Create the reduction after the loop. Note that inloop reductions create the
4425 // target reduction in the loop using a Reduction recipe.
4426 if (VF.isVector() && !PhiR->isInLoop()) {
4427 ReducedPartRdx =
4428 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4429 // If the reduction can be performed in a smaller type, we need to extend
4430 // the reduction to the wider type before we branch to the original loop.
4431 if (PhiTy != RdxDesc.getRecurrenceType())
4432 ReducedPartRdx = RdxDesc.isSigned()
4433 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4434 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4435 }
4436
4437 // Create a phi node that merges control-flow from the backedge-taken check
4438 // block and the middle block.
4439 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4440 LoopScalarPreHeader->getTerminator());
4441 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4442 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4443 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4444
4445 // Now, we need to fix the users of the reduction variable
4446 // inside and outside of the scalar remainder loop.
4447
4448 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4449 // in the exit blocks. See comment on analogous loop in
4450 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4451 if (!Cost->requiresScalarEpilogue(VF))
4452 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4453 if (any_of(LCSSAPhi.incoming_values(),
4454 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4455 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4456
4457 // Fix the scalar loop reduction variable with the incoming reduction sum
4458 // from the vector body and from the backedge value.
4459 int IncomingEdgeBlockIdx =
4460 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4461 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4462 // Pick the other block.
4463 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4464 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4465 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4466}
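A sketch (hypothetical IR, UF = 2, integer add reduction) of how the unrolled parts are combined above and then reduced to a scalar before feeding bc.merge.rdx:

  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  //   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)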
4467
4468void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4469 VPTransformState &State) {
4470 RecurKind RK = RdxDesc.getRecurrenceKind();
4471 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4472 return;
4473
4474 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4475 assert(LoopExitInstr && "null loop exit instruction");
4476 SmallVector<Instruction *, 8> Worklist;
4477 SmallPtrSet<Instruction *, 8> Visited;
4478 Worklist.push_back(LoopExitInstr);
4479 Visited.insert(LoopExitInstr);
4480
4481 while (!Worklist.empty()) {
4482 Instruction *Cur = Worklist.pop_back_val();
4483 if (isa<OverflowingBinaryOperator>(Cur))
4484 for (unsigned Part = 0; Part < UF; ++Part) {
4485 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4486 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4487 }
4488
4489 for (User *U : Cur->users()) {
4490 Instruction *UI = cast<Instruction>(U);
4491 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4492 Visited.insert(UI).second)
4493 Worklist.push_back(UI);
4494 }
4495 }
4496}
4497
4498void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4499 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4500 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4501 // Some phis were already hand updated by the reduction and recurrence
4502 // code above, leave them alone.
4503 continue;
4504
4505 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4506 // Non-instruction incoming values will have only one value.
4507
4508 VPLane Lane = VPLane::getFirstLane();
4509 if (isa<Instruction>(IncomingValue) &&
4510 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4511 VF))
4512 Lane = VPLane::getLastLaneForVF(VF);
4513
4514 // Can be a loop invariant incoming value or the last scalar value to be
4515 // extracted from the vectorized loop.
4516 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4517 Value *lastIncomingValue =
4518 OrigLoop->isLoopInvariant(IncomingValue)
4519 ? IncomingValue
4520 : State.get(State.Plan->getVPValue(IncomingValue),
4521 VPIteration(UF - 1, Lane));
4522 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4523 }
4524}
4525
4526void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4527 // The basic block and loop containing the predicated instruction.
4528 auto *PredBB = PredInst->getParent();
4529 auto *VectorLoop = LI->getLoopFor(PredBB);
4530
4531 // Initialize a worklist with the operands of the predicated instruction.
4532 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4533
4534 // Holds instructions that we need to analyze again. An instruction may be
4535 // reanalyzed if we don't yet know if we can sink it or not.
4536 SmallVector<Instruction *, 8> InstsToReanalyze;
4537
4538 // Returns true if a given use occurs in the predicated block. Phi nodes use
4539 // their operands in their corresponding predecessor blocks.
4540 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4541 auto *I = cast<Instruction>(U.getUser());
4542 BasicBlock *BB = I->getParent();
4543 if (auto *Phi = dyn_cast<PHINode>(I))
4544 BB = Phi->getIncomingBlock(
4545 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4546 return BB == PredBB;
4547 };
4548
4549 // Iteratively sink the scalarized operands of the predicated instruction
4550 // into the block we created for it. When an instruction is sunk, its
4551 // operands are then added to the worklist. The algorithm ends after one pass
4552 // through the worklist doesn't sink a single instruction.
4553 bool Changed;
4554 do {
4555 // Add the instructions that need to be reanalyzed to the worklist, and
4556 // reset the changed indicator.
4557 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4558 InstsToReanalyze.clear();
4559 Changed = false;
4560
4561 while (!Worklist.empty()) {
4562 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4563
4564 // We can't sink an instruction if it is a phi node, is not in the loop,
4565 // or may have side effects.
4566 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4567 I->mayHaveSideEffects())
4568 continue;
4569
4570 // If the instruction is already in PredBB, check if we can sink its
4571 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4572 // sinking the scalar instruction I, hence it appears in PredBB; but it
4573 // may have failed to sink I's operands (recursively), which we try
4574 // (again) here.
4575 if (I->getParent() == PredBB) {
4576 Worklist.insert(I->op_begin(), I->op_end());
4577 continue;
4578 }
4579
4580 // It's legal to sink the instruction if all its uses occur in the
4581 // predicated block. Otherwise, there's nothing to do yet, and we may
4582 // need to reanalyze the instruction.
4583 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4584 InstsToReanalyze.push_back(I);
4585 continue;
4586 }
4587
4588 // Move the instruction to the beginning of the predicated block, and add
4589 // its operands to the worklist.
4590 I->moveBefore(&*PredBB->getFirstInsertionPt());
4591 Worklist.insert(I->op_begin(), I->op_end());
4592
4593 // The sinking may have enabled other instructions to be sunk, so we will
4594 // need to iterate.
4595 Changed = true;
4596 }
4597 } while (Changed);
4598}
4599
4600void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4601 for (PHINode *OrigPhi : OrigPHIsToFix) {
4602 VPWidenPHIRecipe *VPPhi =
4603 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4604 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4605 // Make sure the builder has a valid insert point.
4606 Builder.SetInsertPoint(NewPhi);
4607 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4608 VPValue *Inc = VPPhi->getIncomingValue(i);
4609 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4610 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4611 }
4612 }
4613}
4614
4615bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4616 return Cost->useOrderedReductions(RdxDesc);
4617}
4618
4619void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4620 VPUser &Operands, unsigned UF,
4621 ElementCount VF, bool IsPtrLoopInvariant,
4622 SmallBitVector &IsIndexLoopInvariant,
4623 VPTransformState &State) {
4624 // Construct a vector GEP by widening the operands of the scalar GEP as
4625 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4626 // results in a vector of pointers when at least one operand of the GEP
4627 // is vector-typed. Thus, to keep the representation compact, we only use
4628 // vector-typed operands for loop-varying values.
4629
4630 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4631 // If we are vectorizing, but the GEP has only loop-invariant operands,
4632 // the GEP we build (by only using vector-typed operands for
4633 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4634 // produce a vector of pointers, we need to either arbitrarily pick an
4635 // operand to broadcast, or broadcast a clone of the original GEP.
4636 // Here, we broadcast a clone of the original.
4637 //
4638 // TODO: If at some point we decide to scalarize instructions having
4639 // loop-invariant operands, this special case will no longer be
4640 // required. We would add the scalarization decision to
4641 // collectLoopScalars() and teach getVectorValue() to broadcast
4642 // the lane-zero scalar value.
4643 auto *Clone = Builder.Insert(GEP->clone());
4644 for (unsigned Part = 0; Part < UF; ++Part) {
4645 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4646 State.set(VPDef, EntryPart, Part);
4647 addMetadata(EntryPart, GEP);
4648 }
4649 } else {
4650 // If the GEP has at least one loop-varying operand, we are sure to
4651 // produce a vector of pointers. But if we are only unrolling, we want
4652 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4653 // produce with the code below will be scalar (if VF == 1) or vector
4654 // (otherwise). Note that for the unroll-only case, we still maintain
4655 // values in the vector mapping with initVector, as we do for other
4656 // instructions.
4657 for (unsigned Part = 0; Part < UF; ++Part) {
4658 // The pointer operand of the new GEP. If it's loop-invariant, we
4659 // won't broadcast it.
4660 auto *Ptr = IsPtrLoopInvariant
4661 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4662 : State.get(Operands.getOperand(0), Part);
4663
4664 // Collect all the indices for the new GEP. If any index is
4665 // loop-invariant, we won't broadcast it.
4666 SmallVector<Value *, 4> Indices;
4667 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4668 VPValue *Operand = Operands.getOperand(I);
4669 if (IsIndexLoopInvariant[I - 1])
4670 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4671 else
4672 Indices.push_back(State.get(Operand, Part));
4673 }
4674
4675 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4676 // but it should be a vector, otherwise.
4677 auto *NewGEP =
4678 GEP->isInBounds()
4679 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4680 Indices)
4681 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4682 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4683 "NewGEP is not a pointer vector");
4684 State.set(VPDef, NewGEP, Part);
4685 addMetadata(NewGEP, GEP);
4686 }
4687 }
4688}
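A sketch (hypothetical IR, VF = 4) of the second case above: a loop-invariant base pointer stays scalar while the loop-varying index is vector-typed, so the GEP itself produces a vector of pointers:

  //   %gep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.ind
  //   ; %gep has type <4 x i32*>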
4689
4690void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4691 VPWidenPHIRecipe *PhiR,
4692 VPTransformState &State) {
4693 PHINode *P = cast<PHINode>(PN);
4694 if (EnableVPlanNativePath) {
4695 // Currently we enter here in the VPlan-native path for non-induction
4696 // PHIs where all control flow is uniform. We simply widen these PHIs.
4697 // Create a vector phi with no operands - the vector phi operands will be
4698 // set at the end of vector code generation.
4699 Type *VecTy = (State.VF.isScalar())
4700 ? PN->getType()
4701 : VectorType::get(PN->getType(), State.VF);
4702 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4703 State.set(PhiR, VecPhi, 0);
4704 OrigPHIsToFix.push_back(P);
4705
4706 return;
4707 }
4708
4709 assert(PN->getParent() == OrigLoop->getHeader() &&
4710 "Non-header phis should have been handled elsewhere");
4711
4712 // In order to support recurrences we need to be able to vectorize Phi nodes.
4713 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4714 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4715 // this value when we vectorize all of the instructions that use the PHI.
4716
4717 assert(!Legal->isReductionVariable(P) &&
4718 "reductions should be handled elsewhere");
4719
4720 setDebugLocFromInst(P);
4721
4722 // This PHINode must be an induction variable.
4723 // Make sure that we know about it.
4724 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4725
4726 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4727 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4728
4729 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4730 // which can be found from the original scalar operations.
4731 switch (II.getKind()) {
4732 case InductionDescriptor::IK_NoInduction:
4733 llvm_unreachable("Unknown induction");
4734 case InductionDescriptor::IK_IntInduction:
4735 case InductionDescriptor::IK_FpInduction:
4736 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4737 case InductionDescriptor::IK_PtrInduction: {
4738 // Handle the pointer induction variable case.
4739 assert(P->getType()->isPointerTy() && "Unexpected type.");
4740
4741 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4742 // This is the normalized GEP that starts counting at zero.
4743 Value *PtrInd =
4744 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4745 // Determine the number of scalars we need to generate for each unroll
4746 // iteration. If the instruction is uniform, we only need to generate the
4747 // first lane. Otherwise, we generate all VF values.
4748 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4749 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4750
4751 bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4752 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4753 if (NeedsVectorIndex) {
4754 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4755 UnitStepVec = Builder.CreateStepVector(VecIVTy);
4756 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4757 }
4758
4759 for (unsigned Part = 0; Part < UF; ++Part) {
4760 Value *PartStart = createStepForVF(
4761 Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4762
4763 if (NeedsVectorIndex) {
4764 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4765 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4766 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4767 Value *SclrGep =
4768 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4769 SclrGep->setName("next.gep");
4770 State.set(PhiR, SclrGep, Part);
4771 // We've cached the whole vector, which means we can support the
4772 // extraction of any lane.
4773 continue;
4774 }
4775
4776 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4777 Value *Idx = Builder.CreateAdd(
4778 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4779 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4780 Value *SclrGep =
4781 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4782 SclrGep->setName("next.gep");
4783 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4784 }
4785 }
4786 return;
4787 }
4788 assert(isa<SCEVConstant>(II.getStep()) &&
4789 "Induction step not a SCEV constant!");
4790 Type *PhiType = II.getStep()->getType();
4791
4792 // Build a pointer phi
4793 Value *ScalarStartValue = II.getStartValue();
4794 Type *ScStValueType = ScalarStartValue->getType();
4795 PHINode *NewPointerPhi =
4796 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4797 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4798
4799 // A pointer induction, performed by using a gep
4800 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4801 Instruction *InductionLoc = LoopLatch->getTerminator();
4802 const SCEV *ScalarStep = II.getStep();
4803 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4804 Value *ScalarStepValue =
4805 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4806 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4807 Value *NumUnrolledElems =
4808 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4809 Value *InductionGEP = GetElementPtrInst::Create(
4810 ScStValueType->getPointerElementType(), NewPointerPhi,
4811 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4812 InductionLoc);
4813 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4814
4815 // Create UF many actual address geps that use the pointer
4816 // phi as base and a vectorized version of the step value
4817 // (<step*0, ..., step*N>) as offset.
4818 for (unsigned Part = 0; Part < State.UF; ++Part) {
4819 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4820 Value *StartOffsetScalar =
4821 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4822 Value *StartOffset =
4823 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4824 // Create a vector of consecutive numbers from zero to VF.
4825 StartOffset =
4826 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4827
4828 Value *GEP = Builder.CreateGEP(
4829 ScStValueType->getPointerElementType(), NewPointerPhi,
4830 Builder.CreateMul(
4831 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4832 "vector.gep"));
4833 State.set(PhiR, GEP, Part);
4834 }
4835 }
4836 }
4837}
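A sketch (hypothetical IR, VF = 4, UF = 1, unit step, i8 elements) of the non-scalarized pointer-induction case built above:

  //   %pointer.phi = phi i8* [ %start, %vector.ph ], [ %ptr.ind, %vector.latch ]
  //   %vector.gep  = getelementptr i8, i8* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
  //   %ptr.ind     = getelementptr i8, i8* %pointer.phi, i64 4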
4838
4839/// A helper function for checking whether an integer division-related
4840/// instruction may divide by zero (in which case it must be predicated if
4841/// executed conditionally in the scalar code).
4842/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4843 /// Non-zero divisors that are not compile-time constants will not be
4844 /// converted into multiplication, so we will still end up scalarizing
4845 /// the division, but can do so without predication.
4846static bool mayDivideByZero(Instruction &I) {
4847 assert((I.getOpcode() == Instruction::UDiv ||
4848 I.getOpcode() == Instruction::SDiv ||
4849 I.getOpcode() == Instruction::URem ||
4850 I.getOpcode() == Instruction::SRem) &&
4851 "Unexpected instruction");
4852 Value *Divisor = I.getOperand(1);
4853 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4854 return !CInt || CInt->isZero();
4855}
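Illustrative calls (hypothetical operands) for the predicate above; only a provably non-zero constant divisor lets the scalarized division run unpredicated:

  //   udiv i32 %x, %y   ; non-constant divisor -> true (may be zero)
  //   sdiv i32 %x, 0    ; constant zero        -> true
  //   urem i32 %x, 7    ; non-zero constant    -> false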
4856
4857void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4858 VPUser &User,
4859 VPTransformState &State) {
4860 switch (I.getOpcode()) {
4861 case Instruction::Call:
4862 case Instruction::Br:
4863 case Instruction::PHI:
4864 case Instruction::GetElementPtr:
4865 case Instruction::Select:
4866 llvm_unreachable("This instruction is handled by a different recipe.");
4867 case Instruction::UDiv:
4868 case Instruction::SDiv:
4869 case Instruction::SRem:
4870 case Instruction::URem:
4871 case Instruction::Add:
4872 case Instruction::FAdd:
4873 case Instruction::Sub:
4874 case Instruction::FSub:
4875 case Instruction::FNeg:
4876 case Instruction::Mul:
4877 case Instruction::FMul:
4878 case Instruction::FDiv:
4879 case Instruction::FRem:
4880 case Instruction::Shl:
4881 case Instruction::LShr:
4882 case Instruction::AShr:
4883 case Instruction::And:
4884 case Instruction::Or:
4885 case Instruction::Xor: {
4886 // Just widen unops and binops.
4887 setDebugLocFromInst(&I);
4888
4889 for (unsigned Part = 0; Part < UF; ++Part) {
4890 SmallVector<Value *, 2> Ops;
4891 for (VPValue *VPOp : User.operands())
4892 Ops.push_back(State.get(VPOp, Part));
4893
4894 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4895
4896 if (auto *VecOp = dyn_cast<Instruction>(V))
4897 VecOp->copyIRFlags(&I);
4898
4899 // Use this vector value for all users of the original instruction.
4900 State.set(Def, V, Part);
4901 addMetadata(V, &I);
4902 }
4903
4904 break;
4905 }
4906 case Instruction::ICmp:
4907 case Instruction::FCmp: {
4908 // Widen compares. Generate vector compares.
4909 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4910 auto *Cmp = cast<CmpInst>(&I);
4911 setDebugLocFromInst(Cmp);
4912 for (unsigned Part = 0; Part < UF; ++Part) {
4913 Value *A = State.get(User.getOperand(0), Part);
4914 Value *B = State.get(User.getOperand(1), Part);
4915 Value *C = nullptr;
4916 if (FCmp) {
4917 // Propagate fast math flags.
4918 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4919 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4920 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4921 } else {
4922 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4923 }
4924 State.set(Def, C, Part);
4925 addMetadata(C, &I);
4926 }
4927
4928 break;
4929 }
4930
4931 case Instruction::ZExt:
4932 case Instruction::SExt:
4933 case Instruction::FPToUI:
4934 case Instruction::FPToSI:
4935 case Instruction::FPExt:
4936 case Instruction::PtrToInt:
4937 case Instruction::IntToPtr:
4938 case Instruction::SIToFP:
4939 case Instruction::UIToFP:
4940 case Instruction::Trunc:
4941 case Instruction::FPTrunc:
4942 case Instruction::BitCast: {
4943 auto *CI = cast<CastInst>(&I);
4944 setDebugLocFromInst(CI);
4945
4946 /// Vectorize casts.
4947 Type *DestTy =
4948 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4949
4950 for (unsigned Part = 0; Part < UF; ++Part) {
4951 Value *A = State.get(User.getOperand(0), Part);
4952 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4953 State.set(Def, Cast, Part);
4954 addMetadata(Cast, &I);
4955 }
4956 break;
4957 }
4958 default:
4959 // This instruction is not vectorized by simple widening.
4960     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4961     llvm_unreachable("Unhandled instruction!");
4962 } // end of switch.
4963}
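The unop/binop case above follows a simple per-part pattern: look up the already-widened operands for each unroll part, emit one vector instruction, copy the scalar instruction's IR flags onto it, and record the result for that part. Below is a minimal editorial sketch of that pattern; the helper name and its parameters are illustrative assumptions, not part of LoopVectorize.cpp.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

// Sketch: widen a scalar 'add' for one unroll part. 'WideLHS'/'WideRHS' stand
// in for the values that State.get(VPOp, Part) returns in the code above.
static llvm::Value *widenAddForPart(llvm::IRBuilder<> &B,
                                    llvm::Instruction &ScalarI,
                                    llvm::Value *WideLHS, llvm::Value *WideRHS) {
  llvm::Value *V = B.CreateBinOp(llvm::Instruction::Add, WideLHS, WideRHS);
  if (auto *VecOp = llvm::dyn_cast<llvm::Instruction>(V))
    VecOp->copyIRFlags(&ScalarI); // carry nsw/nuw/fast-math over from the scalar op
  return V;                       // stored back via State.set(Def, V, Part)
}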
4964
4965void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4966 VPUser &ArgOperands,
4967 VPTransformState &State) {
4968   assert(!isa<DbgInfoIntrinsic>(I) &&
4969          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4970 setDebugLocFromInst(&I);
4971
4972 Module *M = I.getParent()->getParent()->getParent();
4973 auto *CI = cast<CallInst>(&I);
4974
4975 SmallVector<Type *, 4> Tys;
4976 for (Value *ArgOperand : CI->arg_operands())
4977 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4978
4979 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4980
4981   // The flag indicates whether we use an intrinsic or an ordinary call for the
4982   // vectorized version of the instruction:
4983   // is it beneficial to use the intrinsic call rather than the library call?
4984 bool NeedToScalarize = false;
4985 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4986 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4987 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4988   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4989          "Instruction should be scalarized elsewhere.");
4990   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4991          "Either the intrinsic cost or vector call cost must be valid");
4992
4993 for (unsigned Part = 0; Part < UF; ++Part) {
4994 SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4995 SmallVector<Value *, 4> Args;
4996 for (auto &I : enumerate(ArgOperands.operands())) {
4997 // Some intrinsics have a scalar argument - don't replace it with a
4998 // vector.
4999 Value *Arg;
5000 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5001 Arg = State.get(I.value(), Part);
5002 else {
5003 Arg = State.get(I.value(), VPIteration(0, 0));
5004 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5005 TysForDecl.push_back(Arg->getType());
5006 }
5007 Args.push_back(Arg);
5008 }
5009
5010 Function *VectorF;
5011 if (UseVectorIntrinsic) {
5012 // Use vector version of the intrinsic.
5013 if (VF.isVector())
5014 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5015 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5016       assert(VectorF && "Can't retrieve vector intrinsic.");
5017 } else {
5018 // Use vector version of the function call.
5019 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5020#ifndef NDEBUG
5021       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5022              "Can't create vector function.");
5023#endif
5024 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5025 }
5026 SmallVector<OperandBundleDef, 1> OpBundles;
5027 CI->getOperandBundlesAsDefs(OpBundles);
5028 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5029
5030 if (isa<FPMathOperator>(V))
5031 V->copyFastMathFlags(CI);
5032
5033 State.set(Def, V, Part);
5034 addMetadata(V, &I);
5035 }
5036}
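When UseVectorIntrinsic is chosen above, the declaration is materialized by Intrinsic::getDeclaration with the vector overload types collected in TysForDecl. A minimal editorial sketch, assuming a fixed VF of 4 and an fabs call; the helper name and parameters are illustrative:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Sketch: request <4 x float> @llvm.fabs.v4f32 from the module.
static llvm::Function *getVectorFabsDecl(llvm::Module *M, llvm::Type *F32Ty) {
  auto *VecTy = llvm::VectorType::get(F32Ty, llvm::ElementCount::getFixed(4));
  return llvm::Intrinsic::getDeclaration(M, llvm::Intrinsic::fabs, {VecTy});
}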
5037
5038void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5039 VPUser &Operands,
5040 bool InvariantCond,
5041 VPTransformState &State) {
5042 setDebugLocFromInst(&I);
5043
5044 // The condition can be loop invariant but still defined inside the
5045 // loop. This means that we can't just use the original 'cond' value.
5046 // We have to take the 'vectorized' value and pick the first lane.
5047 // Instcombine will make this a no-op.
5048 auto *InvarCond = InvariantCond
5049 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5050 : nullptr;
5051
5052 for (unsigned Part = 0; Part < UF; ++Part) {
5053 Value *Cond =
5054 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5055 Value *Op0 = State.get(Operands.getOperand(1), Part);
5056 Value *Op1 = State.get(Operands.getOperand(2), Part);
5057 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5058 State.set(VPDef, Sel, Part);
5059 addMetadata(Sel, &I);
5060 }
5061}
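The invariant-condition path above only needs lane 0 of the widened condition because a single scalar i1 can select between whole vectors. A minimal editorial sketch (names are illustrative, not LoopVectorize code):

#include "llvm/IR/IRBuilder.h"

// Sketch: emits 'select i1 %c, <N x T> %a, <N x T> %b' for one unroll part.
static llvm::Value *widenInvariantSelect(llvm::IRBuilder<> &B,
                                         llvm::Value *ScalarCond,
                                         llvm::Value *WideTrue,
                                         llvm::Value *WideFalse) {
  return B.CreateSelect(ScalarCond, WideTrue, WideFalse);
}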
5062
5063void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5064 // We should not collect Scalars more than once per VF. Right now, this
5065 // function is called from collectUniformsAndScalars(), which already does
5066 // this check. Collecting Scalars for VF=1 does not make any sense.
5067   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5068          "This function should not be visited twice for the same VF");
5069
5070 SmallSetVector<Instruction *, 8> Worklist;
5071
5072 // These sets are used to seed the analysis with pointers used by memory
5073 // accesses that will remain scalar.
5074 SmallSetVector<Instruction *, 8> ScalarPtrs;
5075 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5076 auto *Latch = TheLoop->getLoopLatch();
5077
5078 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5079 // The pointer operands of loads and stores will be scalar as long as the
5080 // memory access is not a gather or scatter operation. The value operand of a
5081 // store will remain scalar if the store is scalarized.
5082 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5083 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5084     assert(WideningDecision != CM_Unknown &&
5085            "Widening decision should be ready at this moment");
5086 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5087 if (Ptr == Store->getValueOperand())
5088 return WideningDecision == CM_Scalarize;
5089     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5090            "Ptr is neither a value or pointer operand");
5091 return WideningDecision != CM_GatherScatter;
5092 };
5093
5094 // A helper that returns true if the given value is a bitcast or
5095 // getelementptr instruction contained in the loop.
5096 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5097 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5098 isa<GetElementPtrInst>(V)) &&
5099 !TheLoop->isLoopInvariant(V);
5100 };
5101
5102 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5103 if (!isa<PHINode>(Ptr) ||
5104 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5105 return false;
5106 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5107 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5108 return false;
5109 return isScalarUse(MemAccess, Ptr);
5110 };
5111
5112   // A helper that evaluates a memory access's use of a pointer. If the
5113   // pointer is the pointer induction of a loop, it is inserted into Worklist.
5114   // If the use will be a scalar use, and the pointer is only used by memory
5115   // accesses, we place the pointer in ScalarPtrs. Otherwise, the pointer is
5116   // placed in PossibleNonScalarPtrs.
5117 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5118 if (isScalarPtrInduction(MemAccess, Ptr)) {
5119 Worklist.insert(cast<Instruction>(Ptr));
5120       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5121                         << "\n");
5122
5123 Instruction *Update = cast<Instruction>(
5124 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5125 ScalarPtrs.insert(Update);
5126 return;
5127 }
5128 // We only care about bitcast and getelementptr instructions contained in
5129 // the loop.
5130 if (!isLoopVaryingBitCastOrGEP(Ptr))
5131 return;
5132
5133 // If the pointer has already been identified as scalar (e.g., if it was
5134 // also identified as uniform), there's nothing to do.
5135 auto *I = cast<Instruction>(Ptr);
5136 if (Worklist.count(I))
5137 return;
5138
5139 // If the use of the pointer will be a scalar use, and all users of the
5140 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5141 // place the pointer in PossibleNonScalarPtrs.
5142 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5143 return isa<LoadInst>(U) || isa<StoreInst>(U);
5144 }))
5145 ScalarPtrs.insert(I);
5146 else
5147 PossibleNonScalarPtrs.insert(I);
5148 };
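As an editorial illustration of how evaluatePtrUse classifies pointers (hypothetical input, not from the report): an address computation whose only users are memory accesses would end up in ScalarPtrs, while one that also escapes into non-memory arithmetic would go to PossibleNonScalarPtrs.

// Hypothetical C++ input:
void classifyPtrsExample(int *a, int *b, long *out, int n) {
  for (int i = 0; i < n; ++i) {
    int v = a[i];                    // &a[i] is used only by this load -> ScalarPtrs
    int w = b[i];                    // &b[i] is used by this load ...
    out[i] = (long)(b + i) + v + w;  // ... and also as an integer value
                                     //     -> PossibleNonScalarPtrs
  }
}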
5149
5150   // We seed the scalars analysis with two classes of instructions: (1)
5151   // instructions marked uniform-after-vectorization and (2) bitcast,
5152   // getelementptr and (pointer) phi instructions used by memory accesses
5153   // requiring a scalar use.
5154 //
5155 // (1) Add to the worklist all instructions that have been identified as
5156 // uniform-after-vectorization.
5157 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5158
5159 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5160 // memory accesses requiring a scalar use. The pointer operands of loads and
5161   // stores will be scalar as long as the memory access is not a gather or
5162 // scatter operation. The value operand of a store will remain scalar if the
5163 // store is scalarized.
5164 for (auto *BB : TheLoop->blocks())
5165 for (auto &I : *BB) {
5166 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5167 evaluatePtrUse(Load, Load->getPointerOperand());
5168 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5169 evaluatePtrUse(Store, Store->getPointerOperand());
5170 evaluatePtrUse(Store, Store->getValueOperand());
5171 }
5172 }
5173 for (auto *I : ScalarPtrs)
5174 if (!PossibleNonScalarPtrs.count(I)) {
5175       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5176 Worklist.insert(I);
5177 }
5178
5179 // Insert the forced scalars.
5180 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5181 // induction variable when the PHI user is scalarized.
5182 auto ForcedScalar = ForcedScalars.find(VF);
5183 if (ForcedScalar != ForcedScalars.end())
5184 for (auto *I : ForcedScalar->second)
5185 Worklist.insert(I);
5186
5187 // Expand the worklist by looking through any bitcasts and getelementptr
5188 // instructions we've already identified as scalar. This is similar to the
5189 // expansion step in collectLoopUniforms(); however, here we're only
5190 // expanding to include additional bitcasts and getelementptr instructions.
5191 unsigned Idx = 0;
5192 while (Idx != Worklist.size()) {
5193 Instruction *Dst = Worklist[Idx++];
5194 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5195 continue;
5196 auto *Src = cast<Instruction>(Dst->getOperand(0));
5197 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5198 auto *J = cast<Instruction>(U);
5199 return !TheLoop->contains(J) || Worklist.count(J) ||
5200 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5201 isScalarUse(J, Src));
5202 })) {
5203 Worklist.insert(Src);
5204       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5205 }
5206 }
5207
5208 // An induction variable will remain scalar if all users of the induction
5209 // variable and induction variable update remain scalar.
5210 for (auto &Induction : Legal->getInductionVars()) {
5211 auto *Ind = Induction.first;
5212 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5213
5214 // If tail-folding is applied, the primary induction variable will be used
5215 // to feed a vector compare.
5216 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5217 continue;
5218
5219 // Determine if all users of the induction variable are scalar after
5220 // vectorization.
5221 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5222 auto *I = cast<Instruction>(U);
5223 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5224 });
5225 if (!ScalarInd)
5226 continue;
5227
5228 // Determine if all users of the induction variable update instruction are
5229 // scalar after vectorization.
5230 auto ScalarIndUpdate =
5231 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5232 auto *I = cast<Instruction>(U);
5233 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5234 });
5235 if (!ScalarIndUpdate)
5236 continue;
5237
5238 // The induction variable and its update instruction will remain scalar.
5239 Worklist.insert(Ind);
5240 Worklist.insert(IndUpdate);
5241     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5242     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5243                       << "\n");
5244 }
5245
5246 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5247}
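To make the induction handling above concrete (editorial, hypothetical inputs): an induction used only for addressing and by its own update would typically remain scalar, whereas one that also feeds a stored value must be widened.

// Hypothetical C++ inputs:
void indStaysScalar(int *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = 42;   // 'i' only feeds the address computation and its own update
}

void indMustBeWidened(int *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = i;    // 'i' is also the stored value, so a vector IV is needed
}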
5248
5249bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5250 if (!blockNeedsPredication(I->getParent()))
5251 return false;
5252 switch(I->getOpcode()) {
5253 default:
5254 break;
5255 case Instruction::Load:
5256 case Instruction::Store: {
5257 if (!Legal->isMaskRequired(I))
5258 return false;
5259 auto *Ptr = getLoadStorePointerOperand(I);
5260 auto *Ty = getLoadStoreType(I);
5261 const Align Alignment = getLoadStoreAlignment(I);
5262 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5263 TTI.isLegalMaskedGather(Ty, Alignment))
5264 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5265 TTI.isLegalMaskedScatter(Ty, Alignment));
5266 }
5267 case Instruction::UDiv:
5268 case Instruction::SDiv:
5269 case Instruction::SRem:
5270 case Instruction::URem:
5271 return mayDivideByZero(*I);
5272 }
5273 return false;
5274}
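An editorial example of the div/rem case above (hypothetical input): the division executes under a condition and its divisor is not a known non-zero constant, so isScalarWithPredication would report true and the udiv is scalarized behind a per-lane branch instead of being widened.

// Hypothetical C++ input:
void predicatedDivExample(unsigned *a, const unsigned *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = a[i] / b[i];   // would trap if executed unconditionally with b[i] == 0
}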
5275
5276bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5277 Instruction *I, ElementCount VF) {
5278   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5279   assert(getWideningDecision(I, VF) == CM_Unknown &&
5280          "Decision should not be set yet.");
5281 auto *Group = getInterleavedAccessGroup(I);
5282   assert(Group && "Must have a group.");
5283
5284   // If the instruction's allocated size doesn't equal its type size, it
5285 // requires padding and will be scalarized.
5286 auto &DL = I->getModule()->getDataLayout();
5287 auto *ScalarTy = getLoadStoreType(I);
5288 if (hasIrregularType(ScalarTy, DL))
5289 return false;
5290
5291 // Check if masking is required.
5292 // A Group may need masking for one of two reasons: it resides in a block that
5293 // needs predication, or it was decided to use masking to deal with gaps
5294 // (either a gap at the end of a load-access that may result in a speculative
5295 // load, or any gaps in a store-access).
5296 bool PredicatedAccessRequiresMasking =
5297 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5298 bool LoadAccessWithGapsRequiresEpilogMasking =
5299 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
5300 !isScalarEpilogueAllowed();
5301 bool StoreAccessWithGapsRequiresMasking =
5302 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
5303 if (!PredicatedAccessRequiresMasking &&
5304 !LoadAccessWithGapsRequiresEpilogMasking &&
5305 !StoreAccessWithGapsRequiresMasking)
5306 return true;
5307
5308   // If masked interleaving is required, we expect that the user/target had
5309   // enabled it, because otherwise the group either would not have been created
5310   // or it would have been invalidated by the CostModel.
5311   assert(useMaskedInterleavedAccesses(TTI) &&
5312          "Masked interleave-groups for predicated accesses are not enabled.");
5313
5314 auto *Ty = getLoadStoreType(I);
5315 const Align Alignment = getLoadStoreAlignment(I);
5316 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5317 : TTI.isLegalMaskedStore(Ty, Alignment);
5318}
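For the gap case checked above, an editorial example (hypothetical input): the stores to 'x' form an interleave group of factor 2 with a single member, so StoreAccessWithGapsRequiresMasking holds and the group is widened only if the target supports masked stores of this type.

// Hypothetical C++ input:
struct Pair { int x, y; };
void storeWithGapExample(Pair *p, int n) {
  for (int i = 0; i < n; ++i)
    p[i].x = i;   // p[i].y is never written: a gap in the store group
}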
5319
5320bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5321 Instruction *I, ElementCount VF) {
5322 // Get and ensure we have a valid memory instruction.
5323 LoadInst *LI = dyn_cast<LoadInst>(I);
5324 StoreInst *SI = dyn_cast<StoreInst>(I);
5325   assert((LI || SI) && "Invalid memory instruction");
5326
5327 auto *Ptr = getLoadStorePointerOperand(I);
5328
5329   // First of all, to be widened the pointer must be consecutive.
5330 if (!Legal->isConsecutivePtr(Ptr))
5331 return false;
5332
5333   // If the instruction is a memory access in a predicated block and has to be
5334   // scalarized with predication, it will not be widened.
5335 if (isScalarWithPredication(I))
5336 return false;
5337
5338   // If the instruction's allocated size doesn't equal its type size, it
5339 // requires padding and will be scalarized.
5340 auto &DL = I->getModule()->getDataLayout();
5341 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5342 if (hasIrregularType(ScalarTy, DL))
5343 return false;
5344
5345 return true;
5346}
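An editorial example of the consecutive-pointer requirement above (hypothetical input): the first load is stride-1 and can be widened into a single vector load, while the second is an indexed gather, so isConsecutivePtr fails and it is not widened on this path (it may still be handled as a gather or scalarized).

// Hypothetical C++ input:
int consecutiveVsGatherExample(const int *a, const int *idx, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    s += a[i];        // consecutive access
    s += a[idx[i]];   // non-consecutive: the index comes from loaded data
  }
  return s;
}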
5347
5348void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5349 // We should not collect Uniforms more than once per VF. Right now,
5350 // this function is called from collectUniformsAndScalars(), which
5351 // already does this check. Collecting Uniforms for VF=1 does not make any
5352 // sense.
5353
5354   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5355          "This function should not be visited twice for the same VF");
5356
5357   // Visit the list of Uniforms. Even if we do not find any uniform value, we
5358   // will not analyze it again; Uniforms.count(VF) will return 1.
5359 Uniforms[VF].clear();
5360
5361 // We now know that the loop is vectorizable!
5362 // Collect instructions inside the loop that will remain uniform after
5363 // vectorization.
5364
5365 // Global values, params and instructions outside of current loop are out of
5366 // scope.
5367 auto isOutOfScope = [&](Value *V) -> bool {
5368 Instruction *I = dyn_cast<Instruction>(V);
5369 return (!I || !TheLoop->contains(I));
5370 };
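As an editorial illustration of isOutOfScope (hypothetical input): values with no defining instruction inside the loop, such as function arguments, globals, and computations hoisted before the loop, are out of scope for the uniforms analysis that follows.

// Hypothetical C++ input:
int GlobalBias;                          // global value: out of scope
void outOfScopeExample(int *a, int n) {  // 'a' and 'n': arguments, out of scope
  int pre = GlobalBias * 2;              // defined before the loop: out of scope
  for (int i = 0; i < n; ++i)
    a[i] = pre + i;                      // 'pre + i' is defined inside the loop
}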
5371
5372 SetVector<Instruction *> Worklist;
5373 BasicBlock *Latch = TheLoop->getLoopLatch();
5374
5375 // Instructions that are scalar with predication must not be considered
5376 // uniform after vectorization, because that would create an erroneous
5377 // replicating region where only a single instance out of VF should be formed.
5378 // TODO: optimize such seldom cases if found important, see PR40816.
5379 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5380 if (isOutOfScope(I)) {
5381       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5382                         << *I << "\n");
5383 return;
5384 }
5385 if (isScalarWithPredication(I)) {
5386       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "