Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8915, column 5
Called C++ object pointer is null

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-07-26-235520-9401-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
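// (Illustrative sketch, not part of this file: a hypothetical scalar loop
//  such as
//      for (int i = 0; i < n; ++i)
//        a[i] = b[i] + c[i];
//  is conceptually rewritten, for a vector width of 4, into
//      for (int i = 0; i + 3 < n; i += 4)
//        a[i..i+3] = b[i..i+3] + c[i..i+3];   // one 'wide' iteration
//  with the remaining iterations handled by a scalar epilogue loop.)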
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpressions.h"
95#include "llvm/Analysis/TargetLibraryInfo.h"
96#include "llvm/Analysis/TargetTransformInfo.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/LLVMContext.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/InitializePasses.h"
128#include "llvm/Pass.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/LoopSimplify.h"
140#include "llvm/Transforms/Utils/LoopUtils.h"
141#include "llvm/Transforms/Utils/LoopVersioning.h"
142#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143#include "llvm/Transforms/Utils/SizeOpts.h"
144#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145#include <algorithm>
146#include <cassert>
147#include <cstdint>
148#include <cstdlib>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169const char LLVMLoopVectorizeFollowupVectorized[] =
170 "llvm.loop.vectorize.followup_vectorized";
171const char LLVMLoopVectorizeFollowupEpilogue[] =
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
179static cl::opt<bool> EnableEpilogueVectorization(
180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
183static cl::opt<unsigned> EpilogueVectorizationForceVF(
184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
189static cl::opt<unsigned> EpilogueVectorizationMinVF(
190 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
196static cl::opt<unsigned> TinyTripCountVectorThreshold(
197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
202static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
203 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks with a "
205 "vectorize(enable) pragma."));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
220static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221 "prefer-predicate-over-epilogue",
222 cl::init(PreferPredicateTy::ScalarEpilogue),
223 cl::Hidden,
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
226 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
229 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
233 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
238static cl::opt<bool> MaximizeBandwidth(
239 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
240 cl::desc("Maximize bandwidth when selecting vectorization factor which "
241 "will be determined by the smallest type in loop."));
242
243static cl::opt<bool> EnableInterleavedMemAccesses(
244 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
246
247/// An interleave-group may need masking if it resides in a block that needs
248/// predication, or in order to mask away gaps.
249static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
250 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
251 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
252
253static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
254 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
255 cl::desc("We don't interleave loops with an estimated constant trip count "
256 "below this number"));
257
258static cl::opt<unsigned> ForceTargetNumScalarRegs(
259 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of scalar registers."));
261
262static cl::opt<unsigned> ForceTargetNumVectorRegs(
263 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's number of vector registers."));
265
266static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
267 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
268 cl::desc("A flag that overrides the target's max interleave factor for "
269 "scalar loops."));
270
271static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
272 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
273 cl::desc("A flag that overrides the target's max interleave factor for "
274 "vectorized loops."));
275
276static cl::opt<unsigned> ForceTargetInstructionCost(
277 "force-target-instruction-cost", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's expected cost for "
279 "an instruction to a single constant value. Mostly "
280 "useful for getting consistent testing."));
281
282static cl::opt<bool> ForceTargetSupportsScalableVectors(
283 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
284 cl::desc(
285 "Pretend that scalable vectors are supported, even if the target does "
286 "not support them. This flag should only be used for testing."));
287
288static cl::opt<unsigned> SmallLoopCost(
289 "small-loop-cost", cl::init(20), cl::Hidden,
290 cl::desc(
291 "The cost of a loop that is considered 'small' by the interleaver."));
292
293static cl::opt<bool> LoopVectorizeWithBlockFrequency(
294 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
295 cl::desc("Enable the use of the block frequency analysis to access PGO "
296 "heuristics minimizing code growth in cold regions and being more "
297 "aggressive in hot regions."));
298
299// Runtime interleave loops for load/store throughput.
300static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
301 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
302 cl::desc(
303 "Enable runtime interleaving until load/store ports are saturated"));
304
305/// Interleave small loops with scalar reductions.
306static cl::opt<bool> InterleaveSmallLoopScalarReduction(
307 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
308 cl::desc("Enable interleaving for loops with small iteration counts that "
309 "contain scalar reductions to expose ILP."));
310
311/// The number of stores in a loop that are allowed to need predication.
312static cl::opt<unsigned> NumberOfStoresToPredicate(
313 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
314 cl::desc("Max number of stores to be predicated behind an if."));
315
316static cl::opt<bool> EnableIndVarRegisterHeur(
317 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
318 cl::desc("Count the induction variable only once when interleaving"));
319
320static cl::opt<bool> EnableCondStoresVectorization(
321 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
322 cl::desc("Enable if predication of stores during vectorization."));
323
324static cl::opt<unsigned> MaxNestedScalarReductionIC(
325 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
326 cl::desc("The maximum interleave count to use when interleaving a scalar "
327 "reduction in a nested loop."));
328
329static cl::opt<bool>
330 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
331 cl::Hidden,
332 cl::desc("Prefer in-loop vector reductions, "
333 "overriding the target's preference."));
334
335cl::opt<bool> EnableStrictReductions(
336 "enable-strict-reductions", cl::init(false), cl::Hidden,
337 cl::desc("Enable the vectorisation of loops with in-order (strict) "
338 "FP reductions"));
339
340static cl::opt<bool> PreferPredicatedReductionSelect(
341 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
342 cl::desc(
343 "Prefer predicating a reduction operation over an after loop select."));
344
345cl::opt<bool> EnableVPlanNativePath(
346 "enable-vplan-native-path", cl::init(false), cl::Hidden,
347 cl::desc("Enable VPlan-native vectorization path with "
348 "support for outer loop vectorization."));
349
350// FIXME: Remove this switch once we have divergence analysis. Currently we
351// assume divergent non-backedge branches when this switch is true.
352cl::opt<bool> EnableVPlanPredication(
353 "enable-vplan-predication", cl::init(false), cl::Hidden,
354 cl::desc("Enable VPlan-native vectorization path predicator with "
355 "support for outer loop vectorization."));
356
357// This flag enables the stress testing of the VPlan H-CFG construction in the
358 // VPlan-native vectorization path. It must be used in conjunction with
359// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
360// verification of the H-CFGs built.
361static cl::opt<bool> VPlanBuildStressTest(
362 "vplan-build-stress-test", cl::init(false), cl::Hidden,
363 cl::desc(
364 "Build VPlan for every supported loop nest in the function and bail "
365 "out right after the build (stress test the VPlan H-CFG construction "
366 "in the VPlan-native vectorization path)."));
367
368cl::opt<bool> llvm::EnableLoopInterleaving(
369 "interleave-loops", cl::init(true), cl::Hidden,
370 cl::desc("Enable loop interleaving in Loop vectorization passes"));
371cl::opt<bool> llvm::EnableLoopVectorization(
372 "vectorize-loops", cl::init(true), cl::Hidden,
373 cl::desc("Run the Loop vectorization passes"));
374
375cl::opt<bool> PrintVPlansInDotFormat(
376 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
377 cl::desc("Use dot format instead of plain text when dumping VPlans"));
378
379/// A helper function that returns true if the given type is irregular. The
380/// type is irregular if its allocated size doesn't equal the store size of an
381/// element of the corresponding vector type.
382static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
383 // Determine if an array of N elements of type Ty is "bitcast compatible"
384 // with a <N x Ty> vector.
385 // This is only true if there is no padding between the array elements.
386 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
387}
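// (Illustrative example, assuming the usual x86-64 data layout: x86_fp80 has
//  a type size of 80 bits but an alloc size of 128 bits, so arrays of
//  x86_fp80 contain padding and hasIrregularType returns true for it; for i32
//  both sizes are 32 bits and it returns false.)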
388
389/// A helper function that returns the reciprocal of the block probability of
390/// predicated blocks. If we return X, we are assuming the predicated block
391/// will execute once for every X iterations of the loop header.
392///
393/// TODO: We should use actual block probability here, if available. Currently,
394/// we always assume predicated blocks have a 50% chance of executing.
395static unsigned getReciprocalPredBlockProb() { return 2; }
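// (Illustrative reading: with the current return value of 2, a predicated
//  block is assumed to execute once for every 2 iterations of the loop
//  header, i.e. on roughly half of the iterations.)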
396
397/// A helper function that returns an integer or floating-point constant with
398/// value C.
399static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
400 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
401 : ConstantFP::get(Ty, C);
402}
403
404/// Returns "best known" trip count for the specified loop \p L as defined by
405/// the following procedure:
406/// 1) Returns exact trip count if it is known.
407/// 2) Returns expected trip count according to profile data if any.
408/// 3) Returns upper bound estimate if it is known.
409/// 4) Returns None if all of the above failed.
410static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
411 // Check if exact trip count is known.
412 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
413 return ExpectedTC;
414
415 // Check if there is an expected trip count available from profile data.
416 if (LoopVectorizeWithBlockFrequency)
417 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
418 return EstimatedTC;
419
420 // Check if upper bound estimate is known.
421 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
422 return ExpectedTC;
423
424 return None;
425}
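// (Illustrative examples with hypothetical loops: for a loop with the exact
//  bound "i < 100", step 1 returns 100; for a loop with an unknown bound but
//  branch-weight profile data, step 2 returns the profile-based estimate; if
//  SCEV only knows an upper bound such as 64 iterations, step 3 returns 64;
//  otherwise the result is None.)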
426
427// Forward declare GeneratedRTChecks.
428class GeneratedRTChecks;
429
430namespace llvm {
431
432/// InnerLoopVectorizer vectorizes loops which contain only one basic
433/// block to a specified vectorization factor (VF).
434/// This class performs the widening of scalars into vectors, or multiple
435/// scalars. This class also implements the following features:
436/// * It inserts an epilogue loop for handling loops that don't have iteration
437/// counts that are known to be a multiple of the vectorization factor.
438/// * It handles the code generation for reduction variables.
439/// * Scalarization (implementation using scalars) of un-vectorizable
440/// instructions.
441/// InnerLoopVectorizer does not perform any vectorization-legality
442/// checks, and relies on the caller to check for the different legality
443/// aspects. The InnerLoopVectorizer relies on the
444/// LoopVectorizationLegality class to provide information about the induction
445 /// and reduction variables that were found for a given vectorization factor.
446class InnerLoopVectorizer {
447public:
448 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
449 LoopInfo *LI, DominatorTree *DT,
450 const TargetLibraryInfo *TLI,
451 const TargetTransformInfo *TTI, AssumptionCache *AC,
452 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
453 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459 PSI(PSI), RTChecks(RTChecks) {
460 // Query this against the original loop and save it here because the profile
461 // of the original loop header may change as the transformation happens.
462 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464 }
465
466 virtual ~InnerLoopVectorizer() = default;
467
468 /// Create a new empty loop that will contain vectorized instructions later
469 /// on, while the old loop will be used as the scalar remainder. Control flow
470 /// is generated around the vectorized (and scalar epilogue) loops consisting
471 /// of various checks and bypasses. Return the pre-header block of the new
472 /// loop.
474 /// In the case of epilogue vectorization, this function is overridden to
474 /// handle the more complex control flow around the loops.
475 virtual BasicBlock *createVectorizedLoopSkeleton();
476
477 /// Widen a single instruction within the innermost loop.
478 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
479 VPTransformState &State);
480
481 /// Widen a single call instruction within the innermost loop.
482 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
483 VPTransformState &State);
484
485 /// Widen a single select instruction within the innermost loop.
486 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
487 bool InvariantCond, VPTransformState &State);
488
489 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
490 void fixVectorizedLoop(VPTransformState &State);
491
492 // Return true if any runtime check is added.
493 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
494
495 /// A type for vectorized values in the new loop. Each value from the
496 /// original loop, when vectorized, is represented by UF vector values in the
497 /// new unrolled loop, where UF is the unroll factor.
498 using VectorParts = SmallVector<Value *, 2>;
499
500 /// Vectorize a single GetElementPtrInst based on information gathered and
501 /// decisions taken during planning.
502 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
503 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
504 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
505
506 /// Vectorize a single first-order recurrence or pointer induction PHINode in
507 /// a block. This method handles the induction variable canonicalization. It
508 /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
509 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
510 VPTransformState &State);
511
512 /// A helper function to scalarize a single Instruction in the innermost loop.
513 /// Generates a sequence of scalar instances for each lane between \p MinLane
514 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
515 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
516 /// Instr's operands.
517 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
518 const VPIteration &Instance, bool IfPredicateInstr,
519 VPTransformState &State);
520
521 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
522 /// is provided, the integer induction variable will first be truncated to
523 /// the corresponding type.
524 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
525 VPValue *Def, VPValue *CastDef,
526 VPTransformState &State);
527
528 /// Construct the vector value of a scalarized value \p V one lane at a time.
529 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
530 VPTransformState &State);
531
532 /// Try to vectorize interleaved access group \p Group with the base address
533 /// given in \p Addr, optionally masking the vector operations if \p
534 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
535 /// values in the vectorized loop.
536 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
537 ArrayRef<VPValue *> VPDefs,
538 VPTransformState &State, VPValue *Addr,
539 ArrayRef<VPValue *> StoredValues,
540 VPValue *BlockInMask = nullptr);
541
542 /// Vectorize Load and Store instructions with the base address given in \p
543 /// Addr, optionally masking the vector operations if \p BlockInMask is
544 /// non-null. Use \p State to translate given VPValues to IR values in the
545 /// vectorized loop.
546 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
547 VPValue *Def, VPValue *Addr,
548 VPValue *StoredValue, VPValue *BlockInMask);
549
550 /// Set the debug location in the builder \p CustomBuilder using the debug
551 /// location in \p V. If \p CustomBuilder is None, the class member's Builder is used.
552 void setDebugLocFromInst(const Value *V,
553 Optional<IRBuilder<> *> CustomBuilder = None);
554
555 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
556 void fixNonInductionPHIs(VPTransformState &State);
557
558 /// Returns true if the reordering of FP operations is not allowed, but we are
559 /// able to vectorize with strict in-order reductions for the given RdxDesc.
560 bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
561
562 /// Create a broadcast instruction. This method generates a broadcast
563 /// instruction (shuffle) for loop invariant values and for the induction
564 /// value. If this is the induction variable then we extend it to N, N+1, ...
565 /// this is needed because each iteration in the loop corresponds to a SIMD
566 /// element.
567 virtual Value *getBroadcastInstrs(Value *V);
568
569protected:
570 friend class LoopVectorizationPlanner;
571
572 /// A small list of PHINodes.
573 using PhiVector = SmallVector<PHINode *, 4>;
574
575 /// A type for scalarized values in the new loop. Each value from the
576 /// original loop, when scalarized, is represented by UF x VF scalar values
577 /// in the new unrolled loop, where UF is the unroll factor and VF is the
578 /// vectorization factor.
579 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
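// (Illustrative shapes with hypothetical factors: for UF = 2 and VF = 4, a
//  single value from the original loop becomes 2 vector values of 4 lanes
//  each when widened (VectorParts), or a 2 x 4 set of scalar values indexed
//  as ScalarParts[Part][Lane] when scalarized.)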
580
581 /// Set up the values of the IVs correctly when exiting the vector loop.
582 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
583 Value *CountRoundDown, Value *EndValue,
584 BasicBlock *MiddleBlock);
585
586 /// Create a new induction variable inside L.
587 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
588 Value *Step, Instruction *DL);
589
590 /// Handle all cross-iteration phis in the header.
591 void fixCrossIterationPHIs(VPTransformState &State);
592
593 /// Fix a first-order recurrence. This is the second phase of vectorizing
594 /// this phi node.
595 void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
596
597 /// Fix a reduction cross-iteration phi. This is the second phase of
598 /// vectorizing this phi node.
599 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
600
601 /// Clear NSW/NUW flags from reduction instructions if necessary.
602 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
603 VPTransformState &State);
604
605 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
606 /// means we need to add the appropriate incoming value from the middle
607 /// block as exiting edges from the scalar epilogue loop (if present) are
608 /// already in place, and we exit the vector loop exclusively to the middle
609 /// block.
610 void fixLCSSAPHIs(VPTransformState &State);
611
612 /// Iteratively sink the scalarized operands of a predicated instruction into
613 /// the block that was created for it.
614 void sinkScalarOperands(Instruction *PredInst);
615
616 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
617 /// represented as.
618 void truncateToMinimalBitwidths(VPTransformState &State);
619
620 /// This function adds
621 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
622 /// to each vector element of Val. The sequence starts at StartIdx.
623 /// \p Opcode is relevant for FP induction variables.
624 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
625 Instruction::BinaryOps Opcode =
626 Instruction::BinaryOpsEnd);
627
628 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
629 /// variable on which to base the steps, \p Step is the size of the step, and
630 /// \p EntryVal is the value from the original loop that maps to the steps.
631 /// Note that \p EntryVal doesn't have to be an induction variable - it
632 /// can also be a truncate instruction.
633 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
634 const InductionDescriptor &ID, VPValue *Def,
635 VPValue *CastDef, VPTransformState &State);
636
637 /// Create a vector induction phi node based on an existing scalar one. \p
638 /// EntryVal is the value from the original loop that maps to the vector phi
639 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
640 /// truncate instruction, instead of widening the original IV, we widen a
641 /// version of the IV truncated to \p EntryVal's type.
642 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
643 Value *Step, Value *Start,
644 Instruction *EntryVal, VPValue *Def,
645 VPValue *CastDef,
646 VPTransformState &State);
647
648 /// Returns true if an instruction \p I should be scalarized instead of
649 /// vectorized for the chosen vectorization factor.
650 bool shouldScalarizeInstruction(Instruction *I) const;
651
652 /// Returns true if we should generate a scalar version of \p IV.
653 bool needsScalarInduction(Instruction *IV) const;
654
655 /// If there is a cast involved in the induction variable \p ID, which should
656 /// be ignored in the vectorized loop body, this function records the
657 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
658 /// cast. We had already proved that the casted Phi is equal to the uncasted
659 /// Phi in the vectorized loop (under a runtime guard), and therefore
660 /// there is no need to vectorize the cast - the same value can be used in the
661 /// vector loop for both the Phi and the cast.
662 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
663 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
664 ///
665 /// \p EntryVal is the value from the original loop that maps to the vector
666 /// phi node and is used to distinguish what is the IV currently being
667 /// processed - original one (if \p EntryVal is a phi corresponding to the
668 /// original IV) or the "newly-created" one based on the proof mentioned above
669 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
670 /// latter case \p EntryVal is a TruncInst and we must not record anything for
671 /// that IV, but it's error-prone to expect callers of this routine to care
672 /// about that, hence this explicit parameter.
673 void recordVectorLoopValueForInductionCast(
674 const InductionDescriptor &ID, const Instruction *EntryVal,
675 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
676 unsigned Part, unsigned Lane = UINT_MAX);
677
678 /// Generate a shuffle sequence that will reverse the vector Vec.
679 virtual Value *reverseVector(Value *Vec);
680
681 /// Returns (and creates if needed) the original loop trip count.
682 Value *getOrCreateTripCount(Loop *NewLoop);
683
684 /// Returns (and creates if needed) the trip count of the widened loop.
685 Value *getOrCreateVectorTripCount(Loop *NewLoop);
686
687 /// Returns a bitcasted value to the requested vector type.
688 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
689 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
690 const DataLayout &DL);
691
692 /// Emit a bypass check to see if the vector trip count is zero, including if
693 /// it overflows.
694 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
695
696 /// Emit a bypass check to see if all of the SCEV assumptions we've
697 /// had to make are correct. Returns the block containing the checks or
698 /// nullptr if no checks have been added.
699 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
700
701 /// Emit bypass checks to check any memory assumptions we may have made.
702 /// Returns the block containing the checks or nullptr if no checks have been
703 /// added.
704 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
705
706 /// Compute the transformed value of Index at offset StartValue using step
707 /// StepValue.
708 /// For integer induction, returns StartValue + Index * StepValue.
709 /// For pointer induction, returns StartValue[Index * StepValue].
710 /// FIXME: The newly created binary instructions should contain nsw/nuw
711 /// flags, which can be found from the original scalar operations.
712 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
713 const DataLayout &DL,
714 const InductionDescriptor &ID) const;
715
716 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
717 /// vector loop preheader, middle block and scalar preheader. Also
718 /// allocate a loop object for the new vector loop and return it.
719 Loop *createVectorLoopSkeleton(StringRef Prefix);
720
721 /// Create new phi nodes for the induction variables to resume iteration count
722 /// in the scalar epilogue, from where the vectorized loop left off (given by
723 /// \p VectorTripCount).
724 /// In cases where the loop skeleton is more complicated (eg. epilogue
725 /// vectorization) and the resume values can come from an additional bypass
726 /// block, the \p AdditionalBypass pair provides information about the bypass
727 /// block and the end value on the edge from bypass to this loop.
728 void createInductionResumeValues(
729 Loop *L, Value *VectorTripCount,
730 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
731
732 /// Complete the loop skeleton by adding debug MDs, creating appropriate
733 /// conditional branches in the middle block, preparing the builder and
734 /// running the verifier. Take in the vector loop \p L as argument, and return
735 /// the preheader of the completed vector loop.
736 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
737
738 /// Add additional metadata to \p To that was not present on \p Orig.
739 ///
740 /// Currently this is used to add the noalias annotations based on the
741 /// inserted memchecks. Use this for instructions that are *cloned* into the
742 /// vector loop.
743 void addNewMetadata(Instruction *To, const Instruction *Orig);
744
745 /// Add metadata from one instruction to another.
746 ///
747 /// This includes both the original MDs from \p From and additional ones (\see
748 /// addNewMetadata). Use this for *newly created* instructions in the vector
749 /// loop.
750 void addMetadata(Instruction *To, Instruction *From);
751
752 /// Similar to the previous function but it adds the metadata to a
753 /// vector of instructions.
754 void addMetadata(ArrayRef<Value *> To, Instruction *From);
755
756 /// Allow subclasses to override and print debug traces before/after vplan
757 /// execution, when trace information is requested.
758 virtual void printDebugTracesAtStart(){};
759 virtual void printDebugTracesAtEnd(){};
760
761 /// The original loop.
762 Loop *OrigLoop;
763
764 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
765 /// dynamic knowledge to simplify SCEV expressions and converts them to a
766 /// more usable form.
767 PredicatedScalarEvolution &PSE;
768
769 /// Loop Info.
770 LoopInfo *LI;
771
772 /// Dominator Tree.
773 DominatorTree *DT;
774
775 /// Alias Analysis.
776 AAResults *AA;
777
778 /// Target Library Info.
779 const TargetLibraryInfo *TLI;
780
781 /// Target Transform Info.
782 const TargetTransformInfo *TTI;
783
784 /// Assumption Cache.
785 AssumptionCache *AC;
786
787 /// Interface to emit optimization remarks.
788 OptimizationRemarkEmitter *ORE;
789
790 /// LoopVersioning. It's only set up (non-null) if memchecks were
791 /// used.
792 ///
793 /// This is currently only used to add no-alias metadata based on the
794 /// memchecks. The actual versioning is performed manually.
795 std::unique_ptr<LoopVersioning> LVer;
796
797 /// The vectorization SIMD factor to use. Each vector will have this many
798 /// vector elements.
799 ElementCount VF;
800
801 /// The vectorization unroll factor to use. Each scalar is vectorized to this
802 /// many different vector instructions.
803 unsigned UF;
804
805 /// The builder that we use
806 IRBuilder<> Builder;
807
808 // --- Vectorization state ---
809
810 /// The vector-loop preheader.
811 BasicBlock *LoopVectorPreHeader;
812
813 /// The scalar-loop preheader.
814 BasicBlock *LoopScalarPreHeader;
815
816 /// Middle Block between the vector and the scalar.
817 BasicBlock *LoopMiddleBlock;
818
819 /// The unique ExitBlock of the scalar loop if one exists. Note that
820 /// there can be multiple exiting edges reaching this block.
821 BasicBlock *LoopExitBlock;
822
823 /// The vector loop body.
824 BasicBlock *LoopVectorBody;
825
826 /// The scalar loop body.
827 BasicBlock *LoopScalarBody;
828
829 /// A list of all bypass blocks. The first block is the entry of the loop.
830 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
831
832 /// The new Induction variable which was added to the new block.
833 PHINode *Induction = nullptr;
834
835 /// The induction variable of the old basic block.
836 PHINode *OldInduction = nullptr;
837
838 /// Store instructions that were predicated.
839 SmallVector<Instruction *, 4> PredicatedInstructions;
840
841 /// Trip count of the original loop.
842 Value *TripCount = nullptr;
843
844 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
845 Value *VectorTripCount = nullptr;
846
847 /// The legality analysis.
848 LoopVectorizationLegality *Legal;
849
850 /// The profitability analysis.
851 LoopVectorizationCostModel *Cost;
852
853 // Record whether runtime checks are added.
854 bool AddedSafetyChecks = false;
855
856 // Holds the end values for each induction variable. We save the end values
857 // so we can later fix-up the external users of the induction variables.
858 DenseMap<PHINode *, Value *> IVEndValues;
859
860 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
861 // fixed up at the end of vector code generation.
862 SmallVector<PHINode *, 8> OrigPHIsToFix;
863
864 /// BFI and PSI are used to check for profile guided size optimizations.
865 BlockFrequencyInfo *BFI;
866 ProfileSummaryInfo *PSI;
867
868 // Whether this loop should be optimized for size based on profile guided size
869 // optimizations.
870 bool OptForSizeBasedOnProfile;
871
872 /// Structure to hold information about generated runtime checks, responsible
873 /// for cleaning the checks, if vectorization turns out unprofitable.
874 GeneratedRTChecks &RTChecks;
875};
876
877class InnerLoopUnroller : public InnerLoopVectorizer {
878public:
879 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
880 LoopInfo *LI, DominatorTree *DT,
881 const TargetLibraryInfo *TLI,
882 const TargetTransformInfo *TTI, AssumptionCache *AC,
883 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
884 LoopVectorizationLegality *LVL,
885 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
886 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
887 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
888 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
889 BFI, PSI, Check) {}
890
891private:
892 Value *getBroadcastInstrs(Value *V) override;
893 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
894 Instruction::BinaryOps Opcode =
895 Instruction::BinaryOpsEnd) override;
896 Value *reverseVector(Value *Vec) override;
897};
898
899/// Encapsulate information regarding vectorization of a loop and its epilogue.
900/// This information is meant to be updated and used across two stages of
901/// epilogue vectorization.
902struct EpilogueLoopVectorizationInfo {
903 ElementCount MainLoopVF = ElementCount::getFixed(0);
904 unsigned MainLoopUF = 0;
905 ElementCount EpilogueVF = ElementCount::getFixed(0);
906 unsigned EpilogueUF = 0;
907 BasicBlock *MainLoopIterationCountCheck = nullptr;
908 BasicBlock *EpilogueIterationCountCheck = nullptr;
909 BasicBlock *SCEVSafetyCheck = nullptr;
910 BasicBlock *MemSafetyCheck = nullptr;
911 Value *TripCount = nullptr;
912 Value *VectorTripCount = nullptr;
913
914 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
915 unsigned EUF)
916 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
917 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
918 assert(EUF == 1 &&
919 "A high UF for the epilogue loop is likely not beneficial.");
920 }
921};
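// (Illustrative use with hypothetical factors: EpilogueLoopVectorizationInfo
//  EPI(/*MVF=*/8, /*MUF=*/2, /*EVF=*/4, /*EUF=*/1) describes a main vector
//  loop with VF = 8 and UF = 2 plus a vectorized epilogue with VF = 4; EUF
//  must be 1, as the assertion above enforces.)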
922
923/// An extension of the inner loop vectorizer that creates a skeleton for a
924/// vectorized loop that has its epilogue (residual) also vectorized.
925/// The idea is to run the vplan on a given loop twice, firstly to set up the
926/// skeleton and vectorize the main loop, and secondly to complete the skeleton
927/// from the first step and vectorize the epilogue. This is achieved by
928/// deriving two concrete strategy classes from this base class and invoking
929/// them in succession from the loop vectorizer planner.
930class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
931public:
932 InnerLoopAndEpilogueVectorizer(
933 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
934 DominatorTree *DT, const TargetLibraryInfo *TLI,
935 const TargetTransformInfo *TTI, AssumptionCache *AC,
936 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
937 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
938 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
939 GeneratedRTChecks &Checks)
940 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
941 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
942 Checks),
943 EPI(EPI) {}
944
945 // Override this function to handle the more complex control flow around the
946 // three loops.
947 BasicBlock *createVectorizedLoopSkeleton() final override {
948 return createEpilogueVectorizedLoopSkeleton();
949 }
950
951 /// The interface for creating a vectorized skeleton using one of two
952 /// different strategies, each corresponding to one execution of the vplan
953 /// as described above.
954 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
955
956 /// Holds and updates state information required to vectorize the main loop
957 /// and its epilogue in two separate passes. This setup helps us avoid
958 /// regenerating and recomputing runtime safety checks. It also helps us to
959 /// shorten the iteration-count-check path length for the cases where the
960 /// iteration count of the loop is so small that the main vector loop is
961 /// completely skipped.
962 EpilogueLoopVectorizationInfo &EPI;
963};
964
965/// A specialized derived class of inner loop vectorizer that performs
966/// vectorization of *main* loops in the process of vectorizing loops and their
967/// epilogues.
968class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
969public:
970 EpilogueVectorizerMainLoop(
971 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
972 DominatorTree *DT, const TargetLibraryInfo *TLI,
973 const TargetTransformInfo *TTI, AssumptionCache *AC,
974 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
975 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
976 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
977 GeneratedRTChecks &Check)
978 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
979 EPI, LVL, CM, BFI, PSI, Check) {}
980 /// Implements the interface for creating a vectorized skeleton using the
981 /// *main loop* strategy (ie the first pass of vplan execution).
982 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
983
984protected:
985 /// Emits an iteration count bypass check once for the main loop (when \p
986 /// ForEpilogue is false) and once for the epilogue loop (when \p
987 /// ForEpilogue is true).
988 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
989 bool ForEpilogue);
990 void printDebugTracesAtStart() override;
991 void printDebugTracesAtEnd() override;
992};
993
994// A specialized derived class of inner loop vectorizer that performs
995// vectorization of *epilogue* loops in the process of vectorizing loops and
996// their epilogues.
997class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
998public:
999 EpilogueVectorizerEpilogueLoop(
1000 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1001 DominatorTree *DT, const TargetLibraryInfo *TLI,
1002 const TargetTransformInfo *TTI, AssumptionCache *AC,
1003 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1004 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1005 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1006 GeneratedRTChecks &Checks)
1007 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1008 EPI, LVL, CM, BFI, PSI, Checks) {}
1009 /// Implements the interface for creating a vectorized skeleton using the
1010 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1011 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1012
1013protected:
1014 /// Emits an iteration count bypass check after the main vector loop has
1015 /// finished to see if there are any iterations left to execute by either
1016 /// the vector epilogue or the scalar epilogue.
1017 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1018 BasicBlock *Bypass,
1019 BasicBlock *Insert);
1020 void printDebugTracesAtStart() override;
1021 void printDebugTracesAtEnd() override;
1022};
1023} // end namespace llvm
1024
1025/// Look for a meaningful debug location on the instruction or its
1026/// operands.
1027static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1028 if (!I)
1029 return I;
1030
1031 DebugLoc Empty;
1032 if (I->getDebugLoc() != Empty)
1033 return I;
1034
1035 for (Use &Op : I->operands()) {
1036 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1037 if (OpInst->getDebugLoc() != Empty)
1038 return OpInst;
1039 }
1040
1041 return I;
1042}
1043
1044void InnerLoopVectorizer::setDebugLocFromInst(
1045 const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1046 IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1047 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1048 const DILocation *DIL = Inst->getDebugLoc();
1049
1050 // When an FSDiscriminator is enabled, we don't need to add the multiply
1051 // factors to the discriminators.
1052 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1053 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1054 // FIXME: For scalable vectors, assume vscale=1.
1055 auto NewDIL =
1056 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1057 if (NewDIL)
1058 B->SetCurrentDebugLocation(NewDIL.getValue());
1059 else
1060 LLVM_DEBUG(dbgs()
1061 << "Failed to create new discriminator: "
1062 << DIL->getFilename() << " Line: " << DIL->getLine());
1063 } else
1064 B->SetCurrentDebugLocation(DIL);
1065 } else
1066 B->SetCurrentDebugLocation(DebugLoc());
1067}
1068
1069/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1070/// is passed, the message relates to that particular instruction.
1071#ifndef NDEBUG
1072static void debugVectorizationMessage(const StringRef Prefix,
1073 const StringRef DebugMsg,
1074 Instruction *I) {
1075 dbgs() << "LV: " << Prefix << DebugMsg;
1076 if (I != nullptr)
1077 dbgs() << " " << *I;
1078 else
1079 dbgs() << '.';
1080 dbgs() << '\n';
1081}
1082#endif
1083
1084/// Create an analysis remark that explains why vectorization failed
1085///
1086/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1087/// RemarkName is the identifier for the remark. If \p I is passed it is an
1088/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1089/// the location of the remark. \return the remark object that can be
1090/// streamed to.
1091static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1092 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1093 Value *CodeRegion = TheLoop->getHeader();
1094 DebugLoc DL = TheLoop->getStartLoc();
1095
1096 if (I) {
1097 CodeRegion = I->getParent();
1098 // If there is no debug location attached to the instruction, revert back to
1099 // using the loop's.
1100 if (I->getDebugLoc())
1101 DL = I->getDebugLoc();
1102 }
1103
1104 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1105}
1106
1107/// Return a value for Step multiplied by VF.
1108static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1109 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1110 Constant *StepVal = ConstantInt::get(
1111 Step->getType(),
1112 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1113 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1114}
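// A minimal worked example (hypothetical values, not taken from this file):
// with Step = 2 and VF = <vscale x 4>, the scaled constant is 2 * 4 = 8, and
// because the VF is scalable the result is B.CreateVScale(8), i.e. 8 * vscale
// at runtime; for a fixed VF = 4 the plain constant 8 is returned instead.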
1115
1116namespace llvm {
1117
1118/// Return the runtime value for VF.
1119Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1120 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1121 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1122}
1123
1124void reportVectorizationFailure(const StringRef DebugMsg,
1125 const StringRef OREMsg, const StringRef ORETag,
1126 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1127 Instruction *I) {
1128 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1129 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1130 ORE->emit(
1131 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1132 << "loop not vectorized: " << OREMsg);
1133}
1134
1135void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1136 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1137 Instruction *I) {
1138 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1139 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140 ORE->emit(
1141 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1142 << Msg);
1143}
1144
1145} // end namespace llvm
1146
1147#ifndef NDEBUG
1148/// \return string containing a file name and a line # for the given loop.
1149static std::string getDebugLocString(const Loop *L) {
1150 std::string Result;
1151 if (L) {
1152 raw_string_ostream OS(Result);
1153 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1154 LoopDbgLoc.print(OS);
1155 else
1156 // Just print the module name.
1157 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1158 OS.flush();
1159 }
1160 return Result;
1161}
1162#endif
1163
1164void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1165 const Instruction *Orig) {
1166 // If the loop was versioned with memchecks, add the corresponding no-alias
1167 // metadata.
1168 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1169 LVer->annotateInstWithNoAlias(To, Orig);
1170}
1171
1172void InnerLoopVectorizer::addMetadata(Instruction *To,
1173 Instruction *From) {
1174 propagateMetadata(To, From);
1175 addNewMetadata(To, From);
1176}
1177
1178void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1179 Instruction *From) {
1180 for (Value *V : To) {
1181 if (Instruction *I = dyn_cast<Instruction>(V))
1182 addMetadata(I, From);
1183 }
1184}
1185
1186namespace llvm {
1187
1188// Loop vectorization cost-model hints how the scalar epilogue loop should be
1189// lowered.
1190enum ScalarEpilogueLowering {
1191
1192 // The default: allowing scalar epilogues.
1193 CM_ScalarEpilogueAllowed,
1194
1195 // Vectorization with OptForSize: don't allow epilogues.
1196 CM_ScalarEpilogueNotAllowedOptSize,
1197
1198 // A special case of vectorization with OptForSize: loops with a very small
1199 // trip count are considered for vectorization under OptForSize, thereby
1200 // making sure the cost of their loop body is dominant, free of runtime
1201 // guards and scalar iteration overheads.
1202 CM_ScalarEpilogueNotAllowedLowTripLoop,
1203
1204 // Loop hint predicate indicating an epilogue is undesired.
1205 CM_ScalarEpilogueNotNeededUsePredicate,
1206
1207 // Directive indicating we must either tail fold or not vectorize
1208 CM_ScalarEpilogueNotAllowedUsePredicate
1209};
1210
1211/// ElementCountComparator creates a total ordering for ElementCount
1212/// for the purposes of using it in a set structure.
1213struct ElementCountComparator {
1214 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1215 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1216 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1217 }
1218};
1219using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
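// Under this comparator (illustrative ordering, not from this file), fixed
// element counts sort before scalable ones, e.g. 2 < 4 < 8 < vscale x 2 <
// vscale x 4: isScalable() is the primary key and the known minimum value is
// the tie-breaker.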
1220
1221/// LoopVectorizationCostModel - estimates the expected speedups due to
1222/// vectorization.
1223/// In many cases vectorization is not profitable. This can happen because of
1224/// a number of reasons. In this class we mainly attempt to predict the
1225/// expected speedup/slowdowns due to the supported instruction set. We use the
1226/// TargetTransformInfo to query the different backends for the cost of
1227/// different operations.
1228class LoopVectorizationCostModel {
1229public:
1230 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1231 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1232 LoopVectorizationLegality *Legal,
1233 const TargetTransformInfo &TTI,
1234 const TargetLibraryInfo *TLI, DemandedBits *DB,
1235 AssumptionCache *AC,
1236 OptimizationRemarkEmitter *ORE, const Function *F,
1237 const LoopVectorizeHints *Hints,
1238 InterleavedAccessInfo &IAI)
1239 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1240 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1241 Hints(Hints), InterleaveInfo(IAI) {}
1242
1243 /// \return An upper bound for the vectorization factors (both fixed and
1244 /// scalable). If the factors are 0, vectorization and interleaving should be
1245 /// avoided up front.
1246 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1247
1248 /// \return True if runtime checks are required for vectorization, and false
1249 /// otherwise.
1250 bool runtimeChecksRequired();
1251
1252 /// \return The most profitable vectorization factor and the cost of that VF.
1253 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1254 /// then this vectorization factor will be selected if vectorization is
1255 /// possible.
1256 VectorizationFactor
1257 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1258
1259 VectorizationFactor
1260 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1261 const LoopVectorizationPlanner &LVP);
1262
1263 /// Setup cost-based decisions for user vectorization factor.
1264 /// \return true if the UserVF is a feasible VF to be chosen.
1265 bool selectUserVectorizationFactor(ElementCount UserVF) {
1266 collectUniformsAndScalars(UserVF);
1267 collectInstsToScalarize(UserVF);
1268 return expectedCost(UserVF).first.isValid();
1269 }
1270
1271 /// \return The size (in bits) of the smallest and widest types in the code
1272 /// that needs to be vectorized. We ignore values that remain scalar such as
1273 /// 64 bit loop indices.
1274 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1275
1276 /// \return The desired interleave count.
1277 /// If interleave count has been specified by metadata it will be returned.
1278 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1279 /// are the selected vectorization factor and the cost of the selected VF.
1280 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1281
1282 /// A memory access instruction may be vectorized in more than one way.
1283 /// The form of the instruction after vectorization depends on its cost.
1284 /// This function takes cost-based decisions for Load/Store instructions
1285 /// and collects them in a map. This decision map is used for building
1286 /// the lists of loop-uniform and loop-scalar instructions.
1287 /// The calculated cost is saved with widening decision in order to
1288 /// avoid redundant calculations.
1289 void setCostBasedWideningDecision(ElementCount VF);
1290
1291 /// A struct that represents some properties of the register usage
1292 /// of a loop.
1293 struct RegisterUsage {
1294 /// Holds the number of loop invariant values that are used in the loop.
1295 /// The key is ClassID of target-provided register class.
1296 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1297 /// Holds the maximum number of concurrent live intervals in the loop.
1298 /// The key is ClassID of target-provided register class.
1299 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1300 };
1301
1302 /// \return Returns information about the register usages of the loop for the
1303 /// given vectorization factors.
1304 SmallVector<RegisterUsage, 8>
1305 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1306
1307 /// Collect values we want to ignore in the cost model.
1308 void collectValuesToIgnore();
1309
1310 /// Collect all element types in the loop for which widening is needed.
1311 void collectElementTypesForWidening();
1312
1313 /// Split reductions into those that happen in the loop, and those that happen
1314 /// outside. In-loop reductions are collected into InLoopReductionChains.
1315 void collectInLoopReductions();
1316
1317 /// Returns true if we should use strict in-order reductions for the given
1318 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1319 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1320 /// of FP operations.
1321 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1322 return EnableStrictReductions && !Hints->allowReordering() &&
1323 RdxDesc.isOrdered();
1324 }
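// A rough illustration (assumed behavior, for illustration only): a
// floating-point add reduction whose RecurrenceDescriptor is marked ordered
// (FP reassociation is not permitted) is kept as an in-order, in-loop
// reduction when -enable-strict-reductions is set and no hint allows
// reordering; otherwise the usual widened reduction is used.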
1325
1326 /// \returns The smallest bitwidth each instruction can be represented with.
1327 /// The vector equivalents of these instructions should be truncated to this
1328 /// type.
1329 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1330 return MinBWs;
1331 }
1332
1333 /// \returns True if it is more profitable to scalarize instruction \p I for
1334 /// vectorization factor \p VF.
1335 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1336 assert(VF.isVector() &&
1337 "Profitable to scalarize relevant only for VF > 1.");
1338
1339 // Cost model is not run in the VPlan-native path - return conservative
1340 // result until this changes.
1341 if (EnableVPlanNativePath)
1342 return false;
1343
1344 auto Scalars = InstsToScalarize.find(VF);
1345 assert(Scalars != InstsToScalarize.end() &&
1346 "VF not yet analyzed for scalarization profitability");
1347 return Scalars->second.find(I) != Scalars->second.end();
1348 }
1349
1350 /// Returns true if \p I is known to be uniform after vectorization.
1351 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1352 if (VF.isScalar())
1353 return true;
1354
1355 // Cost model is not run in the VPlan-native path - return conservative
1356 // result until this changes.
1357 if (EnableVPlanNativePath)
1358 return false;
1359
1360 auto UniformsPerVF = Uniforms.find(VF);
1361 assert(UniformsPerVF != Uniforms.end() &&
1362 "VF not yet analyzed for uniformity");
1363 return UniformsPerVF->second.count(I);
1364 }
1365
1366 /// Returns true if \p I is known to be scalar after vectorization.
1367 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1368 if (VF.isScalar())
1369 return true;
1370
1371 // Cost model is not run in the VPlan-native path - return conservative
1372 // result until this changes.
1373 if (EnableVPlanNativePath)
1374 return false;
1375
1376 auto ScalarsPerVF = Scalars.find(VF);
1377 assert(ScalarsPerVF != Scalars.end() &&
1378 "Scalar values are not calculated for VF");
1379 return ScalarsPerVF->second.count(I);
1380 }
1381
1382 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1383 /// for vectorization factor \p VF.
1384 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1385 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1386 !isProfitableToScalarize(I, VF) &&
1387 !isScalarAfterVectorization(I, VF);
1388 }
1389
1390 /// Decision that was taken during cost calculation for memory instruction.
1391 enum InstWidening {
1392 CM_Unknown,
1393 CM_Widen, // For consecutive accesses with stride +1.
1394 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1395 CM_Interleave,
1396 CM_GatherScatter,
1397 CM_Scalarize
1398 };
1399
1400 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1401 /// instruction \p I and vector width \p VF.
1402 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1403 InstructionCost Cost) {
1404 assert(VF.isVector() && "Expected VF >=2");
1405 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1406 }
1407
1408 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1409 /// interleaving group \p Grp and vector width \p VF.
1410 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1411 ElementCount VF, InstWidening W,
1412 InstructionCost Cost) {
1413 assert(VF.isVector() && "Expected VF >=2");
1414 /// Broadcast this decision to all instructions inside the group.
1415 /// But the cost will be assigned to one instruction only.
1416 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1417 if (auto *I = Grp->getMember(i)) {
1418 if (Grp->getInsertPos() == I)
1419 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1420 else
1421 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1422 }
1423 }
1424 }
1425
1426 /// Return the cost model decision for the given instruction \p I and vector
1427 /// width \p VF. Return CM_Unknown if this instruction did not pass
1428 /// through the cost modeling.
1429 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1430 assert(VF.isVector() && "Expected VF to be a vector VF");
1431 // Cost model is not run in the VPlan-native path - return conservative
1432 // result until this changes.
1433 if (EnableVPlanNativePath)
1434 return CM_GatherScatter;
1435
1436 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1437 auto Itr = WideningDecisions.find(InstOnVF);
1438 if (Itr == WideningDecisions.end())
1439 return CM_Unknown;
1440 return Itr->second.first;
1441 }
1442
1443 /// Return the vectorization cost for the given instruction \p I and vector
1444 /// width \p VF.
1445 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1446 assert(VF.isVector() && "Expected VF >=2");
1447 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1448 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1449 "The cost is not calculated");
1450 return WideningDecisions[InstOnVF].second;
1451 }
1452
1453 /// Return True if instruction \p I is an optimizable truncate whose operand
1454 /// is an induction variable. Such a truncate will be removed by adding a new
1455 /// induction variable with the destination type.
1456 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1457 // If the instruction is not a truncate, return false.
1458 auto *Trunc = dyn_cast<TruncInst>(I);
1459 if (!Trunc)
1460 return false;
1461
1462 // Get the source and destination types of the truncate.
1463 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1464 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1465
1466 // If the truncate is free for the given types, return false. Replacing a
1467 // free truncate with an induction variable would add an induction variable
1468 // update instruction to each iteration of the loop. We exclude from this
1469 // check the primary induction variable since it will need an update
1470 // instruction regardless.
1471 Value *Op = Trunc->getOperand(0);
1472 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1473 return false;
1474
1475 // If the truncated value is not an induction variable, return false.
1476 return Legal->isInductionPhi(Op);
1477 }
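// An illustrative case (assumed IR, not from this file): given an i64
// induction phi %i and a user "%t = trunc i64 %i to i32", the truncate is
// optimizable when %i is an induction phi and the trunc is not free for the
// vectorized types, so a separate i32 induction can replace the truncate.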
1478
1479 /// Collects the instructions to scalarize for each predicated instruction in
1480 /// the loop.
1481 void collectInstsToScalarize(ElementCount VF);
1482
1483 /// Collect Uniform and Scalar values for the given \p VF.
1484 /// The sets depend on CM decision for Load/Store instructions
1485 /// that may be vectorized as interleave, gather-scatter or scalarized.
1486 void collectUniformsAndScalars(ElementCount VF) {
1487 // Do the analysis once.
1488 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1489 return;
1490 setCostBasedWideningDecision(VF);
1491 collectLoopUniforms(VF);
1492 collectLoopScalars(VF);
1493 }
1494
1495 /// Returns true if the target machine supports masked store operation
1496 /// for the given \p DataType and kind of access to \p Ptr.
1497 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1498 return Legal->isConsecutivePtr(Ptr) &&
1499 TTI.isLegalMaskedStore(DataType, Alignment);
1500 }
1501
1502 /// Returns true if the target machine supports masked load operation
1503 /// for the given \p DataType and kind of access to \p Ptr.
1504 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1505 return Legal->isConsecutivePtr(Ptr) &&
1506 TTI.isLegalMaskedLoad(DataType, Alignment);
1507 }
1508
1509 /// Returns true if the target machine can represent \p V as a masked gather
1510 /// or scatter operation.
1511 bool isLegalGatherOrScatter(Value *V) {
1512 bool LI = isa<LoadInst>(V);
1513 bool SI = isa<StoreInst>(V);
1514 if (!LI && !SI)
1515 return false;
1516 auto *Ty = getLoadStoreType(V);
1517 Align Align = getLoadStoreAlignment(V);
1518 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520 }
1521
1522 /// Returns true if the target machine supports all of the reduction
1523 /// variables found for the given VF.
1524 bool canVectorizeReductions(ElementCount VF) const {
1525 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528 }));
1529 }
1530
1531 /// Returns true if \p I is an instruction that will be scalarized with
1532 /// predication. Such instructions include conditional stores and
1533 /// instructions that may divide by zero.
1534 /// If a non-zero VF has been calculated, we check if I will be scalarized with
1535 /// predication for that VF.
1536 bool isScalarWithPredication(Instruction *I) const;
1537
1538 // Returns true if \p I is an instruction that will be predicated either
1539 // through scalar predication or masked load/store or masked gather/scatter.
1540 // Superset of instructions that return true for isScalarWithPredication.
1541 bool isPredicatedInst(Instruction *I) {
1542 if (!blockNeedsPredication(I->getParent()))
1543 return false;
1544 // Loads and stores that need some form of masked operation are predicated
1545 // instructions.
1546 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1547 return Legal->isMaskRequired(I);
1548 return isScalarWithPredication(I);
1549 }
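// Typical examples (illustrative, not from this file): a store that only
// executes under an if-converted condition is predicated via a masked store
// or a scalarized, predicated store, and a udiv whose divisor may be zero in
// masked-off lanes must be scalarized with predication; both count as
// predicated instructions here.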
1550
1551 /// Returns true if \p I is a memory instruction with consecutive memory
1552 /// access that can be widened.
1553 bool
1554 memoryInstructionCanBeWidened(Instruction *I,
1555 ElementCount VF = ElementCount::getFixed(1));
1556
1557 /// Returns true if \p I is a memory instruction in an interleaved-group
1558 /// of memory accesses that can be vectorized with wide vector loads/stores
1559 /// and shuffles.
1560 bool
1561 interleavedAccessCanBeWidened(Instruction *I,
1562 ElementCount VF = ElementCount::getFixed(1));
1563
1564 /// Check if \p Instr belongs to any interleaved access group.
1565 bool isAccessInterleaved(Instruction *Instr) {
1566 return InterleaveInfo.isInterleaved(Instr);
1567 }
1568
1569 /// Get the interleaved access group that \p Instr belongs to.
1570 const InterleaveGroup<Instruction> *
1571 getInterleavedAccessGroup(Instruction *Instr) {
1572 return InterleaveInfo.getInterleaveGroup(Instr);
1573 }
1574
1575 /// Returns true if we're required to use a scalar epilogue for at least
1576 /// the final iteration of the original loop.
1577 bool requiresScalarEpilogue(ElementCount VF) const {
1578 if (!isScalarEpilogueAllowed())
1579 return false;
1580 // If we might exit from anywhere but the latch, we must run the exiting
1581 // iteration in scalar form.
1582 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1583 return true;
1584 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1585 }
1586
1587 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1588 /// loop hint annotation.
1589 bool isScalarEpilogueAllowed() const {
1590 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1591 }
1592
1593 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1594 bool foldTailByMasking() const { return FoldTailByMasking; }
1595
1596 bool blockNeedsPredication(BasicBlock *BB) const {
1597 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1598 }
1599
1600 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1601 /// nodes to the chain of instructions representing the reductions. Uses a
1602 /// MapVector to ensure deterministic iteration order.
1603 using ReductionChainMap =
1604 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1605
1606 /// Return the chain of instructions representing an inloop reduction.
1607 const ReductionChainMap &getInLoopReductionChains() const {
1608 return InLoopReductionChains;
1609 }
1610
1611 /// Returns true if the Phi is part of an inloop reduction.
1612 bool isInLoopReduction(PHINode *Phi) const {
1613 return InLoopReductionChains.count(Phi);
1614 }
1615
1616 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1617 /// with factor VF. Return the cost of the instruction, including
1618 /// scalarization overhead if it's needed.
1619 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1620
1621 /// Estimate cost of a call instruction CI if it were vectorized with factor
1622 /// VF. Return the cost of the instruction, including scalarization overhead
1623 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1624 /// scalarized -
1625 /// i.e. either a vector version isn't available, or it is too expensive.
1626 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1627 bool &NeedToScalarize) const;
1628
1629 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1630 /// that of B.
1631 bool isMoreProfitable(const VectorizationFactor &A,
1632 const VectorizationFactor &B) const;
1633
1634 /// Invalidates decisions already taken by the cost model.
1635 void invalidateCostModelingDecisions() {
1636 WideningDecisions.clear();
1637 Uniforms.clear();
1638 Scalars.clear();
1639 }
1640
1641private:
1642 unsigned NumPredStores = 0;
1643
1644 /// \return An upper bound for the vectorization factors for both
1645 /// fixed and scalable vectorization, where the minimum-known number of
1646 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1647 /// disabled or unsupported, then the scalable part will be equal to
1648 /// ElementCount::getScalable(0).
1649 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1650 ElementCount UserVF);
1651
1652 /// \return the maximized element count based on the target's vector
1653 /// registers and the loop trip-count, but limited to a maximum safe VF.
1654 /// This is a helper function of computeFeasibleMaxVF.
1655 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1656 /// issue that occurred on one of the buildbots which cannot be reproduced
1657 /// without having access to the proprietary compiler (see comments on
1658 /// D98509). The issue is currently under investigation and this workaround
1659 /// will be removed as soon as possible.
1660 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1661 unsigned SmallestType,
1662 unsigned WidestType,
1663 const ElementCount &MaxSafeVF);
1664
1665 /// \return the maximum legal scalable VF, based on the safe max number
1666 /// of elements.
1667 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1668
1669 /// The vectorization cost is a combination of the cost itself and a boolean
1670 /// indicating whether any of the contributing operations will actually
1671 /// operate on vector values after type legalization in the backend. If this
1672 /// latter value is false, then all operations will be scalarized (i.e. no
1673 /// vectorization has actually taken place).
1674 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1675
1676 /// Returns the expected execution cost. The unit of the cost does
1677 /// not matter because we use the 'cost' units to compare different
1678 /// vector widths. The cost that is returned is *not* normalized by
1679 /// the factor width. If \p Invalid is not nullptr, this function
1680 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1681 /// each instruction that has an Invalid cost for the given VF.
1682 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1683 VectorizationCostTy
1684 expectedCost(ElementCount VF,
1685 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1686
1687 /// Returns the execution time cost of an instruction for a given vector
1688 /// width. Vector width of one means scalar.
1689 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1690
1691 /// The cost-computation logic from getInstructionCost which provides
1692 /// the vector type as an output parameter.
1693 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1694 Type *&VectorTy);
1695
1696 /// Return the cost of instructions in an inloop reduction pattern, if I is
1697 /// part of that pattern.
1698 Optional<InstructionCost>
1699 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1700 TTI::TargetCostKind CostKind);
1701
1702 /// Calculate vectorization cost of memory instruction \p I.
1703 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1704
1705 /// The cost computation for scalarized memory instruction.
1706 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1707
1708 /// The cost computation for interleaving group of memory instructions.
1709 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1710
1711 /// The cost computation for Gather/Scatter instruction.
1712 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1713
1714 /// The cost computation for widening instruction \p I with consecutive
1715 /// memory access.
1716 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1717
1718 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1719 /// Load: scalar load + broadcast.
1720 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1721 /// element)
1722 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1723
1724 /// Estimate the overhead of scalarizing an instruction. This is a
1725 /// convenience wrapper for the type-based getScalarizationOverhead API.
1726 InstructionCost getScalarizationOverhead(Instruction *I,
1727 ElementCount VF) const;
1728
1729 /// Returns whether the instruction is a load or store and will be emitted
1730 /// as a vector operation.
1731 bool isConsecutiveLoadOrStore(Instruction *I);
1732
1733 /// Returns true if an artificially high cost for emulated masked memrefs
1734 /// should be used.
1735 bool useEmulatedMaskMemRefHack(Instruction *I);
1736
1737 /// Map of scalar integer values to the smallest bitwidth they can be legally
1738 /// represented as. The vector equivalents of these values should be truncated
1739 /// to this type.
1740 MapVector<Instruction *, uint64_t> MinBWs;
1741
1742 /// A type representing the costs for instructions if they were to be
1743 /// scalarized rather than vectorized. The entries are Instruction-Cost
1744 /// pairs.
1745 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1746
1747 /// A set containing all BasicBlocks that are known to be present after
1748 /// vectorization as a predicated block.
1749 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1750
1751 /// Records whether it is allowed to have the original scalar loop execute at
1752 /// least once. This may be needed as a fallback loop in case runtime
1753 /// aliasing/dependence checks fail, or to handle the tail/remainder
1754 /// iterations when the trip count is unknown or doesn't divide by the VF,
1755 /// or as a peel-loop to handle gaps in interleave-groups.
1756 /// Under optsize and when the trip count is very small we don't allow any
1757 /// iterations to execute in the scalar loop.
1758 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1759
1760 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1761 bool FoldTailByMasking = false;
1762
1763 /// A map holding scalar costs for different vectorization factors. The
1764 /// presence of a cost for an instruction in the mapping indicates that the
1765 /// instruction will be scalarized when vectorizing with the associated
1766 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1767 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1768
1769 /// Holds the instructions known to be uniform after vectorization.
1770 /// The data is collected per VF.
1771 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1772
1773 /// Holds the instructions known to be scalar after vectorization.
1774 /// The data is collected per VF.
1775 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1776
1777 /// Holds the instructions (address computations) that are forced to be
1778 /// scalarized.
1779 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1780
1781 /// PHINodes of the reductions that should be expanded in-loop along with
1782 /// their associated chains of reduction operations, in program order from top
1783 /// (PHI) to bottom
1784 ReductionChainMap InLoopReductionChains;
1785
1786 /// A Map of inloop reduction operations and their immediate chain operand.
1787 /// FIXME: This can be removed once reductions can be costed correctly in
1788 /// vplan. This was added to allow quick lookup to the inloop operations,
1789 /// without having to loop through InLoopReductionChains.
1790 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1791
1792 /// Returns the expected difference in cost from scalarizing the expression
1793 /// feeding a predicated instruction \p PredInst. The instructions to
1794 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1795 /// non-negative return value implies the expression will be scalarized.
1796 /// Currently, only single-use chains are considered for scalarization.
1797 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1798 ElementCount VF);
1799
1800 /// Collect the instructions that are uniform after vectorization. An
1801 /// instruction is uniform if we represent it with a single scalar value in
1802 /// the vectorized loop corresponding to each vector iteration. Examples of
1803 /// uniform instructions include pointer operands of consecutive or
1804 /// interleaved memory accesses. Note that although uniformity implies an
1805 /// instruction will be scalar, the reverse is not true. In general, a
1806 /// scalarized instruction will be represented by VF scalar values in the
1807 /// vectorized loop, each corresponding to an iteration of the original
1808 /// scalar loop.
1809 void collectLoopUniforms(ElementCount VF);
1810
1811 /// Collect the instructions that are scalar after vectorization. An
1812 /// instruction is scalar if it is known to be uniform or will be scalarized
1813 /// during vectorization. Non-uniform scalarized instructions will be
1814 /// represented by VF values in the vectorized loop, each corresponding to an
1815 /// iteration of the original scalar loop.
1816 void collectLoopScalars(ElementCount VF);
1817
1818 /// Keeps cost model vectorization decision and cost for instructions.
1819 /// Right now it is used for memory instructions only.
1820 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1821 std::pair<InstWidening, InstructionCost>>;
1822
1823 DecisionList WideningDecisions;
1824
1825 /// Returns true if \p V is expected to be vectorized and it needs to be
1826 /// extracted.
1827 bool needsExtract(Value *V, ElementCount VF) const {
1828 Instruction *I = dyn_cast<Instruction>(V);
1829 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1830 TheLoop->isLoopInvariant(I))
1831 return false;
1832
1833 // Assume we can vectorize V (and hence we need extraction) if the
1834 // scalars are not computed yet. This can happen, because it is called
1835 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1836 // the scalars are collected. That should be a safe assumption in most
1837 // cases, because we check if the operands have vectorizable types
1838 // beforehand in LoopVectorizationLegality.
1839 return Scalars.find(VF) == Scalars.end() ||
1840 !isScalarAfterVectorization(I, VF);
1841 };
1842
1843 /// Returns a range containing only operands needing to be extracted.
1844 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1845 ElementCount VF) const {
1846 return SmallVector<Value *, 4>(make_filter_range(
1847 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1848 }
1849
1850 /// Determines if we have the infrastructure to vectorize loop \p L and its
1851 /// epilogue, assuming the main loop is vectorized by \p VF.
1852 bool isCandidateForEpilogueVectorization(const Loop &L,
1853 const ElementCount VF) const;
1854
1855 /// Returns true if epilogue vectorization is considered profitable, and
1856 /// false otherwise.
1857 /// \p VF is the vectorization factor chosen for the original loop.
1858 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1859
1860public:
1861 /// The loop that we evaluate.
1862 Loop *TheLoop;
1863
1864 /// Predicated scalar evolution analysis.
1865 PredicatedScalarEvolution &PSE;
1866
1867 /// Loop Info analysis.
1868 LoopInfo *LI;
1869
1870 /// Vectorization legality.
1871 LoopVectorizationLegality *Legal;
1872
1873 /// Vector target information.
1874 const TargetTransformInfo &TTI;
1875
1876 /// Target Library Info.
1877 const TargetLibraryInfo *TLI;
1878
1879 /// Demanded bits analysis.
1880 DemandedBits *DB;
1881
1882 /// Assumption cache.
1883 AssumptionCache *AC;
1884
1885 /// Interface to emit optimization remarks.
1886 OptimizationRemarkEmitter *ORE;
1887
1888 const Function *TheFunction;
1889
1890 /// Loop Vectorize Hint.
1891 const LoopVectorizeHints *Hints;
1892
1893 /// The interleave access information contains groups of interleaved accesses
1894 /// with the same stride and close to each other.
1895 InterleavedAccessInfo &InterleaveInfo;
1896
1897 /// Values to ignore in the cost model.
1898 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1899
1900 /// Values to ignore in the cost model when VF > 1.
1901 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1902
1903 /// All element types found in the loop.
1904 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1905
1906 /// Profitable vector factors.
1907 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1908};
1909} // end namespace llvm
1910
1911/// Helper struct to manage generating runtime checks for vectorization.
1912///
1913 /// The runtime checks are created up-front in temporary blocks, un-linked from
1914 /// the existing IR, to allow better estimation of their cost. After deciding to
1915/// vectorize, the checks are moved back. If deciding not to vectorize, the
1916/// temporary blocks are completely removed.
1917class GeneratedRTChecks {
1918 /// Basic block which contains the generated SCEV checks, if any.
1919 BasicBlock *SCEVCheckBlock = nullptr;
1920
1921 /// The value representing the result of the generated SCEV checks. If it is
1922 /// nullptr, either no SCEV checks have been generated or they have been used.
1923 Value *SCEVCheckCond = nullptr;
1924
1925 /// Basic block which contains the generated memory runtime checks, if any.
1926 BasicBlock *MemCheckBlock = nullptr;
1927
1928 /// The value representing the result of the generated memory runtime checks.
1929 /// If it is nullptr, either no memory runtime checks have been generated or
1930 /// they have been used.
1931 Instruction *MemRuntimeCheckCond = nullptr;
1932
1933 DominatorTree *DT;
1934 LoopInfo *LI;
1935
1936 SCEVExpander SCEVExp;
1937 SCEVExpander MemCheckExp;
1938
1939public:
1940 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1941 const DataLayout &DL)
1942 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1943 MemCheckExp(SE, DL, "scev.check") {}
1944
1945 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1946 /// accurately estimate the cost of the runtime checks. The blocks are
1947 /// un-linked from the IR and are added back during vector code generation. If
1948 /// there is no vector code generation, the check blocks are removed
1949 /// completely.
1950 void Create(Loop *L, const LoopAccessInfo &LAI,
1951 const SCEVUnionPredicate &UnionPred) {
1952
1953 BasicBlock *LoopHeader = L->getHeader();
1954 BasicBlock *Preheader = L->getLoopPreheader();
1955
1956 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1957 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1958 // may be used by SCEVExpander. The blocks will be un-linked from their
1959 // predecessors and removed from LI & DT at the end of the function.
1960 if (!UnionPred.isAlwaysTrue()) {
1961 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1962 nullptr, "vector.scevcheck");
1963
1964 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1965 &UnionPred, SCEVCheckBlock->getTerminator());
1966 }
1967
1968 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1969 if (RtPtrChecking.Need) {
1970 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1971 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1972 "vector.memcheck");
1973
1974 std::tie(std::ignore, MemRuntimeCheckCond) =
1975 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1976 RtPtrChecking.getChecks(), MemCheckExp);
1977 assert(MemRuntimeCheckCond &&
1978 "no RT checks generated although RtPtrChecking "
1979 "claimed checks are required");
1980 }
1981
1982 if (!MemCheckBlock && !SCEVCheckBlock)
1983 return;
1984
1985 // Unhook the temporary block with the checks, update various places
1986 // accordingly.
1987 if (SCEVCheckBlock)
1988 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1989 if (MemCheckBlock)
1990 MemCheckBlock->replaceAllUsesWith(Preheader);
1991
1992 if (SCEVCheckBlock) {
1993 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1994 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1995 Preheader->getTerminator()->eraseFromParent();
1996 }
1997 if (MemCheckBlock) {
1998 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1999 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2000 Preheader->getTerminator()->eraseFromParent();
2001 }
2002
2003 DT->changeImmediateDominator(LoopHeader, Preheader);
2004 if (MemCheckBlock) {
2005 DT->eraseNode(MemCheckBlock);
2006 LI->removeBlock(MemCheckBlock);
2007 }
2008 if (SCEVCheckBlock) {
2009 DT->eraseNode(SCEVCheckBlock);
2010 LI->removeBlock(SCEVCheckBlock);
2011 }
2012 }
2013
2014 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2015 /// unused.
2016 ~GeneratedRTChecks() {
2017 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2018 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2019 if (!SCEVCheckCond)
2020 SCEVCleaner.markResultUsed();
2021
2022 if (!MemRuntimeCheckCond)
2023 MemCheckCleaner.markResultUsed();
2024
2025 if (MemRuntimeCheckCond) {
2026 auto &SE = *MemCheckExp.getSE();
2027 // Memory runtime check generation creates compares that use expanded
2028 // values. Remove them before running the SCEVExpanderCleaners.
2029 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2030 if (MemCheckExp.isInsertedInstruction(&I))
2031 continue;
2032 SE.forgetValue(&I);
2033 SE.eraseValueFromMap(&I);
2034 I.eraseFromParent();
2035 }
2036 }
2037 MemCheckCleaner.cleanup();
2038 SCEVCleaner.cleanup();
2039
2040 if (SCEVCheckCond)
2041 SCEVCheckBlock->eraseFromParent();
2042 if (MemRuntimeCheckCond)
2043 MemCheckBlock->eraseFromParent();
2044 }
2045
2046 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2047 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2048 /// depending on the generated condition.
2049 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2050 BasicBlock *LoopVectorPreHeader,
2051 BasicBlock *LoopExitBlock) {
2052 if (!SCEVCheckCond)
2053 return nullptr;
2054 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2055 if (C->isZero())
2056 return nullptr;
2057
2058 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2059
2060 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2061 // Create new preheader for vector loop.
2062 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2063 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2064
2065 SCEVCheckBlock->getTerminator()->eraseFromParent();
2066 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2067 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2068 SCEVCheckBlock);
2069
2070 DT->addNewBlock(SCEVCheckBlock, Pred);
2071 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2072
2073 ReplaceInstWithInst(
2074 SCEVCheckBlock->getTerminator(),
2075 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2076 // Mark the check as used, to prevent it from being removed during cleanup.
2077 SCEVCheckCond = nullptr;
2078 return SCEVCheckBlock;
2079 }
2080
2081 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2082 /// the branches to branch to the vector preheader or \p Bypass, depending on
2083 /// the generated condition.
2084 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2085 BasicBlock *LoopVectorPreHeader) {
2086 // Check if we generated code that checks in runtime if arrays overlap.
2087 if (!MemRuntimeCheckCond)
2088 return nullptr;
2089
2090 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2091 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2092 MemCheckBlock);
2093
2094 DT->addNewBlock(MemCheckBlock, Pred);
2095 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2096 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2097
2098 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2099 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2100
2101 ReplaceInstWithInst(
2102 MemCheckBlock->getTerminator(),
2103 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2104 MemCheckBlock->getTerminator()->setDebugLoc(
2105 Pred->getTerminator()->getDebugLoc());
2106
2107 // Mark the check as used, to prevent it from being removed during cleanup.
2108 MemRuntimeCheckCond = nullptr;
2109 return MemCheckBlock;
2110 }
2111};
2112
2113// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2114// vectorization. The loop needs to be annotated with #pragma omp simd
2115 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2116// vector length information is not provided, vectorization is not considered
2117// explicit. Interleave hints are not allowed either. These limitations will be
2118// relaxed in the future.
2119 // Please note that we are currently forced to abuse the pragma 'clang
2120// vectorize' semantics. This pragma provides *auto-vectorization hints*
2121// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2122// provides *explicit vectorization hints* (LV can bypass legal checks and
2123// assume that vectorization is legal). However, both hints are implemented
2124// using the same metadata (llvm.loop.vectorize, processed by
2125// LoopVectorizeHints). This will be fixed in the future when the native IR
2126// representation for pragma 'omp simd' is introduced.
2127static bool isExplicitVecOuterLoop(Loop *OuterLp,
2128 OptimizationRemarkEmitter *ORE) {
2129 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2130 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2131
2132 // Only outer loops with an explicit vectorization hint are supported.
2133 // Unannotated outer loops are ignored.
2134 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2135 return false;
2136
2137 Function *Fn = OuterLp->getHeader()->getParent();
2138 if (!Hints.allowVectorization(Fn, OuterLp,
2139 true /*VectorizeOnlyWhenForced*/)) {
2140 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2141 return false;
2142 }
2143
2144 if (Hints.getInterleave() > 1) {
2145 // TODO: Interleave support is future work.
2146 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2147 "outer loops.\n");
2148 Hints.emitRemarkWithHints();
2149 return false;
2150 }
2151
2152 return true;
2153}
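// A source-level illustration (assumed user code, not from this file): an
// outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i][j];
// carries the explicit width hint described above; an interleave hint greater
// than one would cause the outer loop to be rejected by this check.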
2154
2155static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2156 OptimizationRemarkEmitter *ORE,
2157 SmallVectorImpl<Loop *> &V) {
2158 // Collect inner loops and outer loops without irreducible control flow. For
2159 // now, only collect outer loops that have explicit vectorization hints. If we
2160 // are stress testing the VPlan H-CFG construction, we collect the outermost
2161 // loop of every loop nest.
2162 if (L.isInnermost() || VPlanBuildStressTest ||
2163 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2164 LoopBlocksRPO RPOT(&L);
2165 RPOT.perform(LI);
2166 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2167 V.push_back(&L);
2168 // TODO: Collect inner loops inside marked outer loops in case
2169 // vectorization fails for the outer loop. Do not invoke
2170 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2171 // already known to be reducible. We can use an inherited attribute for
2172 // that.
2173 return;
2174 }
2175 }
2176 for (Loop *InnerL : L)
2177 collectSupportedLoops(*InnerL, LI, ORE, V);
2178}
2179
2180namespace {
2181
2182/// The LoopVectorize Pass.
2183struct LoopVectorize : public FunctionPass {
2184 /// Pass identification, replacement for typeid
2185 static char ID;
2186
2187 LoopVectorizePass Impl;
2188
2189 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2190 bool VectorizeOnlyWhenForced = false)
2191 : FunctionPass(ID),
2192 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2193 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2194 }
2195
2196 bool runOnFunction(Function &F) override {
2197 if (skipFunction(F))
2198 return false;
2199
2200 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2201 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2202 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2203 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2204 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2205 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2206 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2207 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2208 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2209 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2210 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2211 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2212 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2213
2214 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2215 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2216
2217 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2218 GetLAA, *ORE, PSI).MadeAnyChange;
2219 }
2220
2221 void getAnalysisUsage(AnalysisUsage &AU) const override {
2222 AU.addRequired<AssumptionCacheTracker>();
2223 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2224 AU.addRequired<DominatorTreeWrapperPass>();
2225 AU.addRequired<LoopInfoWrapperPass>();
2226 AU.addRequired<ScalarEvolutionWrapperPass>();
2227 AU.addRequired<TargetTransformInfoWrapperPass>();
2228 AU.addRequired<AAResultsWrapperPass>();
2229 AU.addRequired<LoopAccessLegacyAnalysis>();
2230 AU.addRequired<DemandedBitsWrapperPass>();
2231 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2232 AU.addRequired<InjectTLIMappingsLegacy>();
2233
2234 // We currently do not preserve loopinfo/dominator analyses with outer loop
2235 // vectorization. Until this is addressed, mark these analyses as preserved
2236 // only for non-VPlan-native path.
2237 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2238 if (!EnableVPlanNativePath) {
2239 AU.addPreserved<LoopInfoWrapperPass>();
2240 AU.addPreserved<DominatorTreeWrapperPass>();
2241 }
2242
2243 AU.addPreserved<BasicAAWrapperPass>();
2244 AU.addPreserved<GlobalsAAWrapperPass>();
2245 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2246 }
2247};
2248
2249} // end anonymous namespace
2250
2251//===----------------------------------------------------------------------===//
2252// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2253// LoopVectorizationCostModel and LoopVectorizationPlanner.
2254//===----------------------------------------------------------------------===//
2255
2256Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2257  // We need to place the broadcast of invariant variables outside the loop,
2258  // but only if it is proven safe to do so. Otherwise, the broadcast is
2259  // emitted inside the vector loop body.
2260 Instruction *Instr = dyn_cast<Instruction>(V);
2261 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2262 (!Instr ||
2263 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2264 // Place the code for broadcasting invariant variables in the new preheader.
2265 IRBuilder<>::InsertPointGuard Guard(Builder);
2266 if (SafeToHoist)
2267 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2268
2269 // Broadcast the scalar into all locations in the vector.
2270 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2271
2272 return Shuf;
2273}
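Illustrative note (not part of the listed source): the broadcast emitted above is simply a splat of one loop-invariant scalar into every lane of a VF-wide vector, hoisted into the vector preheader when the dominance check allows it. A minimal standalone C++ sketch of that semantics, modelling the vector with a plain array instead of the IRBuilder API (all names here are hypothetical):

// Sketch only: what a VF-lane broadcast of a loop-invariant scalar computes.
#include <array>
#include <cstdio>

template <unsigned VF>
std::array<int, VF> broadcast(int Scalar) {
  std::array<int, VF> Lanes{};
  for (unsigned I = 0; I < VF; ++I)
    Lanes[I] = Scalar;            // every lane holds the same invariant value
  return Lanes;
}

int main() {
  auto V = broadcast<4>(7);       // conceptually emitted once, in the preheader
  for (int L : V)
    std::printf("%d ", L);        // prints: 7 7 7 7
}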
2274
2275void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2276 const InductionDescriptor &II, Value *Step, Value *Start,
2277 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2278 VPTransformState &State) {
2279   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2280          "Expected either an induction phi-node or a truncate of it!");
2281
2282 // Construct the initial value of the vector IV in the vector loop preheader
2283 auto CurrIP = Builder.saveIP();
2284 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2285 if (isa<TruncInst>(EntryVal)) {
2286     assert(Start->getType()->isIntegerTy() &&
2287            "Truncation requires an integer type");
2288 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2289 Step = Builder.CreateTrunc(Step, TruncType);
2290 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2291 }
2292 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2293 Value *SteppedStart =
2294 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2295
2296 // We create vector phi nodes for both integer and floating-point induction
2297 // variables. Here, we determine the kind of arithmetic we will perform.
2298 Instruction::BinaryOps AddOp;
2299 Instruction::BinaryOps MulOp;
2300 if (Step->getType()->isIntegerTy()) {
2301 AddOp = Instruction::Add;
2302 MulOp = Instruction::Mul;
2303 } else {
2304 AddOp = II.getInductionOpcode();
2305 MulOp = Instruction::FMul;
2306 }
2307
2308 // Multiply the vectorization factor by the step using integer or
2309 // floating-point arithmetic as appropriate.
2310 Type *StepType = Step->getType();
2311 if (Step->getType()->isFloatingPointTy())
2312 StepType = IntegerType::get(StepType->getContext(),
2313 StepType->getScalarSizeInBits());
2314 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2315 if (Step->getType()->isFloatingPointTy())
2316 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2317 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2318
2319 // Create a vector splat to use in the induction update.
2320 //
2321 // FIXME: If the step is non-constant, we create the vector splat with
2322 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2323 // handle a constant vector splat.
2324 Value *SplatVF = isa<Constant>(Mul)
2325 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2326 : Builder.CreateVectorSplat(VF, Mul);
2327 Builder.restoreIP(CurrIP);
2328
2329 // We may need to add the step a number of times, depending on the unroll
2330 // factor. The last of those goes into the PHI.
2331 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2332 &*LoopVectorBody->getFirstInsertionPt());
2333 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2334 Instruction *LastInduction = VecInd;
2335 for (unsigned Part = 0; Part < UF; ++Part) {
2336 State.set(Def, LastInduction, Part);
2337
2338 if (isa<TruncInst>(EntryVal))
2339 addMetadata(LastInduction, EntryVal);
2340 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2341 State, Part);
2342
2343 LastInduction = cast<Instruction>(
2344 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2345 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2346 }
2347
2348 // Move the last step to the end of the latch block. This ensures consistent
2349 // placement of all induction updates.
2350 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2351 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2352 auto *ICmp = cast<Instruction>(Br->getCondition());
2353 LastInduction->moveBefore(ICmp);
2354 LastInduction->setName("vec.ind.next");
2355
2356 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2357 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2358}
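Worked example, added for clarity and assuming (hypothetically) VF = 4, UF = 2, Start = 0 and Step = 1: the routine above produces a first part of <0,1,2,3>, a second part of <4,5,6,7> obtained by adding SplatVF = <4,4,4,4>, and a last step.add of VF * UF * Step = 8 per vector iteration feeding the phi. The sketch below models those values in plain C++; it is not the vectorizer's own code:

// Sketch of the widened integer IV for VF = 4, UF = 2, Start = 0, Step = 1.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  const int Start = 0, Step = 1;

  std::array<int, VF> VecInd;                     // vec.ind for unroll part 0
  for (unsigned L = 0; L < VF; ++L)
    VecInd[L] = Start + int(L) * Step;            // <0,1,2,3>

  for (unsigned Iter = 0; Iter < 2; ++Iter) {     // two vector iterations
    std::array<int, VF> Part = VecInd;
    for (unsigned P = 0; P < UF; ++P) {
      for (int V : Part) std::printf("%d ", V);
      std::printf("| ");
      for (auto &V : Part) V += int(VF) * Step;   // step.add: add SplatVF
    }
    std::printf("\n");
    VecInd = Part;                                // vec.ind.next feeds the phi
  }
  // prints: 0 1 2 3 | 4 5 6 7 |
  //         8 9 10 11 | 12 13 14 15 |
}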
2359
2360bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2361 return Cost->isScalarAfterVectorization(I, VF) ||
2362 Cost->isProfitableToScalarize(I, VF);
2363}
2364
2365bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2366 if (shouldScalarizeInstruction(IV))
2367 return true;
2368 auto isScalarInst = [&](User *U) -> bool {
2369 auto *I = cast<Instruction>(U);
2370 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2371 };
2372 return llvm::any_of(IV->users(), isScalarInst);
2373}
2374
2375void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2376 const InductionDescriptor &ID, const Instruction *EntryVal,
2377 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2378 unsigned Part, unsigned Lane) {
2379   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2380          "Expected either an induction phi-node or a truncate of it!");
2381
2382  // This induction variable is not the phi from the original loop but the
2383  // newly-created IV, based on the proof that the casted phi is equal to the
2384  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
2385  // reuses the same InductionDescriptor as the original IV, but we don't have
2386  // to do any recording in this case - that is done when the original IV is
2387  // processed.
2388 if (isa<TruncInst>(EntryVal))
2389 return;
2390
2391 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2392 if (Casts.empty())
2393 return;
2394  // Only the first Cast instruction in the Casts vector is of interest.
2395  // The rest of the Casts (if any) have no uses outside the
2396  // induction update chain itself.
2397  if (Lane < UINT_MAX)
2398 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2399 else
2400 State.set(CastDef, VectorLoopVal, Part);
2401}
2402
2403void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2404 TruncInst *Trunc, VPValue *Def,
2405 VPValue *CastDef,
2406 VPTransformState &State) {
2407   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2408          "Primary induction variable must have an integer type");
2409
2410 auto II = Legal->getInductionVars().find(IV);
2411   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2412
2413 auto ID = II->second;
2414   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2415
2416 // The value from the original loop to which we are mapping the new induction
2417 // variable.
2418 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2419
2420 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2421
2422  // Generate code for the induction step. Note that induction steps are
2423  // required to be loop-invariant.
2424 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2425     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2426            "Induction step should be loop invariant");
2427 if (PSE.getSE()->isSCEVable(IV->getType())) {
2428 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2429 return Exp.expandCodeFor(Step, Step->getType(),
2430 LoopVectorPreHeader->getTerminator());
2431 }
2432 return cast<SCEVUnknown>(Step)->getValue();
2433 };
2434
2435 // The scalar value to broadcast. This is derived from the canonical
2436 // induction variable. If a truncation type is given, truncate the canonical
2437 // induction variable and step. Otherwise, derive these values from the
2438 // induction descriptor.
2439 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2440 Value *ScalarIV = Induction;
2441 if (IV != OldInduction) {
2442 ScalarIV = IV->getType()->isIntegerTy()
2443 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2444 : Builder.CreateCast(Instruction::SIToFP, Induction,
2445 IV->getType());
2446 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2447 ScalarIV->setName("offset.idx");
2448 }
2449 if (Trunc) {
2450 auto *TruncType = cast<IntegerType>(Trunc->getType());
2451       assert(Step->getType()->isIntegerTy() &&
2452              "Truncation requires an integer step");
2453 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2454 Step = Builder.CreateTrunc(Step, TruncType);
2455 }
2456 return ScalarIV;
2457 };
2458
2459 // Create the vector values from the scalar IV, in the absence of creating a
2460 // vector IV.
2461 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2462 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2463 for (unsigned Part = 0; Part < UF; ++Part) {
2464       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2465 Value *EntryPart =
2466 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2467 ID.getInductionOpcode());
2468 State.set(Def, EntryPart, Part);
2469 if (Trunc)
2470 addMetadata(EntryPart, Trunc);
2471 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2472 State, Part);
2473 }
2474 };
2475
2476 // Fast-math-flags propagate from the original induction instruction.
2477 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2478 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2479 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2480
2481 // Now do the actual transformations, and start with creating the step value.
2482 Value *Step = CreateStepValue(ID.getStep());
2483 if (VF.isZero() || VF.isScalar()) {
2484 Value *ScalarIV = CreateScalarIV(Step);
2485 CreateSplatIV(ScalarIV, Step);
2486 return;
2487 }
2488
2489 // Determine if we want a scalar version of the induction variable. This is
2490 // true if the induction variable itself is not widened, or if it has at
2491 // least one user in the loop that is not widened.
2492 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2493 if (!NeedsScalarIV) {
2494 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2495 State);
2496 return;
2497 }
2498
2499 // Try to create a new independent vector induction variable. If we can't
2500 // create the phi node, we will splat the scalar induction variable in each
2501 // loop iteration.
2502 if (!shouldScalarizeInstruction(EntryVal)) {
2503 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2504 State);
2505 Value *ScalarIV = CreateScalarIV(Step);
2506 // Create scalar steps that can be used by instructions we will later
2507 // scalarize. Note that the addition of the scalar steps will not increase
2508 // the number of instructions in the loop in the common case prior to
2509 // InstCombine. We will be trading one vector extract for each scalar step.
2510 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2511 return;
2512 }
2513
2514 // All IV users are scalar instructions, so only emit a scalar IV, not a
2515 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2516 // predicate used by the masked loads/stores.
2517 Value *ScalarIV = CreateScalarIV(Step);
2518 if (!Cost->isScalarEpilogueAllowed())
2519 CreateSplatIV(ScalarIV, Step);
2520 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2521}
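The ordering of the cases above can be summarized: a scalar VF only splats the scalar IV, otherwise a vector IV is built unless the IV itself must be scalarized, and scalar steps are added whenever some in-loop user stays scalar. A compact decision sketch of that flow (plain C++; the predicate names are hypothetical, not the pass's API):

// Sketch of the widenIntOrFpInduction case ordering; names are illustrative.
#include <cstdio>

enum class IVPlan { SplatOnly, VectorIV, VectorIVPlusScalarSteps, ScalarStepsOnly };

IVPlan chooseIVPlan(bool VFIsScalar, bool NeedsScalarIV, bool MustScalarizeIV) {
  if (VFIsScalar)
    return IVPlan::SplatOnly;                 // VF == 1: just splat the scalar IV
  if (!NeedsScalarIV)
    return IVPlan::VectorIV;                  // no scalar users: vector phi only
  if (!MustScalarizeIV)
    return IVPlan::VectorIVPlusScalarSteps;   // vector IV plus scalar steps
  return IVPlan::ScalarStepsOnly;             // all users scalar (splat if tail-folded)
}

int main() {
  std::printf("%d\n", int(chooseIVPlan(false, true, false)));  // prints 2
}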
2522
2523Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2524 Instruction::BinaryOps BinOp) {
2525 // Create and check the types.
2526 auto *ValVTy = cast<VectorType>(Val->getType());
2527 ElementCount VLen = ValVTy->getElementCount();
2528
2529 Type *STy = Val->getType()->getScalarType();
2530   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2531          "Induction Step must be an integer or FP");
2532   assert(Step->getType() == STy && "Step has wrong type");
2533
2534 SmallVector<Constant *, 8> Indices;
2535
2536 // Create a vector of consecutive numbers from zero to VF.
2537 VectorType *InitVecValVTy = ValVTy;
2538 Type *InitVecValSTy = STy;
2539 if (STy->isFloatingPointTy()) {
2540 InitVecValSTy =
2541 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2542 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2543 }
2544 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2545
2546 // Add on StartIdx
2547 Value *StartIdxSplat = Builder.CreateVectorSplat(
2548 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2549 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2550
2551 if (STy->isIntegerTy()) {
2552 Step = Builder.CreateVectorSplat(VLen, Step);
2553     assert(Step->getType() == Val->getType() && "Invalid step vec");
2554 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2555 // which can be found from the original scalar operations.
2556 Step = Builder.CreateMul(InitVec, Step);
2557 return Builder.CreateAdd(Val, Step, "induction");
2558 }
2559
2560 // Floating point induction.
2561   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2562          "Binary Opcode should be specified for FP induction");
2563 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2564 Step = Builder.CreateVectorSplat(VLen, Step);
2565 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2566 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2567}
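To make the arithmetic above concrete: on the integer path the result is, lane by lane, Val + (StartIdx + lane) * Step. A small sketch of that formula in standalone C++ (hypothetical helper name, plain arrays instead of IR vectors):

// Sketch of getStepVector's integer path: Val + (StartIdx + lane) * Step.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t VLen>
std::array<int, VLen> stepVector(const std::array<int, VLen> &Val, int StartIdx,
                                 int Step) {
  std::array<int, VLen> Out{};
  for (std::size_t L = 0; L < VLen; ++L)
    Out[L] = Val[L] + (StartIdx + int(L)) * Step;   // per-lane induction value
  return Out;
}

int main() {
  std::array<int, 4> Splat = {10, 10, 10, 10};      // splat of the start value
  auto R = stepVector(Splat, /*StartIdx=*/0, /*Step=*/3);
  for (int V : R)
    std::printf("%d ", V);                          // prints: 10 13 16 19
}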
2568
2569void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2570 Instruction *EntryVal,
2571 const InductionDescriptor &ID,
2572 VPValue *Def, VPValue *CastDef,
2573 VPTransformState &State) {
2574 // We shouldn't have to build scalar steps if we aren't vectorizing.
2575   assert(VF.isVector() && "VF should be greater than one");
2576 // Get the value type and ensure it and the step have the same integer type.
2577 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2578   assert(ScalarIVTy == Step->getType() &&
2579          "Val and Step should have the same type");
2580
2581 // We build scalar steps for both integer and floating-point induction
2582 // variables. Here, we determine the kind of arithmetic we will perform.
2583 Instruction::BinaryOps AddOp;
2584 Instruction::BinaryOps MulOp;
2585 if (ScalarIVTy->isIntegerTy()) {
2586 AddOp = Instruction::Add;
2587 MulOp = Instruction::Mul;
2588 } else {
2589 AddOp = ID.getInductionOpcode();
2590 MulOp = Instruction::FMul;
2591 }
2592
2593 // Determine the number of scalars we need to generate for each unroll
2594 // iteration. If EntryVal is uniform, we only need to generate the first
2595 // lane. Otherwise, we generate all VF values.
2596 bool IsUniform =
2597 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2598 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2599 // Compute the scalar steps and save the results in State.
2600 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2601 ScalarIVTy->getScalarSizeInBits());
2602 Type *VecIVTy = nullptr;
2603 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2604 if (!IsUniform && VF.isScalable()) {
2605 VecIVTy = VectorType::get(ScalarIVTy, VF);
2606 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2607 SplatStep = Builder.CreateVectorSplat(VF, Step);
2608 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2609 }
2610
2611 for (unsigned Part = 0; Part < UF; ++Part) {
2612 Value *StartIdx0 =
2613 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2614
2615 if (!IsUniform && VF.isScalable()) {
2616 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2617 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2618 if (ScalarIVTy->isFloatingPointTy())
2619 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2620 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2621 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2622 State.set(Def, Add, Part);
2623 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2624 Part);
2625 // It's useful to record the lane values too for the known minimum number
2626 // of elements so we do those below. This improves the code quality when
2627 // trying to extract the first element, for example.
2628 }
2629
2630 if (ScalarIVTy->isFloatingPointTy())
2631 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2632
2633 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2634 Value *StartIdx = Builder.CreateBinOp(
2635 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2636 // The step returned by `createStepForVF` is a runtime-evaluated value
2637 // when VF is scalable. Otherwise, it should be folded into a Constant.
2638       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2639              "Expected StartIdx to be folded to a constant when VF is not "
2640              "scalable");
2641 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2642 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2643 State.set(Def, Add, VPIteration(Part, Lane));
2644 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2645 Part, Lane);
2646 }
2647 }
2648}
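For the fixed-width case, each generated scalar step is ScalarIV + (Part * VF + Lane) * Step, and a uniform value only needs Lane 0 of every part. A brief sketch under those assumptions (plain C++, not the actual VPTransformState bookkeeping):

// Sketch of the fixed-width scalar steps: ScalarIV + (Part * VF + Lane) * Step.
#include <cstdio>

int main() {
  const int ScalarIV = 100, Step = 2;
  const unsigned VF = 4, UF = 2;
  const bool IsUniform = false;             // uniform values only need lane 0

  for (unsigned Part = 0; Part < UF; ++Part) {
    unsigned Lanes = IsUniform ? 1 : VF;
    for (unsigned Lane = 0; Lane < Lanes; ++Lane)
      std::printf("part %u lane %u -> %d\n", Part, Lane,
                  ScalarIV + int(Part * VF + Lane) * Step);
  }
  // part 0 lanes: 100 102 104 106; part 1 lanes: 108 110 112 114
}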
2649
2650void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2651 const VPIteration &Instance,
2652 VPTransformState &State) {
2653 Value *ScalarInst = State.get(Def, Instance);
2654 Value *VectorValue = State.get(Def, Instance.Part);
2655 VectorValue = Builder.CreateInsertElement(
2656 VectorValue, ScalarInst,
2657 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2658 State.set(Def, VectorValue, Instance.Part);
2659}
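Packing a scalar into its vector value is a single insertelement at the instance's lane; the sketch below models it with a plain array (illustrative only, hypothetical values):

// Sketch of packScalarIntoVectorValue: insert one scalar into a chosen lane.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> VectorValue = {0, 0, 0, 0};  // current per-part vector value
  int ScalarInst = 42;                            // scalar produced for this instance
  unsigned Lane = 2;                              // Instance.Lane as a runtime index

  VectorValue[Lane] = ScalarInst;                 // insertelement %vec, %scalar, Lane
  for (int V : VectorValue)
    std::printf("%d ", V);                        // prints: 0 0 42 0
}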
2660
2661Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2662   assert(Vec->getType()->isVectorTy() && "Invalid type");
2663 return Builder.CreateVectorReverse(Vec, "reverse");
2664}
2665
2666// Return whether we allow using masked interleave-groups (for dealing with
2667// strided loads/stores that reside in predicated blocks, or for dealing
2668// with gaps).
2669static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2670 // If an override option has been passed in for interleaved accesses, use it.
2671 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2672 return EnableMaskedInterleavedMemAccesses;
2673
2674 return TTI.enableMaskedInterleavedAccessVectorization();
2675}
2676
2677// Try to vectorize the interleave group that \p Instr belongs to.
2678//
2679// E.g. Translate following interleaved load group (factor = 3):
2680// for (i = 0; i < N; i+=3) {
2681// R = Pic[i]; // Member of index 0
2682// G = Pic[i+1]; // Member of index 1
2683// B = Pic[i+2]; // Member of index 2
2684// ... // do something to R, G, B
2685// }
2686// To:
2687// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2688// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2689// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2690// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2691//
2692// Or translate following interleaved store group (factor = 3):
2693// for (i = 0; i < N; i+=3) {
2694// ... do something to R, G, B
2695// Pic[i] = R; // Member of index 0
2696// Pic[i+1] = G; // Member of index 1
2697// Pic[i+2] = B; // Member of index 2
2698// }
2699// To:
2700// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2701// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2702// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2703// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2704// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2705void InnerLoopVectorizer::vectorizeInterleaveGroup(
2706 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2707 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2708 VPValue *BlockInMask) {
2709 Instruction *Instr = Group->getInsertPos();
2710 const DataLayout &DL = Instr->getModule()->getDataLayout();
2711
2712 // Prepare for the vector type of the interleaved load/store.
2713 Type *ScalarTy = getLoadStoreType(Instr);
2714 unsigned InterleaveFactor = Group->getFactor();
2715   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2716 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2717
2718 // Prepare for the new pointers.
2719 SmallVector<Value *, 2> AddrParts;
2720 unsigned Index = Group->getIndex(Instr);
2721
2722 // TODO: extend the masked interleaved-group support to reversed access.
2723   assert((!BlockInMask || !Group->isReverse()) &&
2724          "Reversed masked interleave-group not supported.");
2725
2726 // If the group is reverse, adjust the index to refer to the last vector lane
2727 // instead of the first. We adjust the index from the first vector lane,
2728 // rather than directly getting the pointer for lane VF - 1, because the
2729 // pointer operand of the interleaved access is supposed to be uniform. For
2730 // uniform instructions, we're only required to generate a value for the
2731 // first vector lane in each unroll iteration.
2732 if (Group->isReverse())
2733 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2734
2735 for (unsigned Part = 0; Part < UF; Part++) {
2736 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2737 setDebugLocFromInst(AddrPart);
2738
2739     // Note that the current instruction could be at any member index. We need
2740     // to adjust the address to the member of index 0.
2741 //
2742 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2743 // b = A[i]; // Member of index 0
2744     //      The current pointer points to A[i+1]; adjust it to A[i].
2745 //
2746 // E.g. A[i+1] = a; // Member of index 1
2747 // A[i] = b; // Member of index 0
2748 // A[i+2] = c; // Member of index 2 (Current instruction)
2749     //      The current pointer points to A[i+2]; adjust it to A[i].
2750
2751 bool InBounds = false;
2752 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2753 InBounds = gep->isInBounds();
2754 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2755 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2756
2757 // Cast to the vector pointer type.
2758 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2759 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2760 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2761 }
2762
2763 setDebugLocFromInst(Instr);
2764 Value *PoisonVec = PoisonValue::get(VecTy);
2765
2766 Value *MaskForGaps = nullptr;
2767 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2768 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2769     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2770 }
2771
2772 // Vectorize the interleaved load group.
2773 if (isa<LoadInst>(Instr)) {
2774 // For each unroll part, create a wide load for the group.
2775 SmallVector<Value *, 2> NewLoads;
2776 for (unsigned Part = 0; Part < UF; Part++) {
2777 Instruction *NewLoad;
2778 if (BlockInMask || MaskForGaps) {
2779         assert(useMaskedInterleavedAccesses(*TTI) &&
2780                "masked interleaved groups are not allowed.");
2781 Value *GroupMask = MaskForGaps;
2782 if (BlockInMask) {
2783 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2784 Value *ShuffledMask = Builder.CreateShuffleVector(
2785 BlockInMaskPart,
2786 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2787 "interleaved.mask");
2788 GroupMask = MaskForGaps
2789 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2790 MaskForGaps)
2791 : ShuffledMask;
2792 }
2793 NewLoad =
2794 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2795 GroupMask, PoisonVec, "wide.masked.vec");
2796 }
2797 else
2798 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2799 Group->getAlign(), "wide.vec");
2800 Group->addMetadata(NewLoad);
2801 NewLoads.push_back(NewLoad);
2802 }
2803
2804 // For each member in the group, shuffle out the appropriate data from the
2805 // wide loads.
2806 unsigned J = 0;
2807 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2808 Instruction *Member = Group->getMember(I);
2809
2810 // Skip the gaps in the group.
2811 if (!Member)
2812 continue;
2813
2814 auto StrideMask =
2815 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2816 for (unsigned Part = 0; Part < UF; Part++) {
2817 Value *StridedVec = Builder.CreateShuffleVector(
2818 NewLoads[Part], StrideMask, "strided.vec");
2819
2820        // If this member has a different type, cast the result to the member's type.
2821 if (Member->getType() != ScalarTy) {
2822         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2823 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2824 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2825 }
2826
2827 if (Group->isReverse())
2828 StridedVec = reverseVector(StridedVec);
2829
2830 State.set(VPDefs[J], StridedVec, Part);
2831 }
2832 ++J;
2833 }
2834 return;
2835 }
2836
2837  // The sub-vector type for the current instruction.
2838 auto *SubVT = VectorType::get(ScalarTy, VF);
2839
2840 // Vectorize the interleaved store group.
2841 for (unsigned Part = 0; Part < UF; Part++) {
2842 // Collect the stored vector from each member.
2843 SmallVector<Value *, 4> StoredVecs;
2844 for (unsigned i = 0; i < InterleaveFactor; i++) {
2845 // Interleaved store group doesn't allow a gap, so each index has a member
2846       assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2847
2848 Value *StoredVec = State.get(StoredValues[i], Part);
2849
2850 if (Group->isReverse())
2851 StoredVec = reverseVector(StoredVec);
2852
2853      // If this member has a different type, cast it to the unified sub-vector type.
2854
2855 if (StoredVec->getType() != SubVT)
2856 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2857
2858 StoredVecs.push_back(StoredVec);
2859 }
2860
2861 // Concatenate all vectors into a wide vector.
2862 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2863
2864 // Interleave the elements in the wide vector.
2865 Value *IVec = Builder.CreateShuffleVector(
2866 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2867 "interleaved.vec");
2868
2869 Instruction *NewStoreInstr;
2870 if (BlockInMask) {
2871 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2872 Value *ShuffledMask = Builder.CreateShuffleVector(
2873 BlockInMaskPart,
2874 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2875 "interleaved.mask");
2876 NewStoreInstr = Builder.CreateMaskedStore(
2877 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2878 }
2879 else
2880 NewStoreInstr =
2881 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2882
2883 Group->addMetadata(NewStoreInstr);
2884 }
2885}
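The stride masks used above pick every InterleaveFactor-th element out of the wide vector. The following standalone sketch de-interleaves an RGB group (factor 3, VF 4) the same way, using plain loops in place of shufflevector; the names and values are illustrative only:

// Sketch of de-interleaving a factor-3 load group (R,G,B) for VF = 4.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned Factor = 3, VF = 4;
  // %wide.vec: four R,G,B tuples laid out contiguously.
  std::array<int, Factor * VF> WideVec = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};

  for (unsigned Member = 0; Member < Factor; ++Member) {
    std::array<int, VF> Strided{};                  // %strided.vec for this member
    for (unsigned L = 0; L < VF; ++L)
      Strided[L] = WideVec[Member + L * Factor];    // stride mask <M, M+3, M+6, M+9>
    std::printf("member %u:", Member);
    for (int V : Strided)
      std::printf(" %d", V);
    std::printf("\n");
  }
  // member 0: 1 4 7 10   member 1: 2 5 8 11   member 2: 3 6 9 12
}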
2886
2887void InnerLoopVectorizer::vectorizeMemoryInstruction(
2888 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2889 VPValue *StoredValue, VPValue *BlockInMask) {
2890 // Attempt to issue a wide load.
2891 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2892 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2893
2894   assert((LI || SI) && "Invalid Load/Store instruction");
2895   assert((!SI || StoredValue) && "No stored value provided for widened store");
2896   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2897
2898 LoopVectorizationCostModel::InstWidening Decision =
2899 Cost->getWideningDecision(Instr, VF);
2900   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2901           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2902           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2903          "CM decision is not to widen the memory instruction");
2904
2905 Type *ScalarDataTy = getLoadStoreType(Instr);
2906
2907 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2908 const Align Alignment = getLoadStoreAlignment(Instr);
2909
2910 // Determine if the pointer operand of the access is either consecutive or
2911 // reverse consecutive.
2912 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2913 bool ConsecutiveStride =
2914 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2915 bool CreateGatherScatter =
2916 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2917
2918 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2919 // gather/scatter. Otherwise Decision should have been to Scalarize.
2920   assert((ConsecutiveStride || CreateGatherScatter) &&
2921          "The instruction should be scalarized");
2922 (void)ConsecutiveStride;
2923
2924 VectorParts BlockInMaskParts(UF);
2925 bool isMaskRequired = BlockInMask;
2926 if (isMaskRequired)
2927 for (unsigned Part = 0; Part < UF; ++Part)
2928 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2929
2930 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2931 // Calculate the pointer for the specific unroll-part.
2932 GetElementPtrInst *PartPtr = nullptr;
2933
2934 bool InBounds = false;
2935 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2936 InBounds = gep->isInBounds();
2937 if (Reverse) {
2938 // If the address is consecutive but reversed, then the
2939 // wide store needs to start at the last vector element.
2940 // RunTimeVF = VScale * VF.getKnownMinValue()
2941      // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
2942 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2943 // NumElt = -Part * RunTimeVF
2944 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2945 // LastLane = 1 - RunTimeVF
2946 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2947 PartPtr =
2948 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2949 PartPtr->setIsInBounds(InBounds);
2950 PartPtr = cast<GetElementPtrInst>(
2951 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2952 PartPtr->setIsInBounds(InBounds);
2953 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2954 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2955 } else {
2956 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2957 PartPtr = cast<GetElementPtrInst>(
2958 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2959 PartPtr->setIsInBounds(InBounds);
2960 }
2961
2962 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2963 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2964 };
2965
2966 // Handle Stores:
2967 if (SI) {
2968 setDebugLocFromInst(SI);
2969
2970 for (unsigned Part = 0; Part < UF; ++Part) {
2971 Instruction *NewSI = nullptr;
2972 Value *StoredVal = State.get(StoredValue, Part);
2973 if (CreateGatherScatter) {
2974 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2975 Value *VectorGep = State.get(Addr, Part);
2976 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2977 MaskPart);
2978 } else {
2979 if (Reverse) {
2980 // If we store to reverse consecutive memory locations, then we need
2981 // to reverse the order of elements in the stored value.
2982 StoredVal = reverseVector(StoredVal);
2983 // We don't want to update the value in the map as it might be used in
2984 // another expression. So don't call resetVectorValue(StoredVal).
2985 }
2986 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2987 if (isMaskRequired)
2988 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2989 BlockInMaskParts[Part]);
2990 else
2991 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2992 }
2993 addMetadata(NewSI, SI);
2994 }
2995 return;
2996 }
2997
2998 // Handle loads.
2999   assert(LI && "Must have a load instruction");
3000 setDebugLocFromInst(LI);
3001 for (unsigned Part = 0; Part < UF; ++Part) {
3002 Value *NewLI;
3003 if (CreateGatherScatter) {
3004 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
3005 Value *VectorGep = State.get(Addr, Part);
3006 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
3007 nullptr, "wide.masked.gather");
3008 addMetadata(NewLI, LI);
3009 } else {
3010 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3011 if (isMaskRequired)
3012 NewLI = Builder.CreateMaskedLoad(
3013 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
3014 PoisonValue::get(DataTy), "wide.masked.load");
3015 else
3016 NewLI =
3017 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3018
3019 // Add metadata to the load, but setVectorValue to the reverse shuffle.
3020 addMetadata(NewLI, LI);
3021 if (Reverse)
3022 NewLI = reverseVector(NewLI);
3023 }
3024
3025 State.set(Def, NewLI, Part);
3026 }
3027}
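For a reverse-consecutive access, the per-part pointer computed by CreateVecPtr above is Ptr + (-Part * RunTimeVF) + (1 - RunTimeVF), so part 0 covers the VF elements ending at Ptr, part 1 the VF elements before those, and so on, with the loaded or stored lanes then reversed. A small fixed-width sketch of that index math (RunTimeVF = VF; values are hypothetical):

// Sketch of the reverse-consecutive pointer math for fixed-width VF.
#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  const int PtrIdx = 100;   // index the scalar loop would access this iteration

  for (int Part = 0; Part < UF; ++Part) {
    // Matches the listing: NumElt = -Part * RunTimeVF, LastLane = 1 - RunTimeVF.
    int Base = PtrIdx + (-Part * VF) + (1 - VF);
    std::printf("part %d covers [%d, %d], then reverses the lanes\n",
                Part, Base, Base + VF - 1);
  }
  // part 0 covers [97, 100]; part 1 covers [93, 96]
}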
3028
3029void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3030 VPUser &User,
3031 const VPIteration &Instance,
3032 bool IfPredicateInstr,
3033 VPTransformState &State) {
3034   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3035
3036 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3037 // the first lane and part.
3038 if (isa<NoAliasScopeDeclInst>(Instr))
3039 if (!Instance.isFirstIteration())
3040 return;
3041
3042 setDebugLocFromInst(Instr);
3043
3044 // Does this instruction return a value ?
3045 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3046
3047 Instruction *Cloned = Instr->clone();
3048 if (!IsVoidRetTy)
3049 Cloned->setName(Instr->getName() + ".cloned");
3050
3051 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3052 Builder.GetInsertPoint());
3053 // Replace the operands of the cloned instructions with their scalar
3054 // equivalents in the new loop.
3055 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3056 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3057 auto InputInstance = Instance;
3058 if (!Operand || !OrigLoop->contains(Operand) ||
3059 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3060 InputInstance.Lane = VPLane::getFirstLane();
3061 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3062 Cloned->setOperand(op, NewOp);
3063 }
3064 addNewMetadata(Cloned, Instr);
3065
3066 // Place the cloned scalar in the new loop.
3067 Builder.Insert(Cloned);
3068
3069 State.set(Def, Cloned, Instance);
3070
3071 // If we just cloned a new assumption, add it the assumption cache.
3072 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3073 AC->registerAssumption(II);
3074
3075 // End if-block.
3076 if (IfPredicateInstr)
3077 PredicatedInstructions.push_back(Cloned);
3078}
3079
3080PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3081 Value *End, Value *Step,
3082 Instruction *DL) {
3083 BasicBlock *Header = L->getHeader();
3084 BasicBlock *Latch = L->getLoopLatch();
3085  // As we're just creating this loop, it's possible no latch exists
3086  // yet. If so, use the header, as this will be a single-block loop.
3087 if (!Latch)
3088 Latch = Header;
3089
3090 IRBuilder<> B(&*Header->getFirstInsertionPt());
3091 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3092 setDebugLocFromInst(OldInst, &B);
3093 auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3094
3095 B.SetInsertPoint(Latch->getTerminator());
3096 setDebugLocFromInst(OldInst, &B);
3097
3098 // Create i+1 and fill the PHINode.
3099 //
3100 // If the tail is not folded, we know that End - Start >= Step (either
3101 // statically or through the minimum iteration checks). We also know that both
3102 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3103 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3104 // overflows and we can mark the induction increment as NUW.
3105 Value *Next = B.CreateAdd(Induction, Step, "index.next",
3106 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3107 Induction->addIncoming(Start, L->getLoopPreheader());
3108 Induction->addIncoming(Next, Latch);
3109 // Create the compare.
3110 Value *ICmp = B.CreateICmpEQ(Next, End);
3111 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3112
3113 // Now we have two terminators. Remove the old one from the block.
3114 Latch->getTerminator()->eraseFromParent();
3115
3116 return Induction;
3117}
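The control flow this helper builds is just a counted loop: index starts at Start, advances by Step (with no unsigned wrap when the tail is not folded), and exits once index.next equals End. A scalar C++ sketch of that skeleton with hypothetical values:

// Scalar sketch of the generated counted-loop skeleton.
#include <cstdio>

int main() {
  const unsigned Start = 0, End = 16, Step = 4;  // End - Start is a multiple of Step

  unsigned Index = Start;                        // the "index" phi
  do {
    std::printf("vector iteration at index %u\n", Index);
    unsigned Next = Index + Step;                // index.next (nuw when not tail-folded)
    if (Next == End)                             // icmp eq %index.next, %End
      break;
    Index = Next;
  } while (true);
  // runs for indices 0, 4, 8, 12
}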
3118
3119Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3120 if (TripCount)
3121 return TripCount;
3122
3123   assert(L && "Create Trip Count for null loop.");
3124 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3125 // Find the loop boundaries.
3126 ScalarEvolution *SE = PSE.getSE();
3127 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3128   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3129          "Invalid loop count");
3130
3131 Type *IdxTy = Legal->getWidestInductionType();
3132 assert(IdxTy && "No type for induction");
3133
3134 // The exit count might have the type of i64 while the phi is i32. This can
3135 // happen if we have an induction variable that is sign extended before the
3136 // compare. The only way that we get a backedge taken count is that the
3137 // induction variable was signed and as such will not overflow. In such a case
3138 // truncation is legal.
3139 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3140 IdxTy->getPrimitiveSizeInBits())
3141 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3142 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3143
3144 // Get the total trip count from the count by adding 1.
3145 const SCEV *ExitCount = SE->getAddExpr(
3146 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3147
3148 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3149
3150 // Expand the trip count and place the new instructions in the preheader.
3151 // Notice that the pre-header does not change, only the loop body.
3152 SCEVExpander Exp(*SE, DL, "induction");
3153
3154 // Count holds the overall loop count (N).
3155 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3156 L->getLoopPreheader()->getTerminator());
3157
3158 if (TripCount->getType()->isPointerTy())
3159 TripCount =
3160 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3161 L->getLoopPreheader()->getTerminator());
3162
3163 return TripCount;
3164}
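For intuition, the value computed here can be restated as a hedged arithmetic sketch (an illustrative helper, not an LLVM API): the trip count is the backedge-taken count plus one, evaluated in the widest induction type.

#include <cstdint>

// For "for (i = 0; i < n; ++i)" the backedge is taken n - 1 times, so the
// trip count is BTC + 1. The +1 may wrap when BTC is the maximum value; the
// minimum-iteration check emitted later sends that case to the scalar loop.
uint64_t tripCountFromBTC(uint64_t BackedgeTakenCount) {
  return BackedgeTakenCount + 1;
}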
3165
3166Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3167 if (VectorTripCount)
3168 return VectorTripCount;
3169
3170 Value *TC = getOrCreateTripCount(L);
3171 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3172
3173 Type *Ty = TC->getType();
3174 // This is where we can make the step a runtime constant.
3175 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3176
3177 // If the tail is to be folded by masking, round the number of iterations N
3178 // up to a multiple of Step instead of rounding down. This is done by first
3179 // adding Step-1 and then rounding down. Note that it's ok if this addition
3180 // overflows: the vector induction variable will eventually wrap to zero given
3181 // that it starts at zero and its Step is a power of two; the loop will then
3182 // exit, with the last early-exit vector comparison also producing all-true.
3183 if (Cost->foldTailByMasking()) {
3184 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3185        "VF*UF must be a power of 2 when folding tail by masking");
3186 assert(!VF.isScalable() &&
3187        "Tail folding not yet supported for scalable vectors");
3188 TC = Builder.CreateAdd(
3189 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3190 }
3191
3192 // Now we need to generate the expression for the part of the loop that the
3193 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3194 // iterations are not required for correctness, or N - Step, otherwise. Step
3195 // is equal to the vectorization factor (number of SIMD elements) times the
3196 // unroll factor (number of SIMD instructions).
3197 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3198
3199 // There are cases where we *must* run at least one iteration in the remainder
3200 // loop. See the cost model for when this can happen. If the step evenly
3201 // divides the trip count, we set the remainder to be equal to the step. If
3202 // the step does not evenly divide the trip count, no adjustment is necessary
3203 // since there will already be scalar iterations. Note that the minimum
3204 // iterations check ensures that N >= Step.
3205 if (Cost->requiresScalarEpilogue(VF)) {
3206 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3207 R = Builder.CreateSelect(IsZero, Step, R);
3208 }
3209
3210 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3211
3212 return VectorTripCount;
3213}
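The same computation as a small standalone helper, useful for sanity-checking the rounding logic (illustrative only; parameter names are assumptions, not LLVM's):

#include <cstdint>

// Step = VF * UF. Without tail folding, n.vec = N - (N % Step); when a scalar
// epilogue is required and Step divides N evenly, one full Step is left for
// the remainder. With tail folding, N is first rounded up to a multiple of Step.
uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool FoldTail,
                         bool RequiresScalarEpilogue) {
  if (FoldTail)
    N += Step - 1; // may wrap; see the comment above for why that is benign
  uint64_t R = N % Step;
  if (RequiresScalarEpilogue && R == 0)
    R = Step;      // force at least one remainder iteration
  return N - R;
}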
3214
3215Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3216 const DataLayout &DL) {
3217 // Verify that V is a vector type with same number of elements as DstVTy.
3218 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3219 unsigned VF = DstFVTy->getNumElements();
3220 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3221 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3222 Type *SrcElemTy = SrcVecTy->getElementType();
3223 Type *DstElemTy = DstFVTy->getElementType();
3224 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3225        "Vector elements must have same size");
3226
3227 // Do a direct cast if element types are castable.
3228 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3229 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3230 }
3231 // V cannot be directly casted to desired vector type.
3232 // May happen when V is a floating point vector but DstVTy is a vector of
3233 // pointers or vice-versa. Handle this using a two-step bitcast using an
3234 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3235 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3236        "Only one type should be a pointer type");
3237 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3238        "Only one type should be a floating point type");
3239 Type *IntTy =
3240 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3241 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3242 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3243 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3244}
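A scalar analogy of the two-step cast performed above (a sketch, not the LLVM API): when the element types cannot be bitcast directly, the value is routed through an integer of the same bit width, i.e. Ptr <-> Int <-> Float.

#include <cstdint>
#include <cstring>

// First hop of the two-step conversion: reinterpret the float's bits as a
// same-width integer. A second hop could then reinterpret that integer as a
// pointer-sized value, which is the intermediate-integer path described above.
uint32_t floatBitsToInt(float F) {
  static_assert(sizeof(float) == sizeof(uint32_t), "element sizes must match");
  uint32_t I;
  std::memcpy(&I, &F, sizeof(I));
  return I;
}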
3245
3246void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3247 BasicBlock *Bypass) {
3248 Value *Count = getOrCreateTripCount(L);
3249 // Reuse existing vector loop preheader for TC checks.
3250 // Note that new preheader block is generated for vector loop.
3251 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3252 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3253
3254 // Generate code to check if the loop's trip count is less than VF * UF, or
3255 // equal to it in case a scalar epilogue is required; this implies that the
3256 // vector trip count is zero. This check also covers the case where adding one
3257 // to the backedge-taken count overflowed leading to an incorrect trip count
3258 // of zero. In this case we will also jump to the scalar loop.
3259 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3260 : ICmpInst::ICMP_ULT;
3261
3262 // If tail is to be folded, vector loop takes care of all iterations.
3263 Value *CheckMinIters = Builder.getFalse();
3264 if (!Cost->foldTailByMasking()) {
3265 Value *Step =
3266 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3267 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3268 }
3269 // Create new preheader for vector loop.
3270 LoopVectorPreHeader =
3271 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3272 "vector.ph");
3273
3274 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3275                              DT->getNode(Bypass)->getIDom()) &&
3276        "TC check is expected to dominate Bypass");
3277
3278 // Update dominator for Bypass & LoopExit (if needed).
3279 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3280 if (!Cost->requiresScalarEpilogue(VF))
3281 // If there is an epilogue which must run, there's no edge from the
3282 // middle block to exit blocks and thus no need to update the immediate
3283 // dominator of the exit blocks.
3284 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3285
3286 ReplaceInstWithInst(
3287 TCCheckBlock->getTerminator(),
3288 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3289 LoopBypassBlocks.push_back(TCCheckBlock);
3290}
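The branch condition built above, restated as a small standalone predicate (names are illustrative, not LLVM's):

#include <cstdint>

// Take the bypass to the scalar loop when the trip count is below VF * UF,
// or equal to it when a scalar epilogue must run; with tail folding the
// vector loop handles every iteration and the check is always false.
bool bypassToScalarLoop(uint64_t TripCount, uint64_t VF, uint64_t UF,
                        bool RequiresScalarEpilogue, bool FoldTail) {
  if (FoldTail)
    return false;
  uint64_t Step = VF * UF;
  return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
}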
3291
3292BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3293
3294 BasicBlock *const SCEVCheckBlock =
3295 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3296 if (!SCEVCheckBlock)
3297 return nullptr;
3298
3299 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3300          (OptForSizeBasedOnProfile &&
3301           Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3302        "Cannot SCEV check stride or overflow when optimizing for size");
3303
3304
3305 // Update dominator only if this is the first RT check.
3306 if (LoopBypassBlocks.empty()) {
3307 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3308 if (!Cost->requiresScalarEpilogue(VF))
3309 // If there is an epilogue which must run, there's no edge from the
3310 // middle block to exit blocks and thus no need to update the immediate
3311 // dominator of the exit blocks.
3312 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3313 }
3314
3315 LoopBypassBlocks.push_back(SCEVCheckBlock);
3316 AddedSafetyChecks = true;
3317 return SCEVCheckBlock;
3318}
3319
3320BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3321 BasicBlock *Bypass) {
3322 // VPlan-native path does not do any analysis for runtime checks currently.
3323 if (EnableVPlanNativePath)
3324 return nullptr;
3325
3326 BasicBlock *const MemCheckBlock =
3327 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3328
3329 // Check if we generated code that checks at runtime whether arrays overlap. We put
3330 // the checks into a separate block to make the more common case of few
3331 // elements faster.
3332 if (!MemCheckBlock)
3333 return nullptr;
3334
3335 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3336 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3337        "Cannot emit memory checks when optimizing for size, unless forced "
3338        "to vectorize.");
3339 ORE->emit([&]() {
3340 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3341 L->getStartLoc(), L->getHeader())
3342 << "Code-size may be reduced by not forcing "
3343 "vectorization, or by source-code modifications "
3344 "eliminating the need for runtime checks "
3345 "(e.g., adding 'restrict').";
3346 });
3347 }
3348
3349 LoopBypassBlocks.push_back(MemCheckBlock);
3350
3351 AddedSafetyChecks = true;
3352
3353 // We currently don't use LoopVersioning for the actual loop cloning but we
3354 // still use it to add the noalias metadata.
3355 LVer = std::make_unique<LoopVersioning>(
3356 *Legal->getLAI(),
3357 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3358 DT, PSE.getSE());
3359 LVer->prepareNoAliasMetadata();
3360 return MemCheckBlock;
3361}
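For intuition, the shape of the runtime test emitted here can be sketched as a standalone range-overlap predicate (a simplification; the actual checks come from the runtime-pointer-checking machinery):

#include <cstdint>

// Two accessed byte ranges [AStart, AEnd) and [BStart, BEnd) are safe to
// vectorize independently iff they do not overlap.
bool rangesOverlap(uintptr_t AStart, uintptr_t AEnd,
                   uintptr_t BStart, uintptr_t BEnd) {
  return !(AEnd <= BStart || BEnd <= AStart);
}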
3362
3363Value *InnerLoopVectorizer::emitTransformedIndex(
3364 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3365 const InductionDescriptor &ID) const {
3366
3367 SCEVExpander Exp(*SE, DL, "induction");
3368 auto Step = ID.getStep();
3369 auto StartValue = ID.getStartValue();
3370 assert(Index->getType()->getScalarType() == Step->getType() &&
3371        "Index scalar type does not match StepValue type");
3372
3373 // Note: the IR at this point is broken. We cannot use SE to create any new
3374 // SCEV and then expand it, hoping that SCEV's simplification will give us
3375 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3376 // lead to various SCEV crashes. So all we can do is use the builder and rely
3377 // on InstCombine for future simplifications. Here we handle some trivial
3378 // cases only.
3379 auto CreateAdd = [&B](Value *X, Value *Y) {
3380 assert(X->getType() == Y->getType() && "Types don't match!");
3381 if (auto *CX = dyn_cast<ConstantInt>(X))
3382 if (CX->isZero())
3383 return Y;
3384 if (auto *CY = dyn_cast<ConstantInt>(Y))
3385 if (CY->isZero())
3386 return X;
3387 return B.CreateAdd(X, Y);
3388 };
3389
3390 // We allow X to be a vector type, in which case Y will potentially be
3391 // splatted into a vector with the same element count.
3392 auto CreateMul = [&B](Value *X, Value *Y) {
3393 assert(X->getType()->getScalarType() == Y->getType() &&
3394        "Types don't match!");
3395 if (auto *CX = dyn_cast<ConstantInt>(X))
3396 if (CX->isOne())
3397 return Y;
3398 if (auto *CY = dyn_cast<ConstantInt>(Y))
3399 if (CY->isOne())
3400 return X;
3401 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3402 if (XVTy && !isa<VectorType>(Y->getType()))
3403 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3404 return B.CreateMul(X, Y);
3405 };
3406
3407 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3408 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3409 // the DomTree is not kept up-to-date for additional blocks generated in the
3410 // vector loop. By using the header as insertion point, we guarantee that the
3411 // expanded instructions dominate all their uses.
3412 auto GetInsertPoint = [this, &B]() {
3413 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3414 if (InsertBB != LoopVectorBody &&
3415 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3416 return LoopVectorBody->getTerminator();
3417 return &*B.GetInsertPoint();
3418 };
3419
3420 switch (ID.getKind()) {
3421 case InductionDescriptor::IK_IntInduction: {
3422 assert(!isa<VectorType>(Index->getType()) &&
3423        "Vector indices not supported for integer inductions yet");
3424 assert(Index->getType() == StartValue->getType() &&
3425        "Index type does not match StartValue type");
3426 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3427 return B.CreateSub(StartValue, Index);
3428 auto *Offset = CreateMul(
3429 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3430 return CreateAdd(StartValue, Offset);
3431 }
3432 case InductionDescriptor::IK_PtrInduction: {
3433 assert(isa<SCEVConstant>(Step) &&
3434        "Expected constant step for pointer induction");
3435 return B.CreateGEP(
3436 StartValue->getType()->getPointerElementType(), StartValue,
3437 CreateMul(Index,
3438 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3439 GetInsertPoint())));
3440 }
3441 case InductionDescriptor::IK_FpInduction: {
3442 assert(!isa<VectorType>(Index->getType()) &&
3443        "Vector indices not supported for FP inductions yet");
3444 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3445 auto InductionBinOp = ID.getInductionBinOp();
3446 assert(InductionBinOp &&
3447        (InductionBinOp->getOpcode() == Instruction::FAdd ||
3448         InductionBinOp->getOpcode() == Instruction::FSub) &&
3449        "Original bin op should be defined for FP induction");
3450
3451 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3452 Value *MulExp = B.CreateFMul(StepValue, Index);
3453 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3454 "induction");
3455 }
3456 case InductionDescriptor::IK_NoInduction:
3457 return nullptr;
3458 }
3459 llvm_unreachable("invalid enum");
3460}
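The integer case above reduces to simple arithmetic; a hedged standalone model (illustrative names):

#include <cstdint>

// Transformed index for an integer induction: StartValue + Index * Step,
// with the constant step -1 folding to StartValue - Index as in the code above.
// FP inductions use the original fadd/fsub instead of the integer add.
int64_t transformedIntIndex(int64_t StartValue, int64_t Index, int64_t Step) {
  if (Step == -1)
    return StartValue - Index;
  return StartValue + Index * Step;
}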
3461
3462Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3463 LoopScalarBody = OrigLoop->getHeader();
3464 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3465 assert(LoopVectorPreHeader && "Invalid loop structure");
3466 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3467 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3468        "multiple exit loop without required epilogue?");
3469
3470 LoopMiddleBlock =
3471 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3472 LI, nullptr, Twine(Prefix) + "middle.block");
3473 LoopScalarPreHeader =
3474 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3475 nullptr, Twine(Prefix) + "scalar.ph");
3476
3477 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3478
3479 // Set up the middle block terminator. Two cases:
3480 // 1) If we know that we must execute the scalar epilogue, emit an
3481 // unconditional branch.
3482 // 2) Otherwise, we must have a single unique exit block (due to how we
3483 // implement the multiple exit case). In this case, set up a conditional
3484 // branch from the middle block to the loop scalar preheader, and the
3485 // exit block. completeLoopSkeleton will update the condition to use an
3486 // iteration check, if required to decide whether to execute the remainder.
3487 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3488 BranchInst::Create(LoopScalarPreHeader) :
3489 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3490 Builder.getTrue());
3491 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3492 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3493
3494 // We intentionally don't let SplitBlock update LoopInfo since
3495 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3496 // LoopVectorBody is explicitly added to the correct place a few lines later.
3497 LoopVectorBody =
3498 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3499 nullptr, nullptr, Twine(Prefix) + "vector.body");
3500
3501 // Update dominator for loop exit.
3502 if (!Cost->requiresScalarEpilogue(VF))
3503 // If there is an epilogue which must run, there's no edge from the
3504 // middle block to exit blocks and thus no need to update the immediate
3505 // dominator of the exit blocks.
3506 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3507
3508 // Create and register the new vector loop.
3509 Loop *Lp = LI->AllocateLoop();
3510 Loop *ParentLoop = OrigLoop->getParentLoop();
3511
3512 // Insert the new loop into the loop nest and register the new basic blocks
3513 // before calling any utilities such as SCEV that require valid LoopInfo.
3514 if (ParentLoop) {
3515 ParentLoop->addChildLoop(Lp);
3516 } else {
3517 LI->addTopLevelLoop(Lp);
3518 }
3519 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3520 return Lp;
3521}
3522
3523void InnerLoopVectorizer::createInductionResumeValues(
3524 Loop *L, Value *VectorTripCount,
3525 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3526 assert(VectorTripCount && L && "Expected valid arguments");
3527 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3528         (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3529        "Inconsistent information about additional bypass.");
3530 // We are going to resume the execution of the scalar loop.
3531 // Go over all of the induction variables that we found and fix the
3532 // PHIs that are left in the scalar version of the loop.
3533 // The starting values of PHI nodes depend on the counter of the last
3534 // iteration in the vectorized loop.
3535 // If we come from a bypass edge then we need to start from the original
3536 // start value.
3537 for (auto &InductionEntry : Legal->getInductionVars()) {
3538 PHINode *OrigPhi = InductionEntry.first;
3539 InductionDescriptor II = InductionEntry.second;
3540
3541 // Create phi nodes to merge from the backedge-taken check block.
3542 PHINode *BCResumeVal =
3543 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3544 LoopScalarPreHeader->getTerminator());
3545 // Copy original phi DL over to the new one.
3546 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3547 Value *&EndValue = IVEndValues[OrigPhi];
3548 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3549 if (OrigPhi == OldInduction) {
3550 // We know what the end value is.
3551 EndValue = VectorTripCount;
3552 } else {
3553 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3554
3555 // Fast-math-flags propagate from the original induction instruction.
3556 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3557 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3558
3559 Type *StepType = II.getStep()->getType();
3560 Instruction::CastOps CastOp =
3561 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3562 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3563 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3564 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3565 EndValue->setName("ind.end");
3566
3567 // Compute the end value for the additional bypass (if applicable).
3568 if (AdditionalBypass.first) {
3569 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3570 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3571 StepType, true);
3572 CRD =
3573 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3574 EndValueFromAdditionalBypass =
3575 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3576 EndValueFromAdditionalBypass->setName("ind.end");
3577 }
3578 }
3579 // The new PHI merges the original incoming value, in case of a bypass,
3580 // or the value at the end of the vectorized loop.
3581 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3582
3583 // Fix the scalar body counter (PHI node).
3584 // The old induction's phi node in the scalar body needs the truncated
3585 // value.
3586 for (BasicBlock *BB : LoopBypassBlocks)
3587 BCResumeVal->addIncoming(II.getStartValue(), BB);
3588
3589 if (AdditionalBypass.first)
3590 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3591 EndValueFromAdditionalBypass);
3592
3593 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3594 }
3595}
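A compact model of what bc.resume.val carries (a sketch under the assumptions spelled out in the comments above, not the IR itself):

#include <cstdint>

// From the middle block the scalar loop resumes at the IV value reached after
// VectorTripCount iterations (Start + VectorTripCount * Step); from any bypass
// block no vector iterations ran, so it resumes at the original start value.
int64_t resumeValue(bool FromMiddleBlock, int64_t Start, int64_t Step,
                    uint64_t VectorTripCount) {
  return FromMiddleBlock ? Start + (int64_t)VectorTripCount * Step : Start;
}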
3596
3597BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3598 MDNode *OrigLoopID) {
3599 assert(L && "Expected valid loop.")(static_cast <bool> (L && "Expected valid loop."
) ? void (0) : __assert_fail ("L && \"Expected valid loop.\""
, "/build/llvm-toolchain-snapshot-13~++20210726100616+dead50d4427c/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3599, __extension__ __PRETTY_FUNCTION__))
;
3600
3601 // The trip counts should be cached by now.
3602 Value *Count = getOrCreateTripCount(L);
3603 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3604
3605 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3606
3607 // Add a check in the middle block to see if we have completed
3608 // all of the iterations in the first vector loop. Three cases:
3609 // 1) If we require a scalar epilogue, there is no conditional branch as
3610 // we unconditionally branch to the scalar preheader. Do nothing.
3611 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3612 // Thus if tail is to be folded, we know we don't need to run the
3613 // remainder and we can use the previous value for the condition (true).
3614 // 3) Otherwise, construct a runtime check.
3615 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3616 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3617 Count, VectorTripCount, "cmp.n",
3618 LoopMiddleBlock->getTerminator());
3619
3620 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3621 // of the corresponding compare because they may have ended up with
3622 // different line numbers and we want to avoid awkward line stepping while
3623 // debugging. Eg. if the compare has got a line number inside the loop.
3624 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3625 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3626 }
3627
3628 // Get ready to start creating new instructions into the vectorized body.
3629 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3630        "Inconsistent vector loop preheader");
3631 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3632
3633 Optional<MDNode *> VectorizedLoopID =
3634 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3635 LLVMLoopVectorizeFollowupVectorized});
3636 if (VectorizedLoopID.hasValue()) {
3637 L->setLoopID(VectorizedLoopID.getValue());
3638
3639 // Do not setAlreadyVectorized if loop attributes have been defined
3640 // explicitly.
3641 return LoopVectorPreHeader;
3642 }
3643
3644 // Keep all loop hints from the original loop on the vector loop (we'll
3645 // replace the vectorizer-specific hints below).
3646 if (MDNode *LID = OrigLoop->getLoopID())
3647 L->setLoopID(LID);
3648
3649 LoopVectorizeHints Hints(L, true, *ORE);
3650 Hints.setAlreadyVectorized();
3651
3652#ifdef EXPENSIVE_CHECKS
3653 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3654 LI->verify(*DT);
3655#endif
3656
3657 return LoopVectorPreHeader;
3658}
3659
3660BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3661 /*
3662 In this function we generate a new loop. The new loop will contain
3663 the vectorized instructions while the old loop will continue to run the
3664 scalar remainder.
3665
3666 [ ] <-- loop iteration number check.
3667 / |
3668 / v
3669 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3670 | / |
3671 | / v
3672 || [ ] <-- vector pre header.
3673 |/ |
3674 | v
3675 | [ ] \
3676 | [ ]_| <-- vector loop.
3677 | |
3678 | v
3679 \ -[ ] <--- middle-block.
3680 \/ |
3681 /\ v
3682 | ->[ ] <--- new preheader.
3683 | |
3684 (opt) v <-- edge from middle to exit iff epilogue is not required.
3685 | [ ] \
3686 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3687 \ |
3688 \ v
3689 >[ ] <-- exit block(s).
3690 ...
3691 */
3692
3693 // Get the metadata of the original loop before it gets modified.
3694 MDNode *OrigLoopID = OrigLoop->getLoopID();
3695
3696 // Workaround! Compute the trip count of the original loop and cache it
3697 // before we start modifying the CFG. This code has a systemic problem
3698 // wherein it tries to run analysis over partially constructed IR; this is
3699 // wrong, and not simply for SCEV. The trip count of the original loop
3700 // simply happens to be prone to hitting this in practice. In theory, we
3701 // can hit the same issue for any SCEV, or ValueTracking query done during
3702 // mutation. See PR49900.
3703 getOrCreateTripCount(OrigLoop);
3704
3705 // Create an empty vector loop, and prepare basic blocks for the runtime
3706 // checks.
3707 Loop *Lp = createVectorLoopSkeleton("");
3708
3709 // Now, compare the new count to zero. If it is zero skip the vector loop and
3710 // jump to the scalar loop. This check also covers the case where the
3711 // backedge-taken count is uint##_max: adding one to it will overflow leading
3712 // to an incorrect trip count of zero. In this (rare) case we will also jump
3713 // to the scalar loop.
3714 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3715
3716 // Generate the code to check any assumptions that we've made for SCEV
3717 // expressions.
3718 emitSCEVChecks(Lp, LoopScalarPreHeader);
3719
3720 // Generate the code that checks at runtime whether arrays overlap. We put the
3721 // checks into a separate block to make the more common case of few elements
3722 // faster.
3723 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3724
3725 // Some loops have a single integer induction variable, while other loops
3726 // don't. One example is c++ iterators that often have multiple pointer
3727 // induction variables. In the code below we also support a case where we
3728 // don't have a single induction variable.
3729 //
3730 // We try to obtain an induction variable from the original loop as hard
3731 // as possible. However if we don't find one that:
3732 // - is an integer
3733 // - counts from zero, stepping by one
3734 // - is the size of the widest induction variable type
3735 // then we create a new one.
3736 OldInduction = Legal->getPrimaryInduction();
3737 Type *IdxTy = Legal->getWidestInductionType();
3738 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3739 // The loop step is equal to the vectorization factor (num of SIMD elements)
3740 // times the unroll factor (num of SIMD instructions).
3741 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3742 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3743 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3744 Induction =
3745 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3746 getDebugLocFromInstOrOperands(OldInduction));
3747
3748 // Emit phis for the new starting index of the scalar loop.
3749 createInductionResumeValues(Lp, CountRoundDown);
3750
3751 return completeLoopSkeleton(Lp, OrigLoopID);
3752}
3753
3754// Fix up external users of the induction variable. At this point, we are
3755// in LCSSA form, with all external PHIs that use the IV having one input value,
3756// coming from the remainder loop. We need those PHIs to also have a correct
3757// value for the IV when arriving directly from the middle block.
3758void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3759 const InductionDescriptor &II,
3760 Value *CountRoundDown, Value *EndValue,
3761 BasicBlock *MiddleBlock) {
3762 // There are two kinds of external IV usages - those that use the value
3763 // computed in the last iteration (the PHI) and those that use the penultimate
3764 // value (the value that feeds into the phi from the loop latch).
3765 // We allow both, but they, obviously, have different values.
3766
3767 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3768
3769 DenseMap<Value *, Value *> MissingVals;
3770
3771 // An external user of the last iteration's value should see the value that
3772 // the remainder loop uses to initialize its own IV.
3773 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3774 for (User *U : PostInc->users()) {
3775 Instruction *UI = cast<Instruction>(U);
3776 if (!OrigLoop->contains(UI)) {
3777 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3778 MissingVals[UI] = EndValue;
3779 }
3780 }
3781
3782 // An external user of the penultimate value needs to see EndValue - Step.
3783 // The simplest way to get this is to recompute it from the constituent SCEVs,
3784 // that is Start + (Step * (CRD - 1)).
3785 for (User *U : OrigPhi->users()) {
3786 auto *UI = cast<Instruction>(U);
3787 if (!OrigLoop->contains(UI)) {
3788 const DataLayout &DL =
3789 OrigLoop->getHeader()->getModule()->getDataLayout();
3790 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3791
3792 IRBuilder<> B(MiddleBlock->getTerminator());
3793
3794 // Fast-math-flags propagate from the original induction instruction.
3795 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3796 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3797
3798 Value *CountMinusOne = B.CreateSub(
3799 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3800 Value *CMO =
3801 !II.getStep()->getType()->isIntegerTy()
3802 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3803 II.getStep()->getType())
3804 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3805 CMO->setName("cast.cmo");
3806 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3807 Escape->setName("ind.escape");
3808 MissingVals[UI] = Escape;
3809 }
3810 }
3811
3812 for (auto &I : MissingVals) {
3813 PHINode *PHI = cast<PHINode>(I.first);
3814 // One corner case we have to handle is two IVs "chasing" each-other,
3815 // that is %IV2 = phi [...], [ %IV1, %latch ]
3816 // In this case, if IV1 has an external use, we need to avoid adding both
3817 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3818 // don't already have an incoming value for the middle block.
3819 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3820 PHI->addIncoming(I.second, MiddleBlock);
3821 }
3822}
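The ind.escape computation above, restated as plain arithmetic (illustrative only):

#include <cstdint>

// External users of the pre-increment IV see the penultimate value,
// recomputed from the constituent pieces as Start + Step * (CRD - 1).
int64_t escapeValue(int64_t Start, int64_t Step, uint64_t CountRoundDown) {
  return Start + Step * (int64_t)(CountRoundDown - 1);
}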
3823
3824namespace {
3825
3826struct CSEDenseMapInfo {
3827 static bool canHandle(const Instruction *I) {
3828 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3829 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3830 }
3831
3832 static inline Instruction *getEmptyKey() {
3833 return DenseMapInfo<Instruction *>::getEmptyKey();
3834 }
3835
3836 static inline Instruction *getTombstoneKey() {
3837 return DenseMapInfo<Instruction *>::getTombstoneKey();
3838 }
3839
3840 static unsigned getHashValue(const Instruction *I) {
3841 assert(canHandle(I) && "Unknown instruction!");
3842 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3843 I->value_op_end()));
3844 }
3845
3846 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3847 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3848 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3849 return LHS == RHS;
3850 return LHS->isIdenticalTo(RHS);
3851 }
3852};
3853
3854} // end anonymous namespace
3855
3856 /// Perform CSE of induction variable instructions.
3857static void cse(BasicBlock *BB) {
3858 // Perform simple cse.
3859 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3860 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3861 Instruction *In = &*I++;
3862
3863 if (!CSEDenseMapInfo::canHandle(In))
3864 continue;
3865
3866 // Check if we can replace this instruction with any of the
3867 // visited instructions.
3868 if (Instruction *V = CSEMap.lookup(In)) {
3869 In->replaceAllUsesWith(V);
3870 In->eraseFromParent();
3871 continue;
3872 }
3873
3874 CSEMap[In] = In;
3875 }
3876}
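The same idea in miniature, with strings standing in for instructions (a sketch, not the LLVM data structures): keep the first occurrence of each key and redirect later duplicates to it.

#include <string>
#include <unordered_map>
#include <vector>

// Returns, for each element, the index of the canonical (first) occurrence.
std::vector<size_t> cseByKey(const std::vector<std::string> &Keys) {
  std::unordered_map<std::string, size_t> Seen;
  std::vector<size_t> Canonical(Keys.size());
  for (size_t I = 0; I != Keys.size(); ++I)
    Canonical[I] = Seen.emplace(Keys[I], I).first->second;
  return Canonical;
}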
3877
3878InstructionCost
3879LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3880 bool &NeedToScalarize) const {
3881 Function *F = CI->getCalledFunction();
3882 Type *ScalarRetTy = CI->getType();
3883 SmallVector<Type *, 4> Tys, ScalarTys;
3884 for (auto &ArgOp : CI->arg_operands())
3885 ScalarTys.push_back(ArgOp->getType());
3886
3887 // Estimate cost of scalarized vector call. The source operands are assumed
3888 // to be vectors, so we need to extract individual elements from there,
3889 // execute VF scalar calls, and then gather the result into the vector return
3890 // value.
3891 InstructionCost ScalarCallCost =
3892 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3893 if (VF.isScalar())
3894 return ScalarCallCost;
3895
3896 // Compute corresponding vector type for return value and arguments.
3897 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3898 for (Type *ScalarTy : ScalarTys)
3899 Tys.push_back(ToVectorTy(ScalarTy, VF));
3900
3901 // Compute costs of unpacking argument values for the scalar calls and
3902 // packing the return values to a vector.
3903 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3904
3905 InstructionCost Cost =
3906 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3907
3908 // If we can't emit a vector call for this function, then the currently found
3909 // cost is the cost we need to return.
3910 NeedToScalarize = true;
3911 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3912 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3913
3914 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3915 return Cost;
3916
3917 // If the corresponding vector cost is cheaper, return its cost.
3918 InstructionCost VectorCallCost =
3919 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3920 if (VectorCallCost < Cost) {
3921 NeedToScalarize = false;
3922 Cost = VectorCallCost;
3923 }
3924 return Cost;
3925}
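
// Worked example (hypothetical numbers, not from the original file): with
// VF = 4, ScalarCallCost = 10 and ScalarizationCost = 6, the scalarized
// estimate is Cost = 10 * 4 + 6 = 46. If TLI/VFDatabase provide a vector
// variant and TTI estimates its VectorCallCost at 20, then 20 < 46, so
// NeedToScalarize is set to false and 20 is returned; otherwise the
// scalarized cost of 46 stands.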
3926
3927static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3928 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3929 return Elt;
3930 return VectorType::get(Elt, VF);
3931}
3932
3933InstructionCost
3934LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3935 ElementCount VF) const {
3936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3937 assert(ID && "Expected intrinsic call!");
3938 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3939 FastMathFlags FMF;
3940 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3941 FMF = FPMO->getFastMathFlags();
3942
3943 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3944 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3945 SmallVector<Type *> ParamTys;
3946 std::transform(FTy->param_begin(), FTy->param_end(),
3947 std::back_inserter(ParamTys),
3948 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3949
3950 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3951 dyn_cast<IntrinsicInst>(CI));
3952 return TTI.getIntrinsicInstrCost(CostAttrs,
3953 TargetTransformInfo::TCK_RecipThroughput);
3954}
3955
3956static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3957 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3958 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3959 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3960}
3961
3962static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3963 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3964 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3965 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3966}
3967
3968void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3969 // For every instruction `I` in MinBWs, truncate the operands, create a
3970 // truncated version of `I` and reextend its result. InstCombine runs
3971 // later and will remove any ext/trunc pairs.
3972 SmallPtrSet<Value *, 4> Erased;
3973 for (const auto &KV : Cost->getMinimalBitwidths()) {
3974 // If the value wasn't vectorized, we must maintain the original scalar
3975 // type. The absence of the value from State indicates that it
3976 // wasn't vectorized.
3977 VPValue *Def = State.Plan->getVPValue(KV.first);
3978 if (!State.hasAnyVectorValue(Def))
3979 continue;
3980 for (unsigned Part = 0; Part < UF; ++Part) {
3981 Value *I = State.get(Def, Part);
3982 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3983 continue;
3984 Type *OriginalTy = I->getType();
3985 Type *ScalarTruncatedTy =
3986 IntegerType::get(OriginalTy->getContext(), KV.second);
3987 auto *TruncatedTy = VectorType::get(
3988 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3989 if (TruncatedTy == OriginalTy)
3990 continue;
3991
3992 IRBuilder<> B(cast<Instruction>(I));
3993 auto ShrinkOperand = [&](Value *V) -> Value * {
3994 if (auto *ZI = dyn_cast<ZExtInst>(V))
3995 if (ZI->getSrcTy() == TruncatedTy)
3996 return ZI->getOperand(0);
3997 return B.CreateZExtOrTrunc(V, TruncatedTy);
3998 };
3999
4000 // The actual instruction modification depends on the instruction type,
4001 // unfortunately.
4002 Value *NewI = nullptr;
4003 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
4004 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
4005 ShrinkOperand(BO->getOperand(1)));
4006
4007 // Any wrapping introduced by shrinking this operation shouldn't be
4008 // considered undefined behavior. So, we can't unconditionally copy
4009 // arithmetic wrapping flags to NewI.
4010 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
4011 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
4012 NewI =
4013 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
4014 ShrinkOperand(CI->getOperand(1)));
4015 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
4016 NewI = B.CreateSelect(SI->getCondition(),
4017 ShrinkOperand(SI->getTrueValue()),
4018 ShrinkOperand(SI->getFalseValue()));
4019 } else if (auto *CI = dyn_cast<CastInst>(I)) {
4020 switch (CI->getOpcode()) {
4021 default:
4022 llvm_unreachable("Unhandled cast!");
4023 case Instruction::Trunc:
4024 NewI = ShrinkOperand(CI->getOperand(0));
4025 break;
4026 case Instruction::SExt:
4027 NewI = B.CreateSExtOrTrunc(
4028 CI->getOperand(0),
4029 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4030 break;
4031 case Instruction::ZExt:
4032 NewI = B.CreateZExtOrTrunc(
4033 CI->getOperand(0),
4034 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4035 break;
4036 }
4037 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4038 auto Elements0 =
4039 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
4040 auto *O0 = B.CreateZExtOrTrunc(
4041 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4042 auto Elements1 =
4043 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
4044 auto *O1 = B.CreateZExtOrTrunc(
4045 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4046
4047 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4048 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4049 // Don't do anything with the operands, just extend the result.
4050 continue;
4051 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4052 auto Elements =
4053 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
4054 auto *O0 = B.CreateZExtOrTrunc(
4055 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4056 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4057 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4058 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4059 auto Elements =
4060 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
4061 auto *O0 = B.CreateZExtOrTrunc(
4062 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4063 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4064 } else {
4065 // If we don't know what to do, be conservative and don't do anything.
4066 continue;
4067 }
4068
4069 // Lastly, extend the result.
4070 NewI->takeName(cast<Instruction>(I));
4071 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4072 I->replaceAllUsesWith(Res);
4073 cast<Instruction>(I)->eraseFromParent();
4074 Erased.insert(I);
4075 State.reset(Def, Res, Part);
4076 }
4077 }
4078
4079 // We'll have created a bunch of ZExts that are now parentless. Clean up.
4080 for (const auto &KV : Cost->getMinimalBitwidths()) {
4081 // If the value wasn't vectorized, we must maintain the original scalar
4082 // type. The absence of the value from State indicates that it
4083 // wasn't vectorized.
4084 VPValue *Def = State.Plan->getVPValue(KV.first);
4085 if (!State.hasAnyVectorValue(Def))
4086 continue;
4087 for (unsigned Part = 0; Part < UF; ++Part) {
4088 Value *I = State.get(Def, Part);
4089 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4090 if (Inst && Inst->use_empty()) {
4091 Value *NewI = Inst->getOperand(0);
4092 Inst->eraseFromParent();
4093 State.reset(Def, NewI, Part);
4094 }
4095 }
4096 }
4097}
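
// Illustrative sketch (hypothetical IR, VF = 4, minimal bit width 8, not part
// of the original file): an add on <4 x i32> whose result only needs 8 bits,
//   %a = add <4 x i32> %x, %y
// is rewritten into a truncated add followed by a re-extension,
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
// and InstCombine later folds away the redundant ext/trunc pairs.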
4098
4099void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4100 // Insert truncates and extends for any truncated instructions as hints to
4101 // InstCombine.
4102 if (VF.isVector())
4103 truncateToMinimalBitwidths(State);
4104
4105 // Fix widened non-induction PHIs by setting up the PHI operands.
4106 if (OrigPHIsToFix.size()) {
4107 assert(EnableVPlanNativePath &&
4108 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4109 fixNonInductionPHIs(State);
4110 }
4111
4112 // At this point every instruction in the original loop is widened to a
4113 // vector form. Now we need to fix the recurrences in the loop. These PHI
4114 // nodes are currently empty because we did not want to introduce cycles.
4115 // This is the second stage of vectorizing recurrences.
4116 fixCrossIterationPHIs(State);
4117
4118 // Forget the original basic block.
4119 PSE.getSE()->forgetLoop(OrigLoop);
4120
4121 // If we inserted an edge from the middle block to the unique exit block,
4122 // update uses outside the loop (phis) to account for the newly inserted
4123 // edge.
4124 if (!Cost->requiresScalarEpilogue(VF)) {
4125 // Fix-up external users of the induction variables.
4126 for (auto &Entry : Legal->getInductionVars())
4127 fixupIVUsers(Entry.first, Entry.second,
4128 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4129 IVEndValues[Entry.first], LoopMiddleBlock);
4130
4131 fixLCSSAPHIs(State);
4132 }
4133
4134 for (Instruction *PI : PredicatedInstructions)
4135 sinkScalarOperands(&*PI);
4136
4137 // Remove redundant induction instructions.
4138 cse(LoopVectorBody);
4139
4140 // Set/update profile weights for the vector and remainder loops as original
4141 // loop iterations are now distributed among them. Note that original loop
4142 // represented by LoopScalarBody becomes remainder loop after vectorization.
4143 //
4144 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4145 // end up getting a slightly roughened result but that should be OK since
4146 // profile is not inherently precise anyway. Note also possible bypass of
4147 // vector code caused by legality checks is ignored, assigning all the weight
4148 // to the vector loop, optimistically.
4149 //
4150 // For scalable vectorization we can't know at compile time how many iterations
4151 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4152 // vscale of '1'.
4153 setProfileInfoAfterUnrolling(
4154 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4155 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4156}
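
// Worked example for the profile update above (hypothetical numbers, not from
// the original file): if the original loop was profiled at 1000 iterations
// and VF = 4, UF = 2, the vector loop is weighted as if it ran roughly
// 1000 / (4 * 2) = 125 times, with the remaining iterations attributed to the
// scalar remainder loop. For scalable VFs the same arithmetic is done with
// vscale pessimistically assumed to be 1.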
4157
4158void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4159 // In order to support recurrences we need to be able to vectorize Phi nodes.
4160 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4161 // stage #2: We now need to fix the recurrences by adding incoming edges to
4162 // the currently empty PHI nodes. At this point every instruction in the
4163 // original loop is widened to a vector form so we can use them to construct
4164 // the incoming edges.
4165 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4166 for (VPRecipeBase &R : Header->phis()) {
4167 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4168 fixReduction(ReductionPhi, State);
4169 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4170 fixFirstOrderRecurrence(FOR, State);
4171 }
4172}
4173
4174void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4175 VPTransformState &State) {
4176 // This is the second phase of vectorizing first-order recurrences. An
4177 // overview of the transformation is described below. Suppose we have the
4178 // following loop.
4179 //
4180 // for (int i = 0; i < n; ++i)
4181 // b[i] = a[i] - a[i - 1];
4182 //
4183 // There is a first-order recurrence on "a". For this loop, the shorthand
4184 // scalar IR looks like:
4185 //
4186 // scalar.ph:
4187 // s_init = a[-1]
4188 // br scalar.body
4189 //
4190 // scalar.body:
4191 // i = phi [0, scalar.ph], [i+1, scalar.body]
4192 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4193 // s2 = a[i]
4194 // b[i] = s2 - s1
4195 // br cond, scalar.body, ...
4196 //
4197 // In this example, s1 is a recurrence because its value depends on the
4198 // previous iteration. In the first phase of vectorization, we created a
4199 // vector phi v1 for s1. We now complete the vectorization and produce the
4200 // shorthand vector IR shown below (for VF = 4, UF = 1).
4201 //
4202 // vector.ph:
4203 // v_init = vector(..., ..., ..., a[-1])
4204 // br vector.body
4205 //
4206 // vector.body
4207 // i = phi [0, vector.ph], [i+4, vector.body]
4208 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4209 // v2 = a[i, i+1, i+2, i+3];
4210 // v3 = vector(v1(3), v2(0, 1, 2))
4211 // b[i, i+1, i+2, i+3] = v2 - v3
4212 // br cond, vector.body, middle.block
4213 //
4214 // middle.block:
4215 // x = v2(3)
4216 // br scalar.ph
4217 //
4218 // scalar.ph:
4219 // s_init = phi [x, middle.block], [a[-1], otherwise]
4220 // br scalar.body
4221 //
4222 // After the vector loop completes execution, we extract the next value of
4223 // the recurrence (x) to use as the initial value in the scalar loop.
4224
4225 auto *IdxTy = Builder.getInt32Ty();
4226 auto *VecPhi = cast<PHINode>(State.get(PhiR, 0));
4227
4228 // Fix the latch value of the new recurrence in the vector loop.
4229 VPValue *PreviousDef = PhiR->getBackedgeValue();
4230 Value *Incoming = State.get(PreviousDef, UF - 1);
4231 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4232
4233 // Extract the last vector element in the middle block. This will be the
4234 // initial value for the recurrence when jumping to the scalar loop.
4235 auto *ExtractForScalar = Incoming;
4236 if (VF.isVector()) {
4237 auto *One = ConstantInt::get(IdxTy, 1);
4238 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4239 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4240 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4241 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4242 "vector.recur.extract");
4243 }
4244 // Extract the second last element in the middle block if the
4245 // Phi is used outside the loop. We need to extract the phi itself
4246 // and not the last element (the phi update in the current iteration). This
4247 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4248 // when the scalar loop is not run at all.
4249 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4250 if (VF.isVector()) {
4251 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4252 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4253 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4254 Incoming, Idx, "vector.recur.extract.for.phi");
4255 } else if (UF > 1)
4256 // When the loop is unrolled without vectorizing, initialize
4257 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
4258 // of `Incoming`. This is analogous to the vectorized case above: extracting
4259 // the second last element when VF > 1.
4260 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4261
4262 // Fix the initial value of the original recurrence in the scalar loop.
4263 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4264 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4265 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4266 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4267 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4268 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4269 Start->addIncoming(Incoming, BB);
4270 }
4271
4272 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4273 Phi->setName("scalar.recur");
4274
4275 // Finally, fix users of the recurrence outside the loop. The users will need
4276 // either the last value of the scalar recurrence or the last value of the
4277 // vector recurrence we extracted in the middle block. Since the loop is in
4278 // LCSSA form, we just need to find all the phi nodes for the original scalar
4279 // recurrence in the exit block, and then add an edge for the middle block.
4280 // Note that LCSSA does not imply single entry when the original scalar loop
4281 // had multiple exiting edges (as we always run the last iteration in the
4282 // scalar epilogue); in that case, there is no edge from middle to exit
4283 // and thus no phis which need to be updated.
4284 if (!Cost->requiresScalarEpilogue(VF))
4285 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4286 if (any_of(LCSSAPhi.incoming_values(),
4287 [Phi](Value *V) { return V == Phi; }))
4288 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4289}
4290
4291void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4292 VPTransformState &State) {
4293 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4294 // Get its reduction variable descriptor.
4295 assert(Legal->isReductionVariable(OrigPhi) &&
4296 "Unable to find the reduction variable");
4297 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4298
4299 RecurKind RK = RdxDesc.getRecurrenceKind();
4300 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4301 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4302 setDebugLocFromInst(ReductionStartValue);
4303
4304 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4305 // This is the vector-clone of the value that leaves the loop.
4306 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4307
4308 // Wrap flags are in general invalid after vectorization, clear them.
4309 clearReductionWrapFlags(RdxDesc, State);
4310
4311 // Fix the vector-loop phi.
4312
4313 // Reductions do not have to start at zero. They can start with
4314 // any loop invariant values.
4315 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4316
4317 unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF;
4318 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
4319 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4320 Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4321 if (PhiR->isOrdered())
4322 Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4323
4324 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4325 }
4326
4327 // Before each round, move the insertion point right between
4328 // the PHIs and the values we are going to write.
4329 // This allows us to write both PHINodes and the extractelement
4330 // instructions.
4331 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4332
4333 setDebugLocFromInst(LoopExitInst);
4334
4335 Type *PhiTy = OrigPhi->getType();
4336 // If tail is folded by masking, the vector value to leave the loop should be
4337 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4338 // instead of the former. For an inloop reduction the reduction will already
4339 // be predicated, and does not need to be handled here.
4340 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4341 for (unsigned Part = 0; Part < UF; ++Part) {
4342 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4343 Value *Sel = nullptr;
4344 for (User *U : VecLoopExitInst->users()) {
4345 if (isa<SelectInst>(U)) {
4346 assert(!Sel && "Reduction exit feeding two selects");
4347 Sel = U;
4348 } else
4349 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4350 }
4351 assert(Sel && "Reduction exit feeds no select");
4352 State.reset(LoopExitInstDef, Sel, Part);
4353
4354 // If the target can create a predicated operator for the reduction at no
4355 // extra cost in the loop (for example a predicated vadd), it can be
4356 // cheaper for the select to remain in the loop than be sunk out of it,
4357 // and so use the select value for the phi instead of the old
4358 // LoopExitValue.
4359 if (PreferPredicatedReductionSelect ||
4360 TTI->preferPredicatedReductionSelect(
4361 RdxDesc.getOpcode(), PhiTy,
4362 TargetTransformInfo::ReductionFlags())) {
4363 auto *VecRdxPhi =
4364 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4365 VecRdxPhi->setIncomingValueForBlock(
4366 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4367 }
4368 }
4369 }
4370
4371 // If the vector reduction can be performed in a smaller type, we truncate
4372 // then extend the loop exit value to enable InstCombine to evaluate the
4373 // entire expression in the smaller type.
4374 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4375 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4376 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4377 Builder.SetInsertPoint(
4378 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4379 VectorParts RdxParts(UF);
4380 for (unsigned Part = 0; Part < UF; ++Part) {
4381 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4382 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4383 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4384 : Builder.CreateZExt(Trunc, VecTy);
4385 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4386 UI != RdxParts[Part]->user_end();)
4387 if (*UI != Trunc) {
4388 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4389 RdxParts[Part] = Extnd;
4390 } else {
4391 ++UI;
4392 }
4393 }
4394 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4395 for (unsigned Part = 0; Part < UF; ++Part) {
4396 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4397 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4398 }
4399 }
4400
4401 // Reduce all of the unrolled parts into a single vector.
4402 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4403 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4404
4405 // The middle block terminator has already been assigned a DebugLoc here (the
4406 // OrigLoop's single latch terminator). We want the whole middle block to
4407 // appear to execute on this line because: (a) it is all compiler generated,
4408 // (b) these instructions are always executed after evaluating the latch
4409 // conditional branch, and (c) other passes may add new predecessors which
4410 // terminate on this line. This is the easiest way to ensure we don't
4411 // accidentally cause an extra step back into the loop while debugging.
4412 setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4413 if (PhiR->isOrdered())
4414 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4415 else {
4416 // Floating-point operations should have some FMF to enable the reduction.
4417 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4418 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4419 for (unsigned Part = 1; Part < UF; ++Part) {
4420 Value *RdxPart = State.get(LoopExitInstDef, Part);
4421 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4422 ReducedPartRdx = Builder.CreateBinOp(
4423 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4424 } else {
4425 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4426 }
4427 }
4428 }
4429
4430 // Create the reduction after the loop. Note that inloop reductions create the
4431 // target reduction in the loop using a Reduction recipe.
4432 if (VF.isVector() && !PhiR->isInLoop()) {
4433 ReducedPartRdx =
4434 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4435 // If the reduction can be performed in a smaller type, we need to extend
4436 // the reduction to the wider type before we branch to the original loop.
4437 if (PhiTy != RdxDesc.getRecurrenceType())
4438 ReducedPartRdx = RdxDesc.isSigned()
4439 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4440 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4441 }
4442
4443 // Create a phi node that merges control-flow from the backedge-taken check
4444 // block and the middle block.
4445 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4446 LoopScalarPreHeader->getTerminator());
4447 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4448 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4449 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4450
4451 // Now, we need to fix the users of the reduction variable
4452 // inside and outside of the scalar remainder loop.
4453
4454 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4455 // in the exit blocks. See comment on analogous loop in
4456 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4457 if (!Cost->requiresScalarEpilogue(VF))
4458 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4459 if (any_of(LCSSAPhi.incoming_values(),
4460 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4461 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4462
4463 // Fix the scalar loop reduction variable with the incoming reduction sum
4464 // from the vector body and from the backedge value.
4465 int IncomingEdgeBlockIdx =
4466 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4467 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4468 // Pick the other block.
4469 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4470 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4471 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4472}
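
// Illustrative sketch (hypothetical IR, UF = 2, VF = 4, integer add
// reduction, not part of the original file):
// middle.block:
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// scalar.ph:
//   %bc.merge.rdx = phi i32 [ %start, %bypass ], [ %rdx, %middle.block ]
// The unrolled parts are combined with "bin.rdx" operations, reduced to a
// scalar by createTargetReduction, and merged with the start value via the
// "bc.merge.rdx" phi in the scalar preheader.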
4473
4474void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4475 VPTransformState &State) {
4476 RecurKind RK = RdxDesc.getRecurrenceKind();
4477 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4478 return;
4479
4480 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4481 assert(LoopExitInstr && "null loop exit instruction");
4482 SmallVector<Instruction *, 8> Worklist;
4483 SmallPtrSet<Instruction *, 8> Visited;
4484 Worklist.push_back(LoopExitInstr);
4485 Visited.insert(LoopExitInstr);
4486
4487 while (!Worklist.empty()) {
4488 Instruction *Cur = Worklist.pop_back_val();
4489 if (isa<OverflowingBinaryOperator>(Cur))
4490 for (unsigned Part = 0; Part < UF; ++Part) {
4491 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4492 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4493 }
4494
4495 for (User *U : Cur->users()) {
4496 Instruction *UI = cast<Instruction>(U);
4497 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4498 Visited.insert(UI).second)
4499 Worklist.push_back(UI);
4500 }
4501 }
4502}
4503
4504void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4505 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4506 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4507 // Some phis were already hand updated by the reduction and recurrence
4508 // code above, leave them alone.
4509 continue;
4510
4511 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4512 // Non-instruction incoming values will have only one value.
4513
4514 VPLane Lane = VPLane::getFirstLane();
4515 if (isa<Instruction>(IncomingValue) &&
4516 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4517 VF))
4518 Lane = VPLane::getLastLaneForVF(VF);
4519
4520 // Can be a loop invariant incoming value or the last scalar value to be
4521 // extracted from the vectorized loop.
4522 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4523 Value *lastIncomingValue =
4524 OrigLoop->isLoopInvariant(IncomingValue)
4525 ? IncomingValue
4526 : State.get(State.Plan->getVPValue(IncomingValue),
4527 VPIteration(UF - 1, Lane));
4528 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4529 }
4530}
4531
4532void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4533 // The basic block and loop containing the predicated instruction.
4534 auto *PredBB = PredInst->getParent();
4535 auto *VectorLoop = LI->getLoopFor(PredBB);
4536
4537 // Initialize a worklist with the operands of the predicated instruction.
4538 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4539
4540 // Holds instructions that we need to analyze again. An instruction may be
4541 // reanalyzed if we don't yet know if we can sink it or not.
4542 SmallVector<Instruction *, 8> InstsToReanalyze;
4543
4544 // Returns true if a given use occurs in the predicated block. Phi nodes use
4545 // their operands in their corresponding predecessor blocks.
4546 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4547 auto *I = cast<Instruction>(U.getUser());
4548 BasicBlock *BB = I->getParent();
4549 if (auto *Phi = dyn_cast<PHINode>(I))
4550 BB = Phi->getIncomingBlock(
4551 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4552 return BB == PredBB;
4553 };
4554
4555 // Iteratively sink the scalarized operands of the predicated instruction
4556 // into the block we created for it. When an instruction is sunk, its
4557 // operands are then added to the worklist. The algorithm ends when a full
4558 // pass through the worklist fails to sink any instruction.
4559 bool Changed;
4560 do {
4561 // Add the instructions that need to be reanalyzed to the worklist, and
4562 // reset the changed indicator.
4563 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4564 InstsToReanalyze.clear();
4565 Changed = false;
4566
4567 while (!Worklist.empty()) {
4568 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4569
4570 // We can't sink an instruction if it is a phi node, is not in the loop,
4571 // or may have side effects.
4572 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4573 I->mayHaveSideEffects())
4574 continue;
4575
4576 // If the instruction is already in PredBB, check if we can sink its
4577 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4578 // sinking the scalar instruction I, hence it appears in PredBB; but it
4579 // may have failed to sink I's operands (recursively), which we try
4580 // (again) here.
4581 if (I->getParent() == PredBB) {
4582 Worklist.insert(I->op_begin(), I->op_end());
4583 continue;
4584 }
4585
4586 // It's legal to sink the instruction if all its uses occur in the
4587 // predicated block. Otherwise, there's nothing to do yet, and we may
4588 // need to reanalyze the instruction.
4589 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4590 InstsToReanalyze.push_back(I);
4591 continue;
4592 }
4593
4594 // Move the instruction to the beginning of the predicated block, and add
4595 // its operands to the worklist.
4596 I->moveBefore(&*PredBB->getFirstInsertionPt());
4597 Worklist.insert(I->op_begin(), I->op_end());
4598
4599 // The sinking may have enabled other instructions to be sunk, so we will
4600 // need to iterate.
4601 Changed = true;
4602 }
4603 } while (Changed);
4604}
4605
4606void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4607 for (PHINode *OrigPhi : OrigPHIsToFix) {
4608 VPWidenPHIRecipe *VPPhi =
4609 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4610 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4611 // Make sure the builder has a valid insert point.
4612 Builder.SetInsertPoint(NewPhi);
4613 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4614 VPValue *Inc = VPPhi->getIncomingValue(i);
4615 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4616 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4617 }
4618 }
4619}
4620
4621bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4622 return Cost->useOrderedReductions(RdxDesc);
4623}
4624
4625void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4626 VPUser &Operands, unsigned UF,
4627 ElementCount VF, bool IsPtrLoopInvariant,
4628 SmallBitVector &IsIndexLoopInvariant,
4629 VPTransformState &State) {
4630 // Construct a vector GEP by widening the operands of the scalar GEP as
4631 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4632 // results in a vector of pointers when at least one operand of the GEP
4633 // is vector-typed. Thus, to keep the representation compact, we only use
4634 // vector-typed operands for loop-varying values.
4635
4636 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4637 // If we are vectorizing, but the GEP has only loop-invariant operands,
4638 // the GEP we build (by only using vector-typed operands for
4639 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4640 // produce a vector of pointers, we need to either arbitrarily pick an
4641 // operand to broadcast, or broadcast a clone of the original GEP.
4642 // Here, we broadcast a clone of the original.
4643 //
4644 // TODO: If at some point we decide to scalarize instructions having
4645 // loop-invariant operands, this special case will no longer be
4646 // required. We would add the scalarization decision to
4647 // collectLoopScalars() and teach getVectorValue() to broadcast
4648 // the lane-zero scalar value.
4649 auto *Clone = Builder.Insert(GEP->clone());
4650 for (unsigned Part = 0; Part < UF; ++Part) {
4651 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4652 State.set(VPDef, EntryPart, Part);
4653 addMetadata(EntryPart, GEP);
4654 }
4655 } else {
4656 // If the GEP has at least one loop-varying operand, we are sure to
4657 // produce a vector of pointers. But if we are only unrolling, we want
4658 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4659 // produce with the code below will be scalar (if VF == 1) or vector
4660 // (otherwise). Note that for the unroll-only case, we still maintain
4661 // values in the vector mapping with initVector, as we do for other
4662 // instructions.
4663 for (unsigned Part = 0; Part < UF; ++Part) {
4664 // The pointer operand of the new GEP. If it's loop-invariant, we
4665 // won't broadcast it.
4666 auto *Ptr = IsPtrLoopInvariant
4667 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4668 : State.get(Operands.getOperand(0), Part);
4669
4670 // Collect all the indices for the new GEP. If any index is
4671 // loop-invariant, we won't broadcast it.
4672 SmallVector<Value *, 4> Indices;
4673 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4674 VPValue *Operand = Operands.getOperand(I);
4675 if (IsIndexLoopInvariant[I - 1])
4676 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4677 else
4678 Indices.push_back(State.get(Operand, Part));
4679 }
4680
4681 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4682 // but it should be a vector, otherwise.
4683 auto *NewGEP =
4684 GEP->isInBounds()
4685 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4686 Indices)
4687 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4688 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4689 "NewGEP is not a pointer vector");
4690 State.set(VPDef, NewGEP, Part);
4691 addMetadata(NewGEP, GEP);
4692 }
4693 }
4694}
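
// Illustrative sketch (hypothetical IR, VF = 4, not part of the original
// file): a scalar GEP with a loop-invariant base and a loop-varying index,
//   %g = getelementptr inbounds i32, i32* %base, i64 %i
// is widened by keeping the base scalar and using the vector index, which
// yields a vector of pointers:
//   %g.vec = getelementptr inbounds i32, i32* %base, <4 x i64> %i.vec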
4695
4696void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4697 VPWidenPHIRecipe *PhiR,
4698 VPTransformState &State) {
4699 PHINode *P = cast<PHINode>(PN);
4700 if (EnableVPlanNativePath) {
4701 // Currently we enter here in the VPlan-native path for non-induction
4702 // PHIs where all control flow is uniform. We simply widen these PHIs.
4703 // Create a vector phi with no operands - the vector phi operands will be
4704 // set at the end of vector code generation.
4705 Type *VecTy = (State.VF.isScalar())
4706 ? PN->getType()
4707 : VectorType::get(PN->getType(), State.VF);
4708 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4709 State.set(PhiR, VecPhi, 0);
4710 OrigPHIsToFix.push_back(P);
4711
4712 return;
4713 }
4714
4715 assert(PN->getParent() == OrigLoop->getHeader() &&
4716 "Non-header phis should have been handled elsewhere");
4717
4718 // In order to support recurrences we need to be able to vectorize Phi nodes.
4719 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4720 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4721 // this value when we vectorize all of the instructions that use the PHI.
4722
4723 assert(!Legal->isReductionVariable(P) &&
4724 "reductions should be handled elsewhere");
4725
4726 setDebugLocFromInst(P);
4727
4728 // This PHINode must be an induction variable.
4729 // Make sure that we know about it.
4730 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4731
4732 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4733 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4734
4735 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4736 // which can be found from the original scalar operations.
4737 switch (II.getKind()) {
4738 case InductionDescriptor::IK_NoInduction:
4739 llvm_unreachable("Unknown induction");
4740 case InductionDescriptor::IK_IntInduction:
4741 case InductionDescriptor::IK_FpInduction:
4742 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4743 case InductionDescriptor::IK_PtrInduction: {
4744 // Handle the pointer induction variable case.
4745 assert(P->getType()->isPointerTy() && "Unexpected type.");
4746
4747 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4748 // This is the normalized GEP that starts counting at zero.
4749 Value *PtrInd =
4750 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4751 // Determine the number of scalars we need to generate for each unroll
4752 // iteration. If the instruction is uniform, we only need to generate the
4753 // first lane. Otherwise, we generate all VF values.
4754 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4755 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4756
4757 bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4758 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4759 if (NeedsVectorIndex) {
4760 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4761 UnitStepVec = Builder.CreateStepVector(VecIVTy);
4762 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4763 }
4764
4765 for (unsigned Part = 0; Part < UF; ++Part) {
4766 Value *PartStart = createStepForVF(
4767 Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4768
4769 if (NeedsVectorIndex) {
4770 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4771 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4772 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4773 Value *SclrGep =
4774 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4775 SclrGep->setName("next.gep");
4776 State.set(PhiR, SclrGep, Part);
4777 // We've cached the whole vector, which means we can support the
4778 // extraction of any lane.
4779 continue;
4780 }
4781
4782 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4783 Value *Idx = Builder.CreateAdd(
4784 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4785 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4786 Value *SclrGep =
4787 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4788 SclrGep->setName("next.gep");
4789 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4790 }
4791 }
4792 return;
4793 }
4794 assert(isa<SCEVConstant>(II.getStep()) &&
4795 "Induction step not a SCEV constant!");
4796 Type *PhiType = II.getStep()->getType();
4797
4798 // Build a pointer phi
4799 Value *ScalarStartValue = II.getStartValue();
4800 Type *ScStValueType = ScalarStartValue->getType();
4801 PHINode *NewPointerPhi =
4802 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4803 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4804
4805 // A pointer induction, performed by using a gep
4806 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4807 Instruction *InductionLoc = LoopLatch->getTerminator();
4808 const SCEV *ScalarStep = II.getStep();
4809 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4810 Value *ScalarStepValue =
4811 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4812 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4813 Value *NumUnrolledElems =
4814 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4815 Value *InductionGEP = GetElementPtrInst::Create(
4816 ScStValueType->getPointerElementType(), NewPointerPhi,
4817 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4818 InductionLoc);
4819 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4820
4821 // Create UF many actual address geps that use the pointer
4822 // phi as base and a vectorized version of the step value
4823 // (<step*0, ..., step*N>) as offset.
4824 for (unsigned Part = 0; Part < State.UF; ++Part) {
4825 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4826 Value *StartOffsetScalar =
4827 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4828 Value *StartOffset =
4829 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4830 // Create a vector of consecutive numbers from zero to VF.
4831 StartOffset =
4832 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4833
4834 Value *GEP = Builder.CreateGEP(
4835 ScStValueType->getPointerElementType(), NewPointerPhi,
4836 Builder.CreateMul(
4837 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4838 "vector.gep"));
4839 State.set(PhiR, GEP, Part);
4840 }
4841 }
4842 }
4843}
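
// Illustrative sketch of the pointer-induction case handled above with a
// pointer phi (hypothetical IR, VF = 4, UF = 1, element-sized step, not part
// of the original file):
// vector.body:
//   %pointer.phi = phi i32* [ %start, %vector.ph ], [ %ptr.ind, %vector.body ]
//   %vector.gep  = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
//   ...
//   %ptr.ind     = getelementptr i32, i32* %pointer.phi, i64 4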
4844
4845/// A helper function for checking whether an integer division-related
4846/// instruction may divide by zero (in which case it must be predicated if
4847/// executed conditionally in the scalar code).
4848/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4849/// Non-zero divisors that are non compile-time constants will not be
4850/// converted into multiplication, so we will still end up scalarizing
4851/// the division, but can do so w/o predication.
4852static bool mayDivideByZero(Instruction &I) {
4853 assert((I.getOpcode() == Instruction::UDiv ||
4854 I.getOpcode() == Instruction::SDiv ||
4855 I.getOpcode() == Instruction::URem ||
4856 I.getOpcode() == Instruction::SRem) &&
4857 "Unexpected instruction");
4858 Value *Divisor = I.getOperand(1);
4859 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4860 return !CInt || CInt->isZero();
4861}
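
// Illustrative examples (hypothetical, not part of the original file):
//   udiv i32 %a, %b   ; variable divisor, may be zero -> must be predicated
//   urem i32 %a, 0    ; constant zero divisor         -> must be predicated
//   sdiv i32 %a, 7    ; non-zero constant divisor     -> safe to scalarize
//                     ;                                  without predication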
4862
4863void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4864 VPUser &User,
4865 VPTransformState &State) {
4866 switch (I.getOpcode()) {
4867 case Instruction::Call:
4868 case Instruction::Br:
4869 case Instruction::PHI:
4870 case Instruction::GetElementPtr:
4871 case Instruction::Select:
4872 llvm_unreachable("This instruction is handled by a different recipe.");
4873 case Instruction::UDiv:
4874 case Instruction::SDiv:
4875 case Instruction::SRem:
4876 case Instruction::URem:
4877 case Instruction::Add:
4878 case Instruction::FAdd:
4879 case Instruction::Sub:
4880 case Instruction::FSub:
4881 case Instruction::FNeg:
4882 case Instruction::Mul:
4883 case Instruction::FMul:
4884 case Instruction::FDiv:
4885 case Instruction::FRem:
4886 case Instruction::Shl:
4887 case Instruction::LShr:
4888 case Instruction::AShr:
4889 case Instruction::And:
4890 case Instruction::Or:
4891 case Instruction::Xor: {
4892 // Just widen unops and binops.
4893 setDebugLocFromInst(&I);
4894
4895 for (unsigned Part = 0; Part < UF; ++Part) {
4896 SmallVector<Value *, 2> Ops;
4897 for (VPValue *VPOp : User.operands())
4898 Ops.push_back(State.get(VPOp, Part));
4899
4900 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4901
4902 if (auto *VecOp = dyn_cast<Instruction>(V))
4903 VecOp->copyIRFlags(&I);
4904
4905 // Use this vector value for all users of the original instruction.
4906 State.set(Def, V, Part);
4907 addMetadata(V, &I);
4908 }
4909
4910 break;
4911 }
4912 case Instruction::ICmp:
4913 case Instruction::FCmp: {
4914 // Widen compares. Generate vector compares.
4915 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4916 auto *Cmp = cast<CmpInst>(&I);
4917 setDebugLocFromInst(Cmp);
4918 for (unsigned Part = 0; Part < UF; ++Part) {
4919 Value *A = State.get(User.getOperand(0), Part);
4920 Value *B = State.get(User.getOperand(1), Part);
4921 Value *C = nullptr;
4922 if (FCmp) {
4923 // Propagate fast math flags.
4924 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4925 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4926 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4927 } else {
4928 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4929 }
4930 State.set(Def, C, Part);
4931 addMetadata(C, &I);
4932 }
4933
4934 break;
4935 }
4936
4937 case Instruction::ZExt:
4938 case Instruction::SExt:
4939 case Instruction::FPToUI:
4940 case Instruction::FPToSI:
4941 case Instruction::FPExt:
4942 case Instruction::PtrToInt:
4943 case Instruction::IntToPtr:
4944 case Instruction::SIToFP:
4945 case Instruction::UIToFP:
4946 case Instruction::Trunc:
4947 case Instruction::FPTrunc:
4948 case Instruction::BitCast: {
4949 auto *CI = cast<CastInst>(&I);
4950 setDebugLocFromInst(CI);
4951
4952 /// Vectorize casts.
4953 Type *DestTy =
4954 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4955
4956 for (unsigned Part = 0; Part < UF; ++Part) {
4957 Value *A = State.get(User.getOperand(0), Part);
4958 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4959 State.set(Def, Cast, Part);
4960 addMetadata(Cast, &I);
4961 }
4962 break;
4963 }
4964 default:
4965 // This instruction is not vectorized by simple widening.
4966 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4967 llvm_unreachable("Unhandled instruction!");
4968 } // end of switch.
4969}
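// A minimal illustrative sketch (not part of the source above; VF = 4, UF = 2
// and the IR names are assumed for illustration): widenInstruction() emits one
// vector instruction per unroll part on the operands recorded in the transform
// state, for example
//
//   scalar:   %s   = add nsw i32 %a, %b
//   widened:  %s.0 = add nsw <4 x i32> %a.0, %b.0
//             %s.1 = add nsw <4 x i32> %a.1, %b.1
//
// copyIRFlags() carries wrapping/fast-math flags over to the vector
// instructions, and State.set() records one vector value per part for later
// users of the original instruction.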
4970
4971void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4972 VPUser &ArgOperands,
4973 VPTransformState &State) {
4974 assert(!isa<DbgInfoIntrinsic>(I) &&
4975 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4976 setDebugLocFromInst(&I);
4977
4978 Module *M = I.getParent()->getParent()->getParent();
4979 auto *CI = cast<CallInst>(&I);
4980
4981 SmallVector<Type *, 4> Tys;
4982 for (Value *ArgOperand : CI->arg_operands())
4983 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4984
4985 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4986
4987 // The flag indicates whether we use an intrinsic or an ordinary call for the
4988 // vectorized version of the instruction, i.e. whether calling the intrinsic
4989 // is more beneficial than calling a library function.
4990 bool NeedToScalarize = false;
4991 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4992 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4993 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4994 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4995 "Instruction should be scalarized elsewhere.");
4996 assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4997 "Either the intrinsic cost or vector call cost must be valid");
4998
4999 for (unsigned Part = 0; Part < UF; ++Part) {
5000 SmallVector<Type *, 2> TysForDecl = {CI->getType()};
5001 SmallVector<Value *, 4> Args;
5002 for (auto &I : enumerate(ArgOperands.operands())) {
5003 // Some intrinsics have a scalar argument - don't replace it with a
5004 // vector.
5005 Value *Arg;
5006 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5007 Arg = State.get(I.value(), Part);
5008 else {
5009 Arg = State.get(I.value(), VPIteration(0, 0));
5010 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5011 TysForDecl.push_back(Arg->getType());
5012 }
5013 Args.push_back(Arg);
5014 }
5015
5016 Function *VectorF;
5017 if (UseVectorIntrinsic) {
5018 // Use vector version of the intrinsic.
5019 if (VF.isVector())
5020 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5021 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5022 assert(VectorF && "Can't retrieve vector intrinsic.");
5023 } else {
5024 // Use vector version of the function call.
5025 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5026#ifndef NDEBUG
5027 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5028 "Can't create vector function.");
5029#endif
5030 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5031 }
5032 SmallVector<OperandBundleDef, 1> OpBundles;
5033 CI->getOperandBundlesAsDefs(OpBundles);
5034 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5035
5036 if (isa<FPMathOperator>(V))
5037 V->copyFastMathFlags(CI);
5038
5039 State.set(Def, V, Part);
5040 addMetadata(V, &I);
5041 }
5042}
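// A minimal illustrative sketch (not part of the source above; the callee and
// VF = 4 are assumed for illustration): when the scalar call maps to a vector
// intrinsic and the intrinsic cost is no worse than the vector call cost, the
// call is widened to the vector form of the intrinsic, e.g.
//
//   scalar:   %r   = call float @llvm.fabs.f32(float %x)
//   widened:  %r.0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x.0)
//
// Otherwise VFDatabase is consulted for a vector library routine matching the
// VFShape, and a plain call to that routine is emitted instead.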
5043
5044void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5045 VPUser &Operands,
5046 bool InvariantCond,
5047 VPTransformState &State) {
5048 setDebugLocFromInst(&I);
5049
5050 // The condition can be loop invariant but still defined inside the
5051 // loop. This means that we can't just use the original 'cond' value.
5052 // We have to take the 'vectorized' value and pick the first lane.
5053 // Instcombine will make this a no-op.
5054 auto *InvarCond = InvariantCond
5055 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5056 : nullptr;
5057
5058 for (unsigned Part = 0; Part < UF; ++Part) {
5059 Value *Cond =
5060 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5061 Value *Op0 = State.get(Operands.getOperand(1), Part);
5062 Value *Op1 = State.get(Operands.getOperand(2), Part);
5063 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5064 State.set(VPDef, Sel, Part);
5065 addMetadata(Sel, &I);
5066 }
5067}
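// A minimal illustrative sketch (not part of the source above; VF = 4 and the
// IR names are assumed): with an invariant condition, lane 0 of the vectorized
// condition drives a scalar-condition select for every unroll part,
//
//   %sel.0 = select i1 %cond.inv, <4 x i32> %a.0, <4 x i32> %b.0
//
// while a loop-varying condition selects per lane,
//
//   %sel.0 = select <4 x i1> %cond.0, <4 x i32> %a.0, <4 x i32> %b.0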
5068
5069void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5070 // We should not collect Scalars more than once per VF. Right now, this
5071 // function is called from collectUniformsAndScalars(), which already does
5072 // this check. Collecting Scalars for VF=1 does not make any sense.
5073 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5074 "This function should not be visited twice for the same VF");
5075
5076 SmallSetVector<Instruction *, 8> Worklist;
5077
5078 // These sets are used to seed the analysis with pointers used by memory
5079 // accesses that will remain scalar.
5080 SmallSetVector<Instruction *, 8> ScalarPtrs;
5081 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5082 auto *Latch = TheLoop->getLoopLatch();
5083
5084 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5085 // The pointer operands of loads and stores will be scalar as long as the
5086 // memory access is not a gather or scatter operation. The value operand of a
5087 // store will remain scalar if the store is scalarized.
5088 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5089 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5090 assert(WideningDecision != CM_Unknown &&
5091 "Widening decision should be ready at this moment");
5092 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5093 if (Ptr == Store->getValueOperand())
5094 return WideningDecision == CM_Scalarize;
5095 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5096 "Ptr is neither a value or pointer operand");
5097 return WideningDecision != CM_GatherScatter;
5098 };
5099
5100 // A helper that returns true if the given value is a bitcast or
5101 // getelementptr instruction contained in the loop.
5102 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5103 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5104 isa<GetElementPtrInst>(V)) &&
5105 !TheLoop->isLoopInvariant(V);
5106 };
5107
5108 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5109 if (!isa<PHINode>(Ptr) ||
5110 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5111 return false;
5112 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5113 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5114 return false;
5115 return isScalarUse(MemAccess, Ptr);
5116 };
5117
5118 // A helper that evaluates a memory access's use of a pointer. If the
5119 // pointer is actually a pointer induction of the loop, it is inserted into
5120 // Worklist. If the use will be a scalar use and the pointer is only used
5121 // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
5122 // pointer is placed in PossibleNonScalarPtrs.
5123 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5124 if (isScalarPtrInduction(MemAccess, Ptr)) {
5125 Worklist.insert(cast<Instruction>(Ptr));
5126 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5127 << "\n");
5128
5129 Instruction *Update = cast<Instruction>(
5130 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5131 ScalarPtrs.insert(Update);
5132 return;
5133 }
5134 // We only care about bitcast and getelementptr instructions contained in
5135 // the loop.
5136 if (!isLoopVaryingBitCastOrGEP(Ptr))
5137 return;
5138
5139 // If the pointer has already been identified as scalar (e.g., if it was
5140 // also identified as uniform), there's nothing to do.
5141 auto *I = cast<Instruction>(Ptr);
5142 if (Worklist.count(I))
5143 return;
5144
5145 // If all users of the pointer will be memory accesses and scalar, place the
5146 // pointer in ScalarPtrs. Otherwise, place the pointer in
5147 // PossibleNonScalarPtrs.
5148 if (llvm::all_of(I->users(), [&](User *U) {
5149 return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
5150 isScalarUse(cast<Instruction>(U), Ptr);
5151 }))
5152 ScalarPtrs.insert(I);
5153 else
5154 PossibleNonScalarPtrs.insert(I);
5155 };
5156
5157 // We seed the scalars analysis with two classes of instructions: (1)
5158 // instructions marked uniform-after-vectorization and (2) bitcast,
5159 // getelementptr and (pointer) phi instructions used by memory accesses
5160 // requiring a scalar use.
5161 //
5162 // (1) Add to the worklist all instructions that have been identified as
5163 // uniform-after-vectorization.
5164 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5165
5166 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5167 // memory accesses requiring a scalar use. The pointer operands of loads and
5168 // stores will be scalar as long as the memory access is not a gather or
5169 // scatter operation. The value operand of a store will remain scalar if the
5170 // store is scalarized.
5171 for (auto *BB : TheLoop->blocks())
5172 for (auto &I : *BB) {
5173 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5174 evaluatePtrUse(Load, Load->getPointerOperand());
5175 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5176 evaluatePtrUse(Store, Store->getPointerOperand());
5177 evaluatePtrUse(Store, Store->getValueOperand());
5178 }
5179 }
5180 for (auto *I : ScalarPtrs)
5181 if (!PossibleNonScalarPtrs.count(I)) {
5182 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5183 Worklist.insert(I);
5184 }
5185
5186 // Insert the forced scalars.
5187 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5188 // induction variable when the PHI user is scalarized.
5189 auto ForcedScalar = ForcedScalars.find(VF);
5190 if (ForcedScalar != ForcedScalars.end())
5191 for (auto *I : ForcedScalar->second)
5192 Worklist.insert(I);
5193
5194 // Expand the worklist by looking through any bitcasts and getelementptr
5195 // instructions we've already identified as scalar. This is similar to the
5196 // expansion step in collectLoopUniforms(); however, here we're only
5197 // expanding to include additional bitcasts and getelementptr instructions.
5198 unsigned Idx = 0;
5199 while (Idx != Worklist.size()) {
5200 Instruction *Dst = Worklist[Idx++];
5201 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5202 continue;
5203 auto *Src = cast<Instruction>(Dst->getOperand(0));
5204 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5205 auto *J = cast<Instruction>(U);
5206 return !TheLoop->contains(J) || Worklist.count(J) ||
5207 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5208 isScalarUse(J, Src));
5209 })) {
5210 Worklist.insert(Src);
5211 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5212 }
5213 }
5214
5215 // An induction variable will remain scalar if all users of the induction
5216 // variable and induction variable update remain scalar.
5217 for (auto &Induction : Legal->getInductionVars()) {
5218 auto *Ind = Induction.first;
5219 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5220
5221 // If tail-folding is applied, the primary induction variable will be used
5222 // to feed a vector compare.
5223 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5224 continue;
5225
5226 // Determine if all users of the induction variable are scalar after
5227 // vectorization.
5228 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5229 auto *I = cast<Instruction>(U);
5230 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5231 });
5232 if (!ScalarInd)
5233 continue;
5234
5235 // Determine if all users of the induction variable update instruction are
5236 // scalar after vectorization.
5237 auto ScalarIndUpdate =
5238 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5239 auto *I = cast<Instruction>(U);
5240 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5241 });
5242 if (!ScalarIndUpdate)
5243 continue;
5244
5245 // The induction variable and its update instruction will remain scalar.
5246 Worklist.insert(Ind);
5247 Worklist.insert(IndUpdate);
5248 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5249 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5250 << "\n");
5251 }
5252
5253 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5254}
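// A minimal illustrative sketch (not part of the source above; the loop is
// assumed for illustration): for a loop such as
//
//   for (i = 0; i < n; ++i) a[i] = b[i] + 1;
//
// where both memory accesses are widened as consecutive loads/stores, the
// GEPs that feed the access pointers are scalar uses, so they may be placed
// in ScalarPtrs, and if every user of the induction variable and of its
// update ends up in the worklist, the induction and its update are added to
// Scalars[VF] as well.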
5255
5256bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5257 if (!blockNeedsPredication(I->getParent()))
5258 return false;
5259 switch(I->getOpcode()) {
5260 default:
5261 break;
5262 case Instruction::Load:
5263 case Instruction::Store: {
5264 if (!Legal->isMaskRequired(I))
5265 return false;
5266 auto *Ptr = getLoadStorePointerOperand(I);
5267 auto *Ty = getLoadStoreType(I);
5268 const Align Alignment = getLoadStoreAlignment(I);
5269 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5270 TTI.isLegalMaskedGather(Ty, Alignment))
5271 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5272 TTI.isLegalMaskedScatter(Ty, Alignment));
5273 }
5274 case Instruction::UDiv:
5275 case Instruction::SDiv:
5276 case Instruction::SRem:
5277 case Instruction::URem:
5278 return mayDivideByZero(*I);
5279 }
5280 return false;
5281}
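// A minimal illustrative sketch (not part of the source above; the snippets
// are assumed for illustration): two cases classified as scalar with
// predication when the enclosing block needs predication,
//
//   if (c[i]) q = x / d[i];   // sdiv/udiv could trap if executed speculatively
//   if (c[i]) p[i] = x;       // store with no legal masked store or scatter
//
// Such instructions are executed per lane behind a branch on the mask rather
// than as a single widened instruction.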
5282
5283bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5284 Instruction *I, ElementCount VF) {
5285 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5286 assert(getWideningDecision(I, VF) == CM_Unknown &&
5287 "Decision should not be set yet.");
5288 auto *Group = getInterleavedAccessGroup(I);
5289 assert(Group && "Must have a group.");
5290
5291 // If the instruction's allocated size doesn't equal its type size, it
5292 // requires padding and will be scalarized.
5293 auto &DL = I->getModule()->getDataLayout();
5294 auto *ScalarTy = getLoadStoreType(I);
5295 if (hasIrregularType(ScalarTy, DL))
5296 return false;
5297
5298 // Check if masking is required.
5299 // A Group may need masking for one of two reasons: it resides in a block that
5300 // needs predication, or it was decided to use masking to deal with gaps.
5301 bool PredicatedAccessRequiresMasking =
5302 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5303 bool AccessWithGapsRequiresMasking =
5304 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5305 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5306 return true;
5307
5308 // If masked interleaving is required, we expect that the user/target had
5309 // enabled it, because otherwise it either wouldn't have been created or
5310 // it should have been invalidated by the CostModel.
5311 assert(useMaskedInterleavedAccesses(TTI) &&
5312 "Masked interleave-groups for predicated accesses are not enabled.");
5313
5314 auto *Ty = getLoadStoreType(I);
5315 const Align Alignment = getLoadStoreAlignment(I);
5316 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5317 : TTI.isLegalMaskedStore(Ty, Alignment);
5318}
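// A minimal illustrative sketch (not part of the source above; the struct and
// access pattern are assumed): an interleave group with a gap, e.g. a loop
// reading only fields x and y of
//
//   struct S { int x, y, z; } s[N];
//
// loads a wide block covering the full interleave stride per group and
// shuffles out the used members. Because the group does not cover the whole
// stride, the wide load may touch memory the scalar loop never reads, so the
// group needs either a scalar epilogue or, when that is not allowed, masking,
// and masking is only acceptable if the target has legal masked loads for the
// element type.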
5319
5320bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5321 Instruction *I, ElementCount VF) {
5322 // Get and ensure we have a valid memory instruction.
5323 LoadInst *LI = dyn_cast<LoadInst>(I);
5324 StoreInst *SI = dyn_cast<StoreInst>(I);
5325 assert((LI || SI) && "Invalid memory instruction");
5326
5327 auto *Ptr = getLoadStorePointerOperand(I);
5328
5329 // First of all, the pointer must be consecutive for the access to be widened.
5330 if (!Legal->isConsecutivePtr(Ptr))
5331 return false;
5332
5333 // If the instruction is a store located in a predicated block, it will be
5334 // scalarized.
5335 if (isScalarWithPredication(I))
5336 return false;
5337
5338 // If the instruction's allocated size doesn't equal its type size, it
5339 // requires padding and will be scalarized.
5340 auto &DL = I->getModule()->getDataLayout();
5341 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5342 if (hasIrregularType(ScalarTy, DL))
5343 return false;
5344
5345 return true;
5346}
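// A minimal illustrative sketch (not part of the source above; the access
// patterns are assumed): only consecutive pointers qualify for widening,
//
//   a[i]      consecutive   -> widenable to a single vector load/store
//   a[2*i]    strided       -> not consecutive; gather/scatter or scalarized
//   a[b[i]]   indexed       -> not consecutive; gather/scatter or scalarized
//
// and even a consecutive access is rejected here if it would be scalarized
// under predication or if its scalar type needs padding in memory.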
5347
5348void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5349 // We should not collect Uniforms more than once per VF. Right now,
5350 // this function is called from collectUniformsAndScalars(), which
5351 // already does this check. Collecting Uniforms for VF=1 does not make any
5352 // sense.
5353
5354 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5355 "This function should not be visited twice for the same VF");
5356
5357 // Visit the list of Uniforms. If we do not find any uniform value, we will
5358 // not analyze it again; Uniforms.count(VF) will still return 1.
5359 Uniforms[VF].clear();
5360
5361 // We now know that the loop is vectorizable!
5362 // Collect instructions inside the loop that will remain uniform after
5363 // vectorization.
5364
5365 // Global values, params and instructions outside of current loop are out of
5366 // scope.
5367 auto isOutOfScope = [&](Value *V) -> bool {
5368 Instruction *I = dyn_cast<Instruction>(V);
5369 return (!I || !TheLoop->contains(I));
5370 };
5371
5372 SetVector<Instruction *> Worklist;
5373 BasicBlock *Latch = TheLoop->getLoopLatch();
5374
5375 // Instructions that are scalar with predication must not be considered
5376 // uniform after vectorization, because that would create an erroneous
5377 // replicating region where only a single instance out of VF should be formed.
5378 // TODO: optimize such seldom cases if found important, see PR40816.
5379 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5380 if (isOutOfScope(I)) {
5381 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5382 << *I << "\n");
5383 return;
5384 }
5385 if (isScalarWithPredication(I)) {
5386 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5387 << *I << "\n");
5388 return;
5389 }
5390 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5391 Worklist.insert(I);
5392 };
5393
5394 // Start with the conditional branch. If the branch condition is an
5395 // instruction contained in the loop that is only used by the branch, it is
5396 // uniform.
5397 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5398 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5399 addToWorklistIfAllowed(Cmp);
5400
5401 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5402 InstWidening WideningDecision = getWideningDecision(I, VF);
5403 assert(WideningDecision != CM_Unknown &&
5404 "Widening decision should be ready at this moment");
5405
5406 // A uniform memory op is itself uniform. We exclude uniform stores
5407 // here as they demand the last lane, not the first one.
5408 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5409 assert(WideningDecision == CM_Scalarize);
5410 return true;
5411 }
5412
5413 return (WideningDecision == CM_Widen ||
5414 WideningDecision == CM_Widen_Reverse ||
5415 WideningDecision == CM_Interleave);
5416 };
5417
5418
5419 // Returns true if Ptr is the pointer operand of a memory access instruction
5420 // I, and I is known to not require scalarization.
5421 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5422 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5423 };
5424
5425 // Holds a list of values which are known to have at least one uniform use.
5426 // Note that there may be other uses which aren't uniform. A "uniform use"
5427 // here is something which only demands lane 0 of the unrolled iterations;
5428 // it does not imply that all lanes produce the same value (e.g. this is not
5429 // the usual meaning of uniform)
5430 SetVector<Value *> HasUniformUse;
5431
5432 // Scan the loop for instructions which are either a) known to have only
5433 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5434 for (auto *BB : TheLoop->blocks())
5435 for (auto &I : *BB) {
5436 // If there's no pointer operand, there's nothing to do.
5437 auto *Ptr = getLoadStorePointerOperand(&I);
5438 if (!Ptr)
5439 continue;
5440
5441 // A uniform memory op is itself uniform. We exclude uniform stores
5442 // here as they dem