Bug Summary

File:llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning:line 8648, column 5
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../x86_64-linux-gnu/include -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-04-05-202135-9119-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/MemorySSA.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/LLVMContext.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/Type.h"
120#include "llvm/IR/Use.h"
121#include "llvm/IR/User.h"
122#include "llvm/IR/Value.h"
123#include "llvm/IR/ValueHandle.h"
124#include "llvm/IR/Verifier.h"
125#include "llvm/InitializePasses.h"
126#include "llvm/Pass.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/InstructionCost.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME"loop-vectorize" "loop-vectorize"
158#define DEBUG_TYPE"loop-vectorize" LV_NAME"loop-vectorize"
159
160#ifndef NDEBUG
161const char VerboseDebug[] = DEBUG_TYPE"loop-vectorize" "-verbose";
162#endif
163
164/// @{
165/// Metadata attribute names
166const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167const char LLVMLoopVectorizeFollowupVectorized[] =
168 "llvm.loop.vectorize.followup_vectorized";
169const char LLVMLoopVectorizeFollowupEpilogue[] =
170 "llvm.loop.vectorize.followup_epilogue";
171/// @}
172
173STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = {"loop-vectorize", "LoopsVectorized"
, "Number of loops vectorized"}
;
174STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = {"loop-vectorize", "LoopsAnalyzed"
, "Number of loops analyzed for vectorization"}
;
175STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized")static llvm::Statistic LoopsEpilogueVectorized = {"loop-vectorize"
, "LoopsEpilogueVectorized", "Number of epilogues vectorized"
}
;
176
177static cl::opt<bool> EnableEpilogueVectorization(
178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179 cl::desc("Enable vectorization of epilogue loops."));
180
181static cl::opt<unsigned> EpilogueVectorizationForceVF(
182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183 cl::desc("When epilogue vectorization is enabled, and a value greater than "
184 "1 is specified, forces the given VF for all applicable epilogue "
185 "loops."));
186
187static cl::opt<unsigned> EpilogueVectorizationMinVF(
188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189 cl::desc("Only loops with vectorization factor equal to or larger than "
190 "the specified value are considered for epilogue vectorization."));
191
192/// Loops with a known constant trip count below this number are vectorized only
193/// if no scalar iteration overheads are incurred.
194static cl::opt<unsigned> TinyTripCountVectorThreshold(
195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196 cl::desc("Loops with a constant trip count that is smaller than this "
197 "value are vectorized only if no scalar iteration overheads "
198 "are incurred."));
199
200static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
201 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
202 cl::desc("The maximum allowed number of runtime memory checks with a "
203 "vectorize(enable) pragma."));
204
205// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206// that predication is preferred, and this lists all options. I.e., the
207// vectorizer will try to fold the tail-loop (epilogue) into the vector body
208// and predicate the instructions accordingly. If tail-folding fails, there are
209// different fallback strategies depending on these values:
210namespace PreferPredicateTy {
211 enum Option {
212 ScalarEpilogue = 0,
213 PredicateElseScalarEpilogue,
214 PredicateOrDontVectorize
215 };
216} // namespace PreferPredicateTy
217
218static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219 "prefer-predicate-over-epilogue",
220 cl::init(PreferPredicateTy::ScalarEpilogue),
221 cl::Hidden,
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
225 "scalar-epilogue",llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
226 "Don't tail-predicate loops, create scalar epilogue")llvm::cl::OptionEnumValue { "scalar-epilogue", int(PreferPredicateTy
::ScalarEpilogue), "Don't tail-predicate loops, create scalar epilogue"
}
,
227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
228 "predicate-else-scalar-epilogue",llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
229 "prefer tail-folding, create scalar epilogue if tail "llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
230 "folding fails.")llvm::cl::OptionEnumValue { "predicate-else-scalar-epilogue",
int(PreferPredicateTy::PredicateElseScalarEpilogue), "prefer tail-folding, create scalar epilogue if tail "
"folding fails." }
,
231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
232 "predicate-dont-vectorize",llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
233 "prefers tail-folding, don't attempt vectorization if "llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
234 "tail-folding fails.")llvm::cl::OptionEnumValue { "predicate-dont-vectorize", int(PreferPredicateTy
::PredicateOrDontVectorize), "prefers tail-folding, don't attempt vectorization if "
"tail-folding fails." }
));
235
236static cl::opt<bool> MaximizeBandwidth(
237 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
238 cl::desc("Maximize bandwidth when selecting vectorization factor which "
239 "will be determined by the smallest type in loop."));
240
241static cl::opt<bool> EnableInterleavedMemAccesses(
242 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
243 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
244
245/// An interleave-group may need masking if it resides in a block that needs
246/// predication, or in order to mask away gaps.
247static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
248 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
249 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
250
251static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
252 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
253 cl::desc("We don't interleave loops with a estimated constant trip count "
254 "below this number"));
255
256static cl::opt<unsigned> ForceTargetNumScalarRegs(
257 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
258 cl::desc("A flag that overrides the target's number of scalar registers."));
259
260static cl::opt<unsigned> ForceTargetNumVectorRegs(
261 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
262 cl::desc("A flag that overrides the target's number of vector registers."));
263
264static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
265 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
266 cl::desc("A flag that overrides the target's max interleave factor for "
267 "scalar loops."));
268
269static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
270 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's max interleave factor for "
272 "vectorized loops."));
273
274static cl::opt<unsigned> ForceTargetInstructionCost(
275 "force-target-instruction-cost", cl::init(0), cl::Hidden,
276 cl::desc("A flag that overrides the target's expected cost for "
277 "an instruction to a single constant value. Mostly "
278 "useful for getting consistent testing."));
279
280static cl::opt<bool> ForceTargetSupportsScalableVectors(
281 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
282 cl::desc(
283 "Pretend that scalable vectors are supported, even if the target does "
284 "not support them. This flag should only be used for testing."));
285
286static cl::opt<unsigned> SmallLoopCost(
287 "small-loop-cost", cl::init(20), cl::Hidden,
288 cl::desc(
289 "The cost of a loop that is considered 'small' by the interleaver."));
290
291static cl::opt<bool> LoopVectorizeWithBlockFrequency(
292 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
293 cl::desc("Enable the use of the block frequency analysis to access PGO "
294 "heuristics minimizing code growth in cold regions and being more "
295 "aggressive in hot regions."));
296
297// Runtime interleave loops for load/store throughput.
298static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
299 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
300 cl::desc(
301 "Enable runtime interleaving until load/store ports are saturated"));
302
303/// Interleave small loops with scalar reductions.
304static cl::opt<bool> InterleaveSmallLoopScalarReduction(
305 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
306 cl::desc("Enable interleaving for loops with small iteration counts that "
307 "contain scalar reductions to expose ILP."));
308
309/// The number of stores in a loop that are allowed to need predication.
310static cl::opt<unsigned> NumberOfStoresToPredicate(
311 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
312 cl::desc("Max number of stores to be predicated behind an if."));
313
314static cl::opt<bool> EnableIndVarRegisterHeur(
315 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
316 cl::desc("Count the induction variable only once when interleaving"));
317
318static cl::opt<bool> EnableCondStoresVectorization(
319 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
320 cl::desc("Enable if predication of stores during vectorization."));
321
322static cl::opt<unsigned> MaxNestedScalarReductionIC(
323 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
324 cl::desc("The maximum interleave count to use when interleaving a scalar "
325 "reduction in a nested loop."));
326
327static cl::opt<bool>
328 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
329 cl::Hidden,
330 cl::desc("Prefer in-loop vector reductions, "
331 "overriding the targets preference."));
332
333static cl::opt<bool> PreferPredicatedReductionSelect(
334 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
335 cl::desc(
336 "Prefer predicating a reduction operation over an after loop select."));
337
338cl::opt<bool> EnableVPlanNativePath(
339 "enable-vplan-native-path", cl::init(false), cl::Hidden,
340 cl::desc("Enable VPlan-native vectorization path with "
341 "support for outer loop vectorization."));
342
343// FIXME: Remove this switch once we have divergence analysis. Currently we
344// assume divergent non-backedge branches when this switch is true.
345cl::opt<bool> EnableVPlanPredication(
346 "enable-vplan-predication", cl::init(false), cl::Hidden,
347 cl::desc("Enable VPlan-native vectorization path predicator with "
348 "support for outer loop vectorization."));
349
350// This flag enables the stress testing of the VPlan H-CFG construction in the
351// VPlan-native vectorization path. It must be used in conjuction with
352// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
353// verification of the H-CFGs built.
354static cl::opt<bool> VPlanBuildStressTest(
355 "vplan-build-stress-test", cl::init(false), cl::Hidden,
356 cl::desc(
357 "Build VPlan for every supported loop nest in the function and bail "
358 "out right after the build (stress test the VPlan H-CFG construction "
359 "in the VPlan-native vectorization path)."));
360
361cl::opt<bool> llvm::EnableLoopInterleaving(
362 "interleave-loops", cl::init(true), cl::Hidden,
363 cl::desc("Enable loop interleaving in Loop vectorization passes"));
364cl::opt<bool> llvm::EnableLoopVectorization(
365 "vectorize-loops", cl::init(true), cl::Hidden,
366 cl::desc("Run the Loop vectorization passes"));
367
368cl::opt<bool> PrintVPlansInDotFormat(
369 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
370 cl::desc("Use dot format instead of plain text when dumping VPlans"));
371
372/// A helper function that returns the type of loaded or stored value.
373static Type *getMemInstValueType(Value *I) {
374 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&(((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Expected Load or Store instruction") ? static_cast<void>
(0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 375, __PRETTY_FUNCTION__))
375 "Expected Load or Store instruction")(((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Expected Load or Store instruction") ? static_cast<void>
(0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 375, __PRETTY_FUNCTION__))
;
376 if (auto *LI = dyn_cast<LoadInst>(I))
377 return LI->getType();
378 return cast<StoreInst>(I)->getValueOperand()->getType();
379}
380
381/// A helper function that returns true if the given type is irregular. The
382/// type is irregular if its allocated size doesn't equal the store size of an
383/// element of the corresponding vector type.
384static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
385 // Determine if an array of N elements of type Ty is "bitcast compatible"
386 // with a <N x Ty> vector.
387 // This is only true if there is no padding between the array elements.
388 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
389}
390
391/// A helper function that returns the reciprocal of the block probability of
392/// predicated blocks. If we return X, we are assuming the predicated block
393/// will execute once for every X iterations of the loop header.
394///
395/// TODO: We should use actual block probability here, if available. Currently,
396/// we always assume predicated blocks have a 50% chance of executing.
397static unsigned getReciprocalPredBlockProb() { return 2; }
398
399/// A helper function that returns an integer or floating-point constant with
400/// value C.
401static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
402 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
403 : ConstantFP::get(Ty, C);
404}
405
406/// Returns "best known" trip count for the specified loop \p L as defined by
407/// the following procedure:
408/// 1) Returns exact trip count if it is known.
409/// 2) Returns expected trip count according to profile data if any.
410/// 3) Returns upper bound estimate if it is known.
411/// 4) Returns None if all of the above failed.
412static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
413 // Check if exact trip count is known.
414 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
415 return ExpectedTC;
416
417 // Check if there is an expected trip count available from profile data.
418 if (LoopVectorizeWithBlockFrequency)
419 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
420 return EstimatedTC;
421
422 // Check if upper bound estimate is known.
423 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
424 return ExpectedTC;
425
426 return None;
427}
428
429// Forward declare GeneratedRTChecks.
430class GeneratedRTChecks;
431
432namespace llvm {
433
434/// InnerLoopVectorizer vectorizes loops which contain only one basic
435/// block to a specified vectorization factor (VF).
436/// This class performs the widening of scalars into vectors, or multiple
437/// scalars. This class also implements the following features:
438/// * It inserts an epilogue loop for handling loops that don't have iteration
439/// counts that are known to be a multiple of the vectorization factor.
440/// * It handles the code generation for reduction variables.
441/// * Scalarization (implementation using scalars) of un-vectorizable
442/// instructions.
443/// InnerLoopVectorizer does not perform any vectorization-legality
444/// checks, and relies on the caller to check for the different legality
445/// aspects. The InnerLoopVectorizer relies on the
446/// LoopVectorizationLegality class to provide information about the induction
447/// and reduction variables that were found to a given vectorization factor.
448class InnerLoopVectorizer {
449public:
450 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
451 LoopInfo *LI, DominatorTree *DT,
452 const TargetLibraryInfo *TLI,
453 const TargetTransformInfo *TTI, AssumptionCache *AC,
454 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
455 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
456 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
457 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
458 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
459 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
460 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
461 PSI(PSI), RTChecks(RTChecks) {
462 // Query this against the original loop and save it here because the profile
463 // of the original loop header may change as the transformation happens.
464 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
465 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
466 }
467
468 virtual ~InnerLoopVectorizer() = default;
469
470 /// Create a new empty loop that will contain vectorized instructions later
471 /// on, while the old loop will be used as the scalar remainder. Control flow
472 /// is generated around the vectorized (and scalar epilogue) loops consisting
473 /// of various checks and bypasses. Return the pre-header block of the new
474 /// loop.
475 /// In the case of epilogue vectorization, this function is overriden to
476 /// handle the more complex control flow around the loops.
477 virtual BasicBlock *createVectorizedLoopSkeleton();
478
479 /// Widen a single instruction within the innermost loop.
480 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
481 VPTransformState &State);
482
483 /// Widen a single call instruction within the innermost loop.
484 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
485 VPTransformState &State);
486
487 /// Widen a single select instruction within the innermost loop.
488 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
489 bool InvariantCond, VPTransformState &State);
490
491 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
492 void fixVectorizedLoop(VPTransformState &State);
493
494 // Return true if any runtime check is added.
495 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
496
497 /// A type for vectorized values in the new loop. Each value from the
498 /// original loop, when vectorized, is represented by UF vector values in the
499 /// new unrolled loop, where UF is the unroll factor.
500 using VectorParts = SmallVector<Value *, 2>;
501
502 /// Vectorize a single GetElementPtrInst based on information gathered and
503 /// decisions taken during planning.
504 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
505 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
506 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
507
508 /// Vectorize a single PHINode in a block. This method handles the induction
509 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
510 /// arbitrary length vectors.
511 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
512 VPValue *StartV, VPValue *Def,
513 VPTransformState &State);
514
515 /// A helper function to scalarize a single Instruction in the innermost loop.
516 /// Generates a sequence of scalar instances for each lane between \p MinLane
517 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
518 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
519 /// Instr's operands.
520 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
521 const VPIteration &Instance, bool IfPredicateInstr,
522 VPTransformState &State);
523
524 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
525 /// is provided, the integer induction variable will first be truncated to
526 /// the corresponding type.
527 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
528 VPValue *Def, VPValue *CastDef,
529 VPTransformState &State);
530
531 /// Construct the vector value of a scalarized value \p V one lane at a time.
532 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
533 VPTransformState &State);
534
535 /// Try to vectorize interleaved access group \p Group with the base address
536 /// given in \p Addr, optionally masking the vector operations if \p
537 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
538 /// values in the vectorized loop.
539 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
540 ArrayRef<VPValue *> VPDefs,
541 VPTransformState &State, VPValue *Addr,
542 ArrayRef<VPValue *> StoredValues,
543 VPValue *BlockInMask = nullptr);
544
545 /// Vectorize Load and Store instructions with the base address given in \p
546 /// Addr, optionally masking the vector operations if \p BlockInMask is
547 /// non-null. Use \p State to translate given VPValues to IR values in the
548 /// vectorized loop.
549 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
550 VPValue *Def, VPValue *Addr,
551 VPValue *StoredValue, VPValue *BlockInMask);
552
553 /// Set the debug location in the builder using the debug location in
554 /// the instruction.
555 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
556
557 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
558 void fixNonInductionPHIs(VPTransformState &State);
559
560 /// Create a broadcast instruction. This method generates a broadcast
561 /// instruction (shuffle) for loop invariant values and for the induction
562 /// value. If this is the induction variable then we extend it to N, N+1, ...
563 /// this is needed because each iteration in the loop corresponds to a SIMD
564 /// element.
565 virtual Value *getBroadcastInstrs(Value *V);
566
567protected:
568 friend class LoopVectorizationPlanner;
569
570 /// A small list of PHINodes.
571 using PhiVector = SmallVector<PHINode *, 4>;
572
573 /// A type for scalarized values in the new loop. Each value from the
574 /// original loop, when scalarized, is represented by UF x VF scalar values
575 /// in the new unrolled loop, where UF is the unroll factor and VF is the
576 /// vectorization factor.
577 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
578
579 /// Set up the values of the IVs correctly when exiting the vector loop.
580 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
581 Value *CountRoundDown, Value *EndValue,
582 BasicBlock *MiddleBlock);
583
584 /// Create a new induction variable inside L.
585 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
586 Value *Step, Instruction *DL);
587
588 /// Handle all cross-iteration phis in the header.
589 void fixCrossIterationPHIs(VPTransformState &State);
590
591 /// Fix a first-order recurrence. This is the second phase of vectorizing
592 /// this phi node.
593 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
594
595 /// Fix a reduction cross-iteration phi. This is the second phase of
596 /// vectorizing this phi node.
597 void fixReduction(PHINode *Phi, VPTransformState &State);
598
599 /// Clear NSW/NUW flags from reduction instructions if necessary.
600 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
601 VPTransformState &State);
602
603 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
604 /// means we need to add the appropriate incoming value from the middle
605 /// block as exiting edges from the scalar epilogue loop (if present) are
606 /// already in place, and we exit the vector loop exclusively to the middle
607 /// block.
608 void fixLCSSAPHIs(VPTransformState &State);
609
610 /// Iteratively sink the scalarized operands of a predicated instruction into
611 /// the block that was created for it.
612 void sinkScalarOperands(Instruction *PredInst);
613
614 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
615 /// represented as.
616 void truncateToMinimalBitwidths(VPTransformState &State);
617
618 /// This function adds
619 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
620 /// to each vector element of Val. The sequence starts at StartIndex.
621 /// \p Opcode is relevant for FP induction variable.
622 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
623 Instruction::BinaryOps Opcode =
624 Instruction::BinaryOpsEnd);
625
626 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
627 /// variable on which to base the steps, \p Step is the size of the step, and
628 /// \p EntryVal is the value from the original loop that maps to the steps.
629 /// Note that \p EntryVal doesn't have to be an induction variable - it
630 /// can also be a truncate instruction.
631 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
632 const InductionDescriptor &ID, VPValue *Def,
633 VPValue *CastDef, VPTransformState &State);
634
635 /// Create a vector induction phi node based on an existing scalar one. \p
636 /// EntryVal is the value from the original loop that maps to the vector phi
637 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
638 /// truncate instruction, instead of widening the original IV, we widen a
639 /// version of the IV truncated to \p EntryVal's type.
640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
641 Value *Step, Value *Start,
642 Instruction *EntryVal, VPValue *Def,
643 VPValue *CastDef,
644 VPTransformState &State);
645
646 /// Returns true if an instruction \p I should be scalarized instead of
647 /// vectorized for the chosen vectorization factor.
648 bool shouldScalarizeInstruction(Instruction *I) const;
649
650 /// Returns true if we should generate a scalar version of \p IV.
651 bool needsScalarInduction(Instruction *IV) const;
652
653 /// If there is a cast involved in the induction variable \p ID, which should
654 /// be ignored in the vectorized loop body, this function records the
655 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
656 /// cast. We had already proved that the casted Phi is equal to the uncasted
657 /// Phi in the vectorized loop (under a runtime guard), and therefore
658 /// there is no need to vectorize the cast - the same value can be used in the
659 /// vector loop for both the Phi and the cast.
660 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
661 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
662 ///
663 /// \p EntryVal is the value from the original loop that maps to the vector
664 /// phi node and is used to distinguish what is the IV currently being
665 /// processed - original one (if \p EntryVal is a phi corresponding to the
666 /// original IV) or the "newly-created" one based on the proof mentioned above
667 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
668 /// latter case \p EntryVal is a TruncInst and we must not record anything for
669 /// that IV, but it's error-prone to expect callers of this routine to care
670 /// about that, hence this explicit parameter.
671 void recordVectorLoopValueForInductionCast(
672 const InductionDescriptor &ID, const Instruction *EntryVal,
673 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
674 unsigned Part, unsigned Lane = UINT_MAX(2147483647 *2U +1U));
675
676 /// Generate a shuffle sequence that will reverse the vector Vec.
677 virtual Value *reverseVector(Value *Vec);
678
679 /// Returns (and creates if needed) the original loop trip count.
680 Value *getOrCreateTripCount(Loop *NewLoop);
681
682 /// Returns (and creates if needed) the trip count of the widened loop.
683 Value *getOrCreateVectorTripCount(Loop *NewLoop);
684
685 /// Returns a bitcasted value to the requested vector type.
686 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
687 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
688 const DataLayout &DL);
689
690 /// Emit a bypass check to see if the vector trip count is zero, including if
691 /// it overflows.
692 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
693
694 /// Emit a bypass check to see if all of the SCEV assumptions we've
695 /// had to make are correct. Returns the block containing the checks or
696 /// nullptr if no checks have been added.
697 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
698
699 /// Emit bypass checks to check any memory assumptions we may have made.
700 /// Returns the block containing the checks or nullptr if no checks have been
701 /// added.
702 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
703
704 /// Compute the transformed value of Index at offset StartValue using step
705 /// StepValue.
706 /// For integer induction, returns StartValue + Index * StepValue.
707 /// For pointer induction, returns StartValue[Index * StepValue].
708 /// FIXME: The newly created binary instructions should contain nsw/nuw
709 /// flags, which can be found from the original scalar operations.
710 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
711 const DataLayout &DL,
712 const InductionDescriptor &ID) const;
713
714 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
715 /// vector loop preheader, middle block and scalar preheader. Also
716 /// allocate a loop object for the new vector loop and return it.
717 Loop *createVectorLoopSkeleton(StringRef Prefix);
718
719 /// Create new phi nodes for the induction variables to resume iteration count
720 /// in the scalar epilogue, from where the vectorized loop left off (given by
721 /// \p VectorTripCount).
722 /// In cases where the loop skeleton is more complicated (eg. epilogue
723 /// vectorization) and the resume values can come from an additional bypass
724 /// block, the \p AdditionalBypass pair provides information about the bypass
725 /// block and the end value on the edge from bypass to this loop.
726 void createInductionResumeValues(
727 Loop *L, Value *VectorTripCount,
728 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
729
730 /// Complete the loop skeleton by adding debug MDs, creating appropriate
731 /// conditional branches in the middle block, preparing the builder and
732 /// running the verifier. Take in the vector loop \p L as argument, and return
733 /// the preheader of the completed vector loop.
734 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
735
736 /// Add additional metadata to \p To that was not present on \p Orig.
737 ///
738 /// Currently this is used to add the noalias annotations based on the
739 /// inserted memchecks. Use this for instructions that are *cloned* into the
740 /// vector loop.
741 void addNewMetadata(Instruction *To, const Instruction *Orig);
742
743 /// Add metadata from one instruction to another.
744 ///
745 /// This includes both the original MDs from \p From and additional ones (\see
746 /// addNewMetadata). Use this for *newly created* instructions in the vector
747 /// loop.
748 void addMetadata(Instruction *To, Instruction *From);
749
750 /// Similar to the previous function but it adds the metadata to a
751 /// vector of instructions.
752 void addMetadata(ArrayRef<Value *> To, Instruction *From);
753
754 /// Allow subclasses to override and print debug traces before/after vplan
755 /// execution, when trace information is requested.
756 virtual void printDebugTracesAtStart(){};
757 virtual void printDebugTracesAtEnd(){};
758
759 /// The original loop.
760 Loop *OrigLoop;
761
762 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
763 /// dynamic knowledge to simplify SCEV expressions and converts them to a
764 /// more usable form.
765 PredicatedScalarEvolution &PSE;
766
767 /// Loop Info.
768 LoopInfo *LI;
769
770 /// Dominator Tree.
771 DominatorTree *DT;
772
773 /// Alias Analysis.
774 AAResults *AA;
775
776 /// Target Library Info.
777 const TargetLibraryInfo *TLI;
778
779 /// Target Transform Info.
780 const TargetTransformInfo *TTI;
781
782 /// Assumption Cache.
783 AssumptionCache *AC;
784
785 /// Interface to emit optimization remarks.
786 OptimizationRemarkEmitter *ORE;
787
788 /// LoopVersioning. It's only set up (non-null) if memchecks were
789 /// used.
790 ///
791 /// This is currently only used to add no-alias metadata based on the
792 /// memchecks. The actually versioning is performed manually.
793 std::unique_ptr<LoopVersioning> LVer;
794
795 /// The vectorization SIMD factor to use. Each vector will have this many
796 /// vector elements.
797 ElementCount VF;
798
799 /// The vectorization unroll factor to use. Each scalar is vectorized to this
800 /// many different vector instructions.
801 unsigned UF;
802
803 /// The builder that we use
804 IRBuilder<> Builder;
805
806 // --- Vectorization state ---
807
808 /// The vector-loop preheader.
809 BasicBlock *LoopVectorPreHeader;
810
811 /// The scalar-loop preheader.
812 BasicBlock *LoopScalarPreHeader;
813
814 /// Middle Block between the vector and the scalar.
815 BasicBlock *LoopMiddleBlock;
816
817 /// The (unique) ExitBlock of the scalar loop. Note that
818 /// there can be multiple exiting edges reaching this block.
819 BasicBlock *LoopExitBlock;
820
821 /// The vector loop body.
822 BasicBlock *LoopVectorBody;
823
824 /// The scalar loop body.
825 BasicBlock *LoopScalarBody;
826
827 /// A list of all bypass blocks. The first block is the entry of the loop.
828 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
829
830 /// The new Induction variable which was added to the new block.
831 PHINode *Induction = nullptr;
832
833 /// The induction variable of the old basic block.
834 PHINode *OldInduction = nullptr;
835
836 /// Store instructions that were predicated.
837 SmallVector<Instruction *, 4> PredicatedInstructions;
838
839 /// Trip count of the original loop.
840 Value *TripCount = nullptr;
841
842 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
843 Value *VectorTripCount = nullptr;
844
845 /// The legality analysis.
846 LoopVectorizationLegality *Legal;
847
848 /// The profitablity analysis.
849 LoopVectorizationCostModel *Cost;
850
851 // Record whether runtime checks are added.
852 bool AddedSafetyChecks = false;
853
854 // Holds the end values for each induction variable. We save the end values
855 // so we can later fix-up the external users of the induction variables.
856 DenseMap<PHINode *, Value *> IVEndValues;
857
858 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
859 // fixed up at the end of vector code generation.
860 SmallVector<PHINode *, 8> OrigPHIsToFix;
861
862 /// BFI and PSI are used to check for profile guided size optimizations.
863 BlockFrequencyInfo *BFI;
864 ProfileSummaryInfo *PSI;
865
866 // Whether this loop should be optimized for size based on profile guided size
867 // optimizatios.
868 bool OptForSizeBasedOnProfile;
869
870 /// Structure to hold information about generated runtime checks, responsible
871 /// for cleaning the checks, if vectorization turns out unprofitable.
872 GeneratedRTChecks &RTChecks;
873};
874
875class InnerLoopUnroller : public InnerLoopVectorizer {
876public:
877 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
878 LoopInfo *LI, DominatorTree *DT,
879 const TargetLibraryInfo *TLI,
880 const TargetTransformInfo *TTI, AssumptionCache *AC,
881 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
882 LoopVectorizationLegality *LVL,
883 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
884 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
885 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
886 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
887 BFI, PSI, Check) {}
888
889private:
890 Value *getBroadcastInstrs(Value *V) override;
891 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
892 Instruction::BinaryOps Opcode =
893 Instruction::BinaryOpsEnd) override;
894 Value *reverseVector(Value *Vec) override;
895};
896
897/// Encapsulate information regarding vectorization of a loop and its epilogue.
898/// This information is meant to be updated and used across two stages of
899/// epilogue vectorization.
900struct EpilogueLoopVectorizationInfo {
901 ElementCount MainLoopVF = ElementCount::getFixed(0);
902 unsigned MainLoopUF = 0;
903 ElementCount EpilogueVF = ElementCount::getFixed(0);
904 unsigned EpilogueUF = 0;
905 BasicBlock *MainLoopIterationCountCheck = nullptr;
906 BasicBlock *EpilogueIterationCountCheck = nullptr;
907 BasicBlock *SCEVSafetyCheck = nullptr;
908 BasicBlock *MemSafetyCheck = nullptr;
909 Value *TripCount = nullptr;
910 Value *VectorTripCount = nullptr;
911
912 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
913 unsigned EUF)
914 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
915 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
916 assert(EUF == 1 &&((EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."
) ? static_cast<void> (0) : __assert_fail ("EUF == 1 && \"A high UF for the epilogue loop is likely not beneficial.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 917, __PRETTY_FUNCTION__))
917 "A high UF for the epilogue loop is likely not beneficial.")((EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."
) ? static_cast<void> (0) : __assert_fail ("EUF == 1 && \"A high UF for the epilogue loop is likely not beneficial.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 917, __PRETTY_FUNCTION__))
;
918 }
919};
920
921/// An extension of the inner loop vectorizer that creates a skeleton for a
922/// vectorized loop that has its epilogue (residual) also vectorized.
923/// The idea is to run the vplan on a given loop twice, firstly to setup the
924/// skeleton and vectorize the main loop, and secondly to complete the skeleton
925/// from the first step and vectorize the epilogue. This is achieved by
926/// deriving two concrete strategy classes from this base class and invoking
927/// them in succession from the loop vectorizer planner.
928class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
929public:
930 InnerLoopAndEpilogueVectorizer(
931 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
932 DominatorTree *DT, const TargetLibraryInfo *TLI,
933 const TargetTransformInfo *TTI, AssumptionCache *AC,
934 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
935 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
936 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
937 GeneratedRTChecks &Checks)
938 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
939 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
940 Checks),
941 EPI(EPI) {}
942
943 // Override this function to handle the more complex control flow around the
944 // three loops.
945 BasicBlock *createVectorizedLoopSkeleton() final override {
946 return createEpilogueVectorizedLoopSkeleton();
947 }
948
949 /// The interface for creating a vectorized skeleton using one of two
950 /// different strategies, each corresponding to one execution of the vplan
951 /// as described above.
952 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
953
954 /// Holds and updates state information required to vectorize the main loop
955 /// and its epilogue in two separate passes. This setup helps us avoid
956 /// regenerating and recomputing runtime safety checks. It also helps us to
957 /// shorten the iteration-count-check path length for the cases where the
958 /// iteration count of the loop is so small that the main vector loop is
959 /// completely skipped.
960 EpilogueLoopVectorizationInfo &EPI;
961};
962
963/// A specialized derived class of inner loop vectorizer that performs
964/// vectorization of *main* loops in the process of vectorizing loops and their
965/// epilogues.
966class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
967public:
968 EpilogueVectorizerMainLoop(
969 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
970 DominatorTree *DT, const TargetLibraryInfo *TLI,
971 const TargetTransformInfo *TTI, AssumptionCache *AC,
972 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
973 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
974 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
975 GeneratedRTChecks &Check)
976 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
977 EPI, LVL, CM, BFI, PSI, Check) {}
978 /// Implements the interface for creating a vectorized skeleton using the
979 /// *main loop* strategy (ie the first pass of vplan execution).
980 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
981
982protected:
983 /// Emits an iteration count bypass check once for the main loop (when \p
984 /// ForEpilogue is false) and once for the epilogue loop (when \p
985 /// ForEpilogue is true).
986 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
987 bool ForEpilogue);
988 void printDebugTracesAtStart() override;
989 void printDebugTracesAtEnd() override;
990};
991
992// A specialized derived class of inner loop vectorizer that performs
993// vectorization of *epilogue* loops in the process of vectorizing loops and
994// their epilogues.
995class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
996public:
997 EpilogueVectorizerEpilogueLoop(
998 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
999 DominatorTree *DT, const TargetLibraryInfo *TLI,
1000 const TargetTransformInfo *TTI, AssumptionCache *AC,
1001 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1002 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1003 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1004 GeneratedRTChecks &Checks)
1005 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1006 EPI, LVL, CM, BFI, PSI, Checks) {}
1007 /// Implements the interface for creating a vectorized skeleton using the
1008 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1009 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1010
1011protected:
1012 /// Emits an iteration count bypass check after the main vector loop has
1013 /// finished to see if there are any iterations left to execute by either
1014 /// the vector epilogue or the scalar epilogue.
1015 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1016 BasicBlock *Bypass,
1017 BasicBlock *Insert);
1018 void printDebugTracesAtStart() override;
1019 void printDebugTracesAtEnd() override;
1020};
1021} // end namespace llvm
1022
1023/// Look for a meaningful debug location on the instruction or it's
1024/// operands.
1025static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1026 if (!I)
1027 return I;
1028
1029 DebugLoc Empty;
1030 if (I->getDebugLoc() != Empty)
1031 return I;
1032
1033 for (Use &Op : I->operands()) {
1034 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1035 if (OpInst->getDebugLoc() != Empty)
1036 return OpInst;
1037 }
1038
1039 return I;
1040}
1041
1042void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1043 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1044 const DILocation *DIL = Inst->getDebugLoc();
1045 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1046 !isa<DbgInfoIntrinsic>(Inst)) {
1047 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1047, __PRETTY_FUNCTION__))
;
1048 auto NewDIL =
1049 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1050 if (NewDIL)
1051 B.SetCurrentDebugLocation(NewDIL.getValue());
1052 else
1053 LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
<< DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false)
1054 << "Failed to create new discriminator: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
<< DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false)
1055 << DIL->getFilename() << " Line: " << DIL->getLine())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
<< DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false)
;
1056 }
1057 else
1058 B.SetCurrentDebugLocation(DIL);
1059 } else
1060 B.SetCurrentDebugLocation(DebugLoc());
1061}
1062
1063/// Write a record \p DebugMsg about vectorization failure to the debug
1064/// output stream. If \p I is passed, it is an instruction that prevents
1065/// vectorization.
1066#ifndef NDEBUG
1067static void debugVectorizationFailure(const StringRef DebugMsg,
1068 Instruction *I) {
1069 dbgs() << "LV: Not vectorizing: " << DebugMsg;
1070 if (I != nullptr)
1071 dbgs() << " " << *I;
1072 else
1073 dbgs() << '.';
1074 dbgs() << '\n';
1075}
1076#endif
1077
1078/// Create an analysis remark that explains why vectorization failed
1079///
1080/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1081/// RemarkName is the identifier for the remark. If \p I is passed it is an
1082/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1083/// the location of the remark. \return the remark object that can be
1084/// streamed to.
1085static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1086 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1087 Value *CodeRegion = TheLoop->getHeader();
1088 DebugLoc DL = TheLoop->getStartLoc();
1089
1090 if (I) {
1091 CodeRegion = I->getParent();
1092 // If there is no debug location attached to the instruction, revert back to
1093 // using the loop's.
1094 if (I->getDebugLoc())
1095 DL = I->getDebugLoc();
1096 }
1097
1098 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1099 R << "loop not vectorized: ";
1100 return R;
1101}
1102
1103/// Return a value for Step multiplied by VF.
1104static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1105 assert(isa<ConstantInt>(Step) && "Expected an integer step")((isa<ConstantInt>(Step) && "Expected an integer step"
) ? static_cast<void> (0) : __assert_fail ("isa<ConstantInt>(Step) && \"Expected an integer step\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1105, __PRETTY_FUNCTION__))
;
1106 Constant *StepVal = ConstantInt::get(
1107 Step->getType(),
1108 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1109 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1110}
1111
1112namespace llvm {
1113
1114/// Return the runtime value for VF.
1115Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1116 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1117 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1118}
1119
1120void reportVectorizationFailure(const StringRef DebugMsg,
1121 const StringRef OREMsg, const StringRef ORETag,
1122 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1123 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { debugVectorizationFailure(DebugMsg, I);
} } while (false)
;
1124 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1125 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1126 ORETag, TheLoop, I) << OREMsg);
1127}
1128
1129} // end namespace llvm
1130
1131#ifndef NDEBUG
1132/// \return string containing a file name and a line # for the given loop.
1133static std::string getDebugLocString(const Loop *L) {
1134 std::string Result;
1135 if (L) {
1136 raw_string_ostream OS(Result);
1137 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1138 LoopDbgLoc.print(OS);
1139 else
1140 // Just print the module name.
1141 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1142 OS.flush();
1143 }
1144 return Result;
1145}
1146#endif
1147
1148void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1149 const Instruction *Orig) {
1150 // If the loop was versioned with memchecks, add the corresponding no-alias
1151 // metadata.
1152 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1153 LVer->annotateInstWithNoAlias(To, Orig);
1154}
1155
1156void InnerLoopVectorizer::addMetadata(Instruction *To,
1157 Instruction *From) {
1158 propagateMetadata(To, From);
1159 addNewMetadata(To, From);
1160}
1161
1162void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1163 Instruction *From) {
1164 for (Value *V : To) {
1165 if (Instruction *I = dyn_cast<Instruction>(V))
1166 addMetadata(I, From);
1167 }
1168}
1169
1170namespace llvm {
1171
1172// Loop vectorization cost-model hints how the scalar epilogue loop should be
1173// lowered.
1174enum ScalarEpilogueLowering {
1175
1176 // The default: allowing scalar epilogues.
1177 CM_ScalarEpilogueAllowed,
1178
1179 // Vectorization with OptForSize: don't allow epilogues.
1180 CM_ScalarEpilogueNotAllowedOptSize,
1181
1182 // A special case of vectorisation with OptForSize: loops with a very small
1183 // trip count are considered for vectorization under OptForSize, thereby
1184 // making sure the cost of their loop body is dominant, free of runtime
1185 // guards and scalar iteration overheads.
1186 CM_ScalarEpilogueNotAllowedLowTripLoop,
1187
1188 // Loop hint predicate indicating an epilogue is undesired.
1189 CM_ScalarEpilogueNotNeededUsePredicate,
1190
1191 // Directive indicating we must either tail fold or not vectorize
1192 CM_ScalarEpilogueNotAllowedUsePredicate
1193};
1194
1195/// LoopVectorizationCostModel - estimates the expected speedups due to
1196/// vectorization.
1197/// In many cases vectorization is not profitable. This can happen because of
1198/// a number of reasons. In this class we mainly attempt to predict the
1199/// expected speedup/slowdowns due to the supported instruction set. We use the
1200/// TargetTransformInfo to query the different backends for the cost of
1201/// different operations.
1202class LoopVectorizationCostModel {
1203public:
1204 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1205 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1206 LoopVectorizationLegality *Legal,
1207 const TargetTransformInfo &TTI,
1208 const TargetLibraryInfo *TLI, DemandedBits *DB,
1209 AssumptionCache *AC,
1210 OptimizationRemarkEmitter *ORE, const Function *F,
1211 const LoopVectorizeHints *Hints,
1212 InterleavedAccessInfo &IAI)
1213 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1214 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1215 Hints(Hints), InterleaveInfo(IAI) {}
1216
1217 /// \return An upper bound for the vectorization factor, or None if
1218 /// vectorization and interleaving should be avoided up front.
1219 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1220
1221 /// \return True if runtime checks are required for vectorization, and false
1222 /// otherwise.
1223 bool runtimeChecksRequired();
1224
1225 /// \return The most profitable vectorization factor and the cost of that VF.
1226 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1227 /// then this vectorization factor will be selected if vectorization is
1228 /// possible.
1229 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1230 VectorizationFactor
1231 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1232 const LoopVectorizationPlanner &LVP);
1233
1234 /// Setup cost-based decisions for user vectorization factor.
1235 void selectUserVectorizationFactor(ElementCount UserVF) {
1236 collectUniformsAndScalars(UserVF);
1237 collectInstsToScalarize(UserVF);
1238 }
1239
1240 /// \return The size (in bits) of the smallest and widest types in the code
1241 /// that needs to be vectorized. We ignore values that remain scalar such as
1242 /// 64 bit loop indices.
1243 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1244
1245 /// \return The desired interleave count.
1246 /// If interleave count has been specified by metadata it will be returned.
1247 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1248 /// are the selected vectorization factor and the cost of the selected VF.
1249 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1250
1251 /// Memory access instruction may be vectorized in more than one way.
1252 /// Form of instruction after vectorization depends on cost.
1253 /// This function takes cost-based decisions for Load/Store instructions
1254 /// and collects them in a map. This decisions map is used for building
1255 /// the lists of loop-uniform and loop-scalar instructions.
1256 /// The calculated cost is saved with widening decision in order to
1257 /// avoid redundant calculations.
1258 void setCostBasedWideningDecision(ElementCount VF);
1259
1260 /// A struct that represents some properties of the register usage
1261 /// of a loop.
1262 struct RegisterUsage {
1263 /// Holds the number of loop invariant values that are used in the loop.
1264 /// The key is ClassID of target-provided register class.
1265 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1266 /// Holds the maximum number of concurrent live intervals in the loop.
1267 /// The key is ClassID of target-provided register class.
1268 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1269 };
1270
1271 /// \return Returns information about the register usages of the loop for the
1272 /// given vectorization factors.
1273 SmallVector<RegisterUsage, 8>
1274 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1275
1276 /// Collect values we want to ignore in the cost model.
1277 void collectValuesToIgnore();
1278
1279 /// Split reductions into those that happen in the loop, and those that happen
1280 /// outside. In loop reductions are collected into InLoopReductionChains.
1281 void collectInLoopReductions();
1282
1283 /// \returns The smallest bitwidth each instruction can be represented with.
1284 /// The vector equivalents of these instructions should be truncated to this
1285 /// type.
1286 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1287 return MinBWs;
1288 }
1289
1290 /// \returns True if it is more profitable to scalarize instruction \p I for
1291 /// vectorization factor \p VF.
1292 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1293 assert(VF.isVector() &&((VF.isVector() && "Profitable to scalarize relevant only for VF > 1."
) ? static_cast<void> (0) : __assert_fail ("VF.isVector() && \"Profitable to scalarize relevant only for VF > 1.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1294, __PRETTY_FUNCTION__))
1294 "Profitable to scalarize relevant only for VF > 1.")((VF.isVector() && "Profitable to scalarize relevant only for VF > 1."
) ? static_cast<void> (0) : __assert_fail ("VF.isVector() && \"Profitable to scalarize relevant only for VF > 1.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1294, __PRETTY_FUNCTION__))
;
1295
1296 // Cost model is not run in the VPlan-native path - return conservative
1297 // result until this changes.
1298 if (EnableVPlanNativePath)
1299 return false;
1300
1301 auto Scalars = InstsToScalarize.find(VF);
1302 assert(Scalars != InstsToScalarize.end() &&((Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"
) ? static_cast<void> (0) : __assert_fail ("Scalars != InstsToScalarize.end() && \"VF not yet analyzed for scalarization profitability\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1303, __PRETTY_FUNCTION__))
1303 "VF not yet analyzed for scalarization profitability")((Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"
) ? static_cast<void> (0) : __assert_fail ("Scalars != InstsToScalarize.end() && \"VF not yet analyzed for scalarization profitability\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1303, __PRETTY_FUNCTION__))
;
1304 return Scalars->second.find(I) != Scalars->second.end();
1305 }
1306
1307 /// Returns true if \p I is known to be uniform after vectorization.
1308 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1309 if (VF.isScalar())
1310 return true;
1311
1312 // Cost model is not run in the VPlan-native path - return conservative
1313 // result until this changes.
1314 if (EnableVPlanNativePath)
1315 return false;
1316
1317 auto UniformsPerVF = Uniforms.find(VF);
1318 assert(UniformsPerVF != Uniforms.end() &&((UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"
) ? static_cast<void> (0) : __assert_fail ("UniformsPerVF != Uniforms.end() && \"VF not yet analyzed for uniformity\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1319, __PRETTY_FUNCTION__))
1319 "VF not yet analyzed for uniformity")((UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"
) ? static_cast<void> (0) : __assert_fail ("UniformsPerVF != Uniforms.end() && \"VF not yet analyzed for uniformity\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1319, __PRETTY_FUNCTION__))
;
1320 return UniformsPerVF->second.count(I);
1321 }
1322
1323 /// Returns true if \p I is known to be scalar after vectorization.
1324 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1325 if (VF.isScalar())
1326 return true;
1327
1328 // Cost model is not run in the VPlan-native path - return conservative
1329 // result until this changes.
1330 if (EnableVPlanNativePath)
1331 return false;
1332
1333 auto ScalarsPerVF = Scalars.find(VF);
1334 assert(ScalarsPerVF != Scalars.end() &&((ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"
) ? static_cast<void> (0) : __assert_fail ("ScalarsPerVF != Scalars.end() && \"Scalar values are not calculated for VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1335, __PRETTY_FUNCTION__))
1335 "Scalar values are not calculated for VF")((ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"
) ? static_cast<void> (0) : __assert_fail ("ScalarsPerVF != Scalars.end() && \"Scalar values are not calculated for VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1335, __PRETTY_FUNCTION__))
;
1336 return ScalarsPerVF->second.count(I);
1337 }
1338
1339 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1340 /// for vectorization factor \p VF.
1341 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1342 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1343 !isProfitableToScalarize(I, VF) &&
1344 !isScalarAfterVectorization(I, VF);
1345 }
1346
1347 /// Decision that was taken during cost calculation for memory instruction.
1348 enum InstWidening {
1349 CM_Unknown,
1350 CM_Widen, // For consecutive accesses with stride +1.
1351 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1352 CM_Interleave,
1353 CM_GatherScatter,
1354 CM_Scalarize
1355 };
1356
1357 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1358 /// instruction \p I and vector width \p VF.
1359 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1360 InstructionCost Cost) {
1361 assert(VF.isVector() && "Expected VF >=2")((VF.isVector() && "Expected VF >=2") ? static_cast
<void> (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1361, __PRETTY_FUNCTION__))
;
1362 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1363 }
1364
1365 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1366 /// interleaving group \p Grp and vector width \p VF.
1367 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1368 ElementCount VF, InstWidening W,
1369 InstructionCost Cost) {
1370 assert(VF.isVector() && "Expected VF >=2")((VF.isVector() && "Expected VF >=2") ? static_cast
<void> (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1370, __PRETTY_FUNCTION__))
;
1371 /// Broadcast this decicion to all instructions inside the group.
1372 /// But the cost will be assigned to one instruction only.
1373 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1374 if (auto *I = Grp->getMember(i)) {
1375 if (Grp->getInsertPos() == I)
1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377 else
1378 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1379 }
1380 }
1381 }
1382
1383 /// Return the cost model decision for the given instruction \p I and vector
1384 /// width \p VF. Return CM_Unknown if this instruction did not pass
1385 /// through the cost modeling.
1386 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1387 assert(VF.isVector() && "Expected VF to be a vector VF")((VF.isVector() && "Expected VF to be a vector VF") ?
static_cast<void> (0) : __assert_fail ("VF.isVector() && \"Expected VF to be a vector VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1387, __PRETTY_FUNCTION__))
;
1388 // Cost model is not run in the VPlan-native path - return conservative
1389 // result until this changes.
1390 if (EnableVPlanNativePath)
1391 return CM_GatherScatter;
1392
1393 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1394 auto Itr = WideningDecisions.find(InstOnVF);
1395 if (Itr == WideningDecisions.end())
1396 return CM_Unknown;
1397 return Itr->second.first;
1398 }
1399
1400 /// Return the vectorization cost for the given instruction \p I and vector
1401 /// width \p VF.
1402 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1403 assert(VF.isVector() && "Expected VF >=2")((VF.isVector() && "Expected VF >=2") ? static_cast
<void> (0) : __assert_fail ("VF.isVector() && \"Expected VF >=2\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1403, __PRETTY_FUNCTION__))
;
1404 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1405 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&((WideningDecisions.find(InstOnVF) != WideningDecisions.end()
&& "The cost is not calculated") ? static_cast<void
> (0) : __assert_fail ("WideningDecisions.find(InstOnVF) != WideningDecisions.end() && \"The cost is not calculated\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1406, __PRETTY_FUNCTION__))
1406 "The cost is not calculated")((WideningDecisions.find(InstOnVF) != WideningDecisions.end()
&& "The cost is not calculated") ? static_cast<void
> (0) : __assert_fail ("WideningDecisions.find(InstOnVF) != WideningDecisions.end() && \"The cost is not calculated\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1406, __PRETTY_FUNCTION__))
;
1407 return WideningDecisions[InstOnVF].second;
1408 }
1409
1410 /// Return True if instruction \p I is an optimizable truncate whose operand
1411 /// is an induction variable. Such a truncate will be removed by adding a new
1412 /// induction variable with the destination type.
1413 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1414 // If the instruction is not a truncate, return false.
1415 auto *Trunc = dyn_cast<TruncInst>(I);
1416 if (!Trunc)
1417 return false;
1418
1419 // Get the source and destination types of the truncate.
1420 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1421 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1422
1423 // If the truncate is free for the given types, return false. Replacing a
1424 // free truncate with an induction variable would add an induction variable
1425 // update instruction to each iteration of the loop. We exclude from this
1426 // check the primary induction variable since it will need an update
1427 // instruction regardless.
1428 Value *Op = Trunc->getOperand(0);
1429 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1430 return false;
1431
1432 // If the truncated value is not an induction variable, return false.
1433 return Legal->isInductionPhi(Op);
1434 }
1435
1436 /// Collects the instructions to scalarize for each predicated instruction in
1437 /// the loop.
1438 void collectInstsToScalarize(ElementCount VF);
1439
1440 /// Collect Uniform and Scalar values for the given \p VF.
1441 /// The sets depend on CM decision for Load/Store instructions
1442 /// that may be vectorized as interleave, gather-scatter or scalarized.
1443 void collectUniformsAndScalars(ElementCount VF) {
1444 // Do the analysis once.
1445 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1446 return;
1447 setCostBasedWideningDecision(VF);
1448 collectLoopUniforms(VF);
1449 collectLoopScalars(VF);
1450 }
1451
1452 /// Returns true if the target machine supports masked store operation
1453 /// for the given \p DataType and kind of access to \p Ptr.
1454 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1455 return Legal->isConsecutivePtr(Ptr) &&
1456 TTI.isLegalMaskedStore(DataType, Alignment);
1457 }
1458
1459 /// Returns true if the target machine supports masked load operation
1460 /// for the given \p DataType and kind of access to \p Ptr.
1461 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1462 return Legal->isConsecutivePtr(Ptr) &&
1463 TTI.isLegalMaskedLoad(DataType, Alignment);
1464 }
1465
1466 /// Returns true if the target machine supports masked scatter operation
1467 /// for the given \p DataType.
1468 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
1469 return TTI.isLegalMaskedScatter(DataType, Alignment);
1470 }
1471
1472 /// Returns true if the target machine supports masked gather operation
1473 /// for the given \p DataType.
1474 bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
1475 return TTI.isLegalMaskedGather(DataType, Alignment);
1476 }
1477
1478 /// Returns true if the target machine can represent \p V as a masked gather
1479 /// or scatter operation.
1480 bool isLegalGatherOrScatter(Value *V) {
1481 bool LI = isa<LoadInst>(V);
1482 bool SI = isa<StoreInst>(V);
1483 if (!LI && !SI)
1484 return false;
1485 auto *Ty = getMemInstValueType(V);
1486 Align Align = getLoadStoreAlignment(V);
1487 return (LI && isLegalMaskedGather(Ty, Align)) ||
1488 (SI && isLegalMaskedScatter(Ty, Align));
1489 }
1490
1491 /// Returns true if the target machine supports all of the reduction
1492 /// variables found for the given VF.
1493 bool canVectorizeReductions(ElementCount VF) {
1494 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1495 RecurrenceDescriptor RdxDesc = Reduction.second;
1496 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1497 }));
1498 }
1499
1500 /// Returns true if \p I is an instruction that will be scalarized with
1501 /// predication. Such instructions include conditional stores and
1502 /// instructions that may divide by zero.
1503 /// If a non-zero VF has been calculated, we check if I will be scalarized
1504 /// predication for that VF.
1505 bool
1506 isScalarWithPredication(Instruction *I,
1507 ElementCount VF = ElementCount::getFixed(1)) const;
1508
1509 // Returns true if \p I is an instruction that will be predicated either
1510 // through scalar predication or masked load/store or masked gather/scatter.
1511 // Superset of instructions that return true for isScalarWithPredication.
1512 bool isPredicatedInst(Instruction *I) {
1513 if (!blockNeedsPredication(I->getParent()))
1514 return false;
1515 // Loads and stores that need some form of masked operation are predicated
1516 // instructions.
1517 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1518 return Legal->isMaskRequired(I);
1519 return isScalarWithPredication(I);
1520 }
1521
1522 /// Returns true if \p I is a memory instruction with consecutive memory
1523 /// access that can be widened.
1524 bool
1525 memoryInstructionCanBeWidened(Instruction *I,
1526 ElementCount VF = ElementCount::getFixed(1));
1527
1528 /// Returns true if \p I is a memory instruction in an interleaved-group
1529 /// of memory accesses that can be vectorized with wide vector loads/stores
1530 /// and shuffles.
1531 bool
1532 interleavedAccessCanBeWidened(Instruction *I,
1533 ElementCount VF = ElementCount::getFixed(1));
1534
1535 /// Check if \p Instr belongs to any interleaved access group.
1536 bool isAccessInterleaved(Instruction *Instr) {
1537 return InterleaveInfo.isInterleaved(Instr);
1538 }
1539
1540 /// Get the interleaved access group that \p Instr belongs to.
1541 const InterleaveGroup<Instruction> *
1542 getInterleavedAccessGroup(Instruction *Instr) {
1543 return InterleaveInfo.getInterleaveGroup(Instr);
1544 }
1545
1546 /// Returns true if we're required to use a scalar epilogue for at least
1547 /// the final iteration of the original loop.
1548 bool requiresScalarEpilogue() const {
1549 if (!isScalarEpilogueAllowed())
1550 return false;
1551 // If we might exit from anywhere but the latch, must run the exiting
1552 // iteration in scalar form.
1553 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1554 return true;
1555 return InterleaveInfo.requiresScalarEpilogue();
1556 }
1557
1558 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1559 /// loop hint annotation.
1560 bool isScalarEpilogueAllowed() const {
1561 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1562 }
1563
1564 /// Returns true if all loop blocks should be masked to fold tail loop.
1565 bool foldTailByMasking() const { return FoldTailByMasking; }
1566
1567 bool blockNeedsPredication(BasicBlock *BB) const {
1568 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1569 }
1570
1571 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1572 /// nodes to the chain of instructions representing the reductions. Uses a
1573 /// MapVector to ensure deterministic iteration order.
1574 using ReductionChainMap =
1575 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1576
1577 /// Return the chain of instructions representing an inloop reduction.
1578 const ReductionChainMap &getInLoopReductionChains() const {
1579 return InLoopReductionChains;
1580 }
1581
1582 /// Returns true if the Phi is part of an inloop reduction.
1583 bool isInLoopReduction(PHINode *Phi) const {
1584 return InLoopReductionChains.count(Phi);
1585 }
1586
1587 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1588 /// with factor VF. Return the cost of the instruction, including
1589 /// scalarization overhead if it's needed.
1590 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1591
1592 /// Estimate cost of a call instruction CI if it were vectorized with factor
1593 /// VF. Return the cost of the instruction, including scalarization overhead
1594 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1595 /// scalarized -
1596 /// i.e. either vector version isn't available, or is too expensive.
1597 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1598 bool &NeedToScalarize) const;
1599
1600 /// Invalidates decisions already taken by the cost model.
1601 void invalidateCostModelingDecisions() {
1602 WideningDecisions.clear();
1603 Uniforms.clear();
1604 Scalars.clear();
1605 }
1606
1607private:
1608 unsigned NumPredStores = 0;
1609
1610 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1611 /// than zero. One is returned if vectorization should best be avoided due
1612 /// to cost.
1613 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1614 ElementCount UserVF);
1615
1616 /// The vectorization cost is a combination of the cost itself and a boolean
1617 /// indicating whether any of the contributing operations will actually
1618 /// operate on
1619 /// vector values after type legalization in the backend. If this latter value
1620 /// is
1621 /// false, then all operations will be scalarized (i.e. no vectorization has
1622 /// actually taken place).
1623 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1624
1625 /// Returns the expected execution cost. The unit of the cost does
1626 /// not matter because we use the 'cost' units to compare different
1627 /// vector widths. The cost that is returned is *not* normalized by
1628 /// the factor width.
1629 VectorizationCostTy expectedCost(ElementCount VF);
1630
1631 /// Returns the execution time cost of an instruction for a given vector
1632 /// width. Vector width of one means scalar.
1633 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1634
1635 /// The cost-computation logic from getInstructionCost which provides
1636 /// the vector type as an output parameter.
1637 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1638 Type *&VectorTy);
1639
1640 /// Return the cost of instructions in an inloop reduction pattern, if I is
1641 /// part of that pattern.
1642 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1643 Type *VectorTy,
1644 TTI::TargetCostKind CostKind);
1645
1646 /// Calculate vectorization cost of memory instruction \p I.
1647 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1648
1649 /// The cost computation for scalarized memory instruction.
1650 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1651
1652 /// The cost computation for interleaving group of memory instructions.
1653 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1654
1655 /// The cost computation for Gather/Scatter instruction.
1656 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1657
1658 /// The cost computation for widening instruction \p I with consecutive
1659 /// memory access.
1660 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1661
1662 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1663 /// Load: scalar load + broadcast.
1664 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1665 /// element)
1666 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1667
1668 /// Estimate the overhead of scalarizing an instruction. This is a
1669 /// convenience wrapper for the type-based getScalarizationOverhead API.
1670 InstructionCost getScalarizationOverhead(Instruction *I,
1671 ElementCount VF) const;
1672
1673 /// Returns whether the instruction is a load or store and will be a emitted
1674 /// as a vector operation.
1675 bool isConsecutiveLoadOrStore(Instruction *I);
1676
1677 /// Returns true if an artificially high cost for emulated masked memrefs
1678 /// should be used.
1679 bool useEmulatedMaskMemRefHack(Instruction *I);
1680
1681 /// Map of scalar integer values to the smallest bitwidth they can be legally
1682 /// represented as. The vector equivalents of these values should be truncated
1683 /// to this type.
1684 MapVector<Instruction *, uint64_t> MinBWs;
1685
1686 /// A type representing the costs for instructions if they were to be
1687 /// scalarized rather than vectorized. The entries are Instruction-Cost
1688 /// pairs.
1689 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1690
1691 /// A set containing all BasicBlocks that are known to present after
1692 /// vectorization as a predicated block.
1693 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1694
1695 /// Records whether it is allowed to have the original scalar loop execute at
1696 /// least once. This may be needed as a fallback loop in case runtime
1697 /// aliasing/dependence checks fail, or to handle the tail/remainder
1698 /// iterations when the trip count is unknown or doesn't divide by the VF,
1699 /// or as a peel-loop to handle gaps in interleave-groups.
1700 /// Under optsize and when the trip count is very small we don't allow any
1701 /// iterations to execute in the scalar loop.
1702 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1703
1704 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1705 bool FoldTailByMasking = false;
1706
1707 /// A map holding scalar costs for different vectorization factors. The
1708 /// presence of a cost for an instruction in the mapping indicates that the
1709 /// instruction will be scalarized when vectorizing with the associated
1710 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1711 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1712
1713 /// Holds the instructions known to be uniform after vectorization.
1714 /// The data is collected per VF.
1715 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1716
1717 /// Holds the instructions known to be scalar after vectorization.
1718 /// The data is collected per VF.
1719 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1720
1721 /// Holds the instructions (address computations) that are forced to be
1722 /// scalarized.
1723 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1724
1725 /// PHINodes of the reductions that should be expanded in-loop along with
1726 /// their associated chains of reduction operations, in program order from top
1727 /// (PHI) to bottom
1728 ReductionChainMap InLoopReductionChains;
1729
1730 /// A Map of inloop reduction operations and their immediate chain operand.
1731 /// FIXME: This can be removed once reductions can be costed correctly in
1732 /// vplan. This was added to allow quick lookup to the inloop operations,
1733 /// without having to loop through InLoopReductionChains.
1734 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1735
1736 /// Returns the expected difference in cost from scalarizing the expression
1737 /// feeding a predicated instruction \p PredInst. The instructions to
1738 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1739 /// non-negative return value implies the expression will be scalarized.
1740 /// Currently, only single-use chains are considered for scalarization.
1741 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1742 ElementCount VF);
1743
1744 /// Collect the instructions that are uniform after vectorization. An
1745 /// instruction is uniform if we represent it with a single scalar value in
1746 /// the vectorized loop corresponding to each vector iteration. Examples of
1747 /// uniform instructions include pointer operands of consecutive or
1748 /// interleaved memory accesses. Note that although uniformity implies an
1749 /// instruction will be scalar, the reverse is not true. In general, a
1750 /// scalarized instruction will be represented by VF scalar values in the
1751 /// vectorized loop, each corresponding to an iteration of the original
1752 /// scalar loop.
1753 void collectLoopUniforms(ElementCount VF);
1754
1755 /// Collect the instructions that are scalar after vectorization. An
1756 /// instruction is scalar if it is known to be uniform or will be scalarized
1757 /// during vectorization. Non-uniform scalarized instructions will be
1758 /// represented by VF values in the vectorized loop, each corresponding to an
1759 /// iteration of the original scalar loop.
1760 void collectLoopScalars(ElementCount VF);
1761
1762 /// Keeps cost model vectorization decision and cost for instructions.
1763 /// Right now it is used for memory instructions only.
1764 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1765 std::pair<InstWidening, InstructionCost>>;
1766
1767 DecisionList WideningDecisions;
1768
1769 /// Returns true if \p V is expected to be vectorized and it needs to be
1770 /// extracted.
1771 bool needsExtract(Value *V, ElementCount VF) const {
1772 Instruction *I = dyn_cast<Instruction>(V);
1773 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1774 TheLoop->isLoopInvariant(I))
1775 return false;
1776
1777 // Assume we can vectorize V (and hence we need extraction) if the
1778 // scalars are not computed yet. This can happen, because it is called
1779 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1780 // the scalars are collected. That should be a safe assumption in most
1781 // cases, because we check if the operands have vectorizable types
1782 // beforehand in LoopVectorizationLegality.
1783 return Scalars.find(VF) == Scalars.end() ||
1784 !isScalarAfterVectorization(I, VF);
1785 };
1786
1787 /// Returns a range containing only operands needing to be extracted.
1788 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1789 ElementCount VF) const {
1790 return SmallVector<Value *, 4>(make_filter_range(
1791 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1792 }
1793
1794 /// Determines if we have the infrastructure to vectorize loop \p L and its
1795 /// epilogue, assuming the main loop is vectorized by \p VF.
1796 bool isCandidateForEpilogueVectorization(const Loop &L,
1797 const ElementCount VF) const;
1798
1799 /// Returns true if epilogue vectorization is considered profitable, and
1800 /// false otherwise.
1801 /// \p VF is the vectorization factor chosen for the original loop.
1802 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1803
1804public:
1805 /// The loop that we evaluate.
1806 Loop *TheLoop;
1807
1808 /// Predicated scalar evolution analysis.
1809 PredicatedScalarEvolution &PSE;
1810
1811 /// Loop Info analysis.
1812 LoopInfo *LI;
1813
1814 /// Vectorization legality.
1815 LoopVectorizationLegality *Legal;
1816
1817 /// Vector target information.
1818 const TargetTransformInfo &TTI;
1819
1820 /// Target Library Info.
1821 const TargetLibraryInfo *TLI;
1822
1823 /// Demanded bits analysis.
1824 DemandedBits *DB;
1825
1826 /// Assumption cache.
1827 AssumptionCache *AC;
1828
1829 /// Interface to emit optimization remarks.
1830 OptimizationRemarkEmitter *ORE;
1831
1832 const Function *TheFunction;
1833
1834 /// Loop Vectorize Hint.
1835 const LoopVectorizeHints *Hints;
1836
1837 /// The interleave access information contains groups of interleaved accesses
1838 /// with the same stride and close to each other.
1839 InterleavedAccessInfo &InterleaveInfo;
1840
1841 /// Values to ignore in the cost model.
1842 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1843
1844 /// Values to ignore in the cost model when VF > 1.
1845 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1846
1847 /// Profitable vector factors.
1848 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1849};
1850} // end namespace llvm
1851
1852/// Helper struct to manage generating runtime checks for vectorization.
1853///
1854/// The runtime checks are created up-front in temporary blocks to allow better
1855/// estimating the cost and un-linked from the existing IR. After deciding to
1856/// vectorize, the checks are moved back. If deciding not to vectorize, the
1857/// temporary blocks are completely removed.
1858class GeneratedRTChecks {
1859 /// Basic block which contains the generated SCEV checks, if any.
1860 BasicBlock *SCEVCheckBlock = nullptr;
1861
1862 /// The value representing the result of the generated SCEV checks. If it is
1863 /// nullptr, either no SCEV checks have been generated or they have been used.
1864 Value *SCEVCheckCond = nullptr;
1865
1866 /// Basic block which contains the generated memory runtime checks, if any.
1867 BasicBlock *MemCheckBlock = nullptr;
1868
1869 /// The value representing the result of the generated memory runtime checks.
1870 /// If it is nullptr, either no memory runtime checks have been generated or
1871 /// they have been used.
1872 Instruction *MemRuntimeCheckCond = nullptr;
1873
1874 DominatorTree *DT;
1875 LoopInfo *LI;
1876
1877 SCEVExpander SCEVExp;
1878 SCEVExpander MemCheckExp;
1879
1880public:
1881 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1882 const DataLayout &DL)
1883 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1884 MemCheckExp(SE, DL, "scev.check") {}
1885
1886 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1887 /// accurately estimate the cost of the runtime checks. The blocks are
1888 /// un-linked from the IR and is added back during vector code generation. If
1889 /// there is no vector code generation, the check blocks are removed
1890 /// completely.
1891 void Create(Loop *L, const LoopAccessInfo &LAI,
1892 const SCEVUnionPredicate &UnionPred) {
1893
1894 BasicBlock *LoopHeader = L->getHeader();
1895 BasicBlock *Preheader = L->getLoopPreheader();
1896
1897 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1898 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1899 // may be used by SCEVExpander. The blocks will be un-linked from their
1900 // predecessors and removed from LI & DT at the end of the function.
1901 if (!UnionPred.isAlwaysTrue()) {
1902 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1903 nullptr, "vector.scevcheck");
1904
1905 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1906 &UnionPred, SCEVCheckBlock->getTerminator());
1907 }
1908
1909 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1910 if (RtPtrChecking.Need) {
1911 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1912 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1913 "vector.memcheck");
1914
1915 std::tie(std::ignore, MemRuntimeCheckCond) =
1916 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1917 RtPtrChecking.getChecks(), MemCheckExp);
1918 assert(MemRuntimeCheckCond &&((MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? static_cast<void> (0)
: __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1920, __PRETTY_FUNCTION__))
1919 "no RT checks generated although RtPtrChecking "((MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? static_cast<void> (0)
: __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1920, __PRETTY_FUNCTION__))
1920 "claimed checks are required")((MemRuntimeCheckCond && "no RT checks generated although RtPtrChecking "
"claimed checks are required") ? static_cast<void> (0)
: __assert_fail ("MemRuntimeCheckCond && \"no RT checks generated although RtPtrChecking \" \"claimed checks are required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1920, __PRETTY_FUNCTION__))
;
1921 }
1922
1923 if (!MemCheckBlock && !SCEVCheckBlock)
1924 return;
1925
1926 // Unhook the temporary block with the checks, update various places
1927 // accordingly.
1928 if (SCEVCheckBlock)
1929 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1930 if (MemCheckBlock)
1931 MemCheckBlock->replaceAllUsesWith(Preheader);
1932
1933 if (SCEVCheckBlock) {
1934 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1935 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1936 Preheader->getTerminator()->eraseFromParent();
1937 }
1938 if (MemCheckBlock) {
1939 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1940 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1941 Preheader->getTerminator()->eraseFromParent();
1942 }
1943
1944 DT->changeImmediateDominator(LoopHeader, Preheader);
1945 if (MemCheckBlock) {
1946 DT->eraseNode(MemCheckBlock);
1947 LI->removeBlock(MemCheckBlock);
1948 }
1949 if (SCEVCheckBlock) {
1950 DT->eraseNode(SCEVCheckBlock);
1951 LI->removeBlock(SCEVCheckBlock);
1952 }
1953 }
1954
1955 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1956 /// unused.
1957 ~GeneratedRTChecks() {
1958 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1959 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1960 if (!SCEVCheckCond)
1961 SCEVCleaner.markResultUsed();
1962
1963 if (!MemRuntimeCheckCond)
1964 MemCheckCleaner.markResultUsed();
1965
1966 if (MemRuntimeCheckCond) {
1967 auto &SE = *MemCheckExp.getSE();
1968 // Memory runtime check generation creates compares that use expanded
1969 // values. Remove them before running the SCEVExpanderCleaners.
1970 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1971 if (MemCheckExp.isInsertedInstruction(&I))
1972 continue;
1973 SE.forgetValue(&I);
1974 SE.eraseValueFromMap(&I);
1975 I.eraseFromParent();
1976 }
1977 }
1978 MemCheckCleaner.cleanup();
1979 SCEVCleaner.cleanup();
1980
1981 if (SCEVCheckCond)
1982 SCEVCheckBlock->eraseFromParent();
1983 if (MemRuntimeCheckCond)
1984 MemCheckBlock->eraseFromParent();
1985 }
1986
1987 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
1988 /// adjusts the branches to branch to the vector preheader or \p Bypass,
1989 /// depending on the generated condition.
1990 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
1991 BasicBlock *LoopVectorPreHeader,
1992 BasicBlock *LoopExitBlock) {
1993 if (!SCEVCheckCond)
1994 return nullptr;
1995 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
1996 if (C->isZero())
1997 return nullptr;
1998
1999 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2000
2001 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2002 // Create new preheader for vector loop.
2003 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2004 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2005
2006 SCEVCheckBlock->getTerminator()->eraseFromParent();
2007 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2008 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2009 SCEVCheckBlock);
2010
2011 DT->addNewBlock(SCEVCheckBlock, Pred);
2012 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2013
2014 ReplaceInstWithInst(
2015 SCEVCheckBlock->getTerminator(),
2016 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2017 // Mark the check as used, to prevent it from being removed during cleanup.
2018 SCEVCheckCond = nullptr;
2019 return SCEVCheckBlock;
2020 }
2021
2022 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2023 /// the branches to branch to the vector preheader or \p Bypass, depending on
2024 /// the generated condition.
2025 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2026 BasicBlock *LoopVectorPreHeader) {
2027 // Check if we generated code that checks in runtime if arrays overlap.
2028 if (!MemRuntimeCheckCond)
2029 return nullptr;
2030
2031 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2032 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2033 MemCheckBlock);
2034
2035 DT->addNewBlock(MemCheckBlock, Pred);
2036 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2037 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2038
2039 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2040 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2041
2042 ReplaceInstWithInst(
2043 MemCheckBlock->getTerminator(),
2044 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2045 MemCheckBlock->getTerminator()->setDebugLoc(
2046 Pred->getTerminator()->getDebugLoc());
2047
2048 // Mark the check as used, to prevent it from being removed during cleanup.
2049 MemRuntimeCheckCond = nullptr;
2050 return MemCheckBlock;
2051 }
2052};
2053
2054// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2055// vectorization. The loop needs to be annotated with #pragma omp simd
2056// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2057// vector length information is not provided, vectorization is not considered
2058// explicit. Interleave hints are not allowed either. These limitations will be
2059// relaxed in the future.
2060// Please, note that we are currently forced to abuse the pragma 'clang
2061// vectorize' semantics. This pragma provides *auto-vectorization hints*
2062// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2063// provides *explicit vectorization hints* (LV can bypass legal checks and
2064// assume that vectorization is legal). However, both hints are implemented
2065// using the same metadata (llvm.loop.vectorize, processed by
2066// LoopVectorizeHints). This will be fixed in the future when the native IR
2067// representation for pragma 'omp simd' is introduced.
2068static bool isExplicitVecOuterLoop(Loop *OuterLp,
2069 OptimizationRemarkEmitter *ORE) {
2070 assert(!OuterLp->isInnermost() && "This is not an outer loop")((!OuterLp->isInnermost() && "This is not an outer loop"
) ? static_cast<void> (0) : __assert_fail ("!OuterLp->isInnermost() && \"This is not an outer loop\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2070, __PRETTY_FUNCTION__))
;
2071 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2072
2073 // Only outer loops with an explicit vectorization hint are supported.
2074 // Unannotated outer loops are ignored.
2075 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2076 return false;
2077
2078 Function *Fn = OuterLp->getHeader()->getParent();
2079 if (!Hints.allowVectorization(Fn, OuterLp,
2080 true /*VectorizeOnlyWhenForced*/)) {
2081 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false)
;
2082 return false;
2083 }
2084
2085 if (Hints.getInterleave() > 1) {
2086 // TODO: Interleave support is future work.
2087 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
2088 "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
;
2089 Hints.emitRemarkWithHints();
2090 return false;
2091 }
2092
2093 return true;
2094}
2095
2096static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2097 OptimizationRemarkEmitter *ORE,
2098 SmallVectorImpl<Loop *> &V) {
2099 // Collect inner loops and outer loops without irreducible control flow. For
2100 // now, only collect outer loops that have explicit vectorization hints. If we
2101 // are stress testing the VPlan H-CFG construction, we collect the outermost
2102 // loop of every loop nest.
2103 if (L.isInnermost() || VPlanBuildStressTest ||
2104 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2105 LoopBlocksRPO RPOT(&L);
2106 RPOT.perform(LI);
2107 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2108 V.push_back(&L);
2109 // TODO: Collect inner loops inside marked outer loops in case
2110 // vectorization fails for the outer loop. Do not invoke
2111 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2112 // already known to be reducible. We can use an inherited attribute for
2113 // that.
2114 return;
2115 }
2116 }
2117 for (Loop *InnerL : L)
2118 collectSupportedLoops(*InnerL, LI, ORE, V);
2119}
2120
2121namespace {
2122
2123/// The LoopVectorize Pass.
2124struct LoopVectorize : public FunctionPass {
2125 /// Pass identification, replacement for typeid
2126 static char ID;
2127
2128 LoopVectorizePass Impl;
2129
2130 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2131 bool VectorizeOnlyWhenForced = false)
2132 : FunctionPass(ID),
2133 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2134 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2135 }
2136
2137 bool runOnFunction(Function &F) override {
2138 if (skipFunction(F))
2139 return false;
2140
2141 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2142 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2143 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2144 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2145 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2146 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2147 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2148 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2149 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2150 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2151 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2152 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2153 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2154
2155 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2156 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2157
2158 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2159 GetLAA, *ORE, PSI).MadeAnyChange;
2160 }
2161
2162 void getAnalysisUsage(AnalysisUsage &AU) const override {
2163 AU.addRequired<AssumptionCacheTracker>();
2164 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2165 AU.addRequired<DominatorTreeWrapperPass>();
2166 AU.addRequired<LoopInfoWrapperPass>();
2167 AU.addRequired<ScalarEvolutionWrapperPass>();
2168 AU.addRequired<TargetTransformInfoWrapperPass>();
2169 AU.addRequired<AAResultsWrapperPass>();
2170 AU.addRequired<LoopAccessLegacyAnalysis>();
2171 AU.addRequired<DemandedBitsWrapperPass>();
2172 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2173 AU.addRequired<InjectTLIMappingsLegacy>();
2174
2175 // We currently do not preserve loopinfo/dominator analyses with outer loop
2176 // vectorization. Until this is addressed, mark these analyses as preserved
2177 // only for non-VPlan-native path.
2178 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2179 if (!EnableVPlanNativePath) {
2180 AU.addPreserved<LoopInfoWrapperPass>();
2181 AU.addPreserved<DominatorTreeWrapperPass>();
2182 }
2183
2184 AU.addPreserved<BasicAAWrapperPass>();
2185 AU.addPreserved<GlobalsAAWrapperPass>();
2186 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2187 }
2188};
2189
2190} // end anonymous namespace
2191
2192//===----------------------------------------------------------------------===//
2193// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2194// LoopVectorizationCostModel and LoopVectorizationPlanner.
2195//===----------------------------------------------------------------------===//
2196
2197Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2198 // We need to place the broadcast of invariant variables outside the loop,
2199 // but only if it's proven safe to do so. Else, broadcast will be inside
2200 // vector loop body.
2201 Instruction *Instr = dyn_cast<Instruction>(V);
2202 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2203 (!Instr ||
2204 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2205 // Place the code for broadcasting invariant variables in the new preheader.
2206 IRBuilder<>::InsertPointGuard Guard(Builder);
2207 if (SafeToHoist)
2208 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2209
2210 // Broadcast the scalar into all locations in the vector.
2211 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2212
2213 return Shuf;
2214}
2215
2216void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2217 const InductionDescriptor &II, Value *Step, Value *Start,
2218 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2219 VPTransformState &State) {
2220 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2221, __PRETTY_FUNCTION__))
2221 "Expected either an induction phi-node or a truncate of it!")(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2221, __PRETTY_FUNCTION__))
;
2222
2223 // Construct the initial value of the vector IV in the vector loop preheader
2224 auto CurrIP = Builder.saveIP();
2225 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2226 if (isa<TruncInst>(EntryVal)) {
2227 assert(Start->getType()->isIntegerTy() &&((Start->getType()->isIntegerTy() && "Truncation requires an integer type"
) ? static_cast<void> (0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2228, __PRETTY_FUNCTION__))
2228 "Truncation requires an integer type")((Start->getType()->isIntegerTy() && "Truncation requires an integer type"
) ? static_cast<void> (0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2228, __PRETTY_FUNCTION__))
;
2229 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2230 Step = Builder.CreateTrunc(Step, TruncType);
2231 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2232 }
2233 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2234 Value *SteppedStart =
2235 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2236
2237 // We create vector phi nodes for both integer and floating-point induction
2238 // variables. Here, we determine the kind of arithmetic we will perform.
2239 Instruction::BinaryOps AddOp;
2240 Instruction::BinaryOps MulOp;
2241 if (Step->getType()->isIntegerTy()) {
2242 AddOp = Instruction::Add;
2243 MulOp = Instruction::Mul;
2244 } else {
2245 AddOp = II.getInductionOpcode();
2246 MulOp = Instruction::FMul;
2247 }
2248
2249 // Multiply the vectorization factor by the step using integer or
2250 // floating-point arithmetic as appropriate.
2251 Type *StepType = Step->getType();
2252 if (Step->getType()->isFloatingPointTy())
2253 StepType = IntegerType::get(StepType->getContext(),
2254 StepType->getScalarSizeInBits());
2255 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2256 if (Step->getType()->isFloatingPointTy())
2257 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2258 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2259
2260 // Create a vector splat to use in the induction update.
2261 //
2262 // FIXME: If the step is non-constant, we create the vector splat with
2263 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2264 // handle a constant vector splat.
2265 Value *SplatVF = isa<Constant>(Mul)
2266 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2267 : Builder.CreateVectorSplat(VF, Mul);
2268 Builder.restoreIP(CurrIP);
2269
2270 // We may need to add the step a number of times, depending on the unroll
2271 // factor. The last of those goes into the PHI.
2272 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2273 &*LoopVectorBody->getFirstInsertionPt());
2274 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2275 Instruction *LastInduction = VecInd;
2276 for (unsigned Part = 0; Part < UF; ++Part) {
2277 State.set(Def, LastInduction, Part);
2278
2279 if (isa<TruncInst>(EntryVal))
2280 addMetadata(LastInduction, EntryVal);
2281 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2282 State, Part);
2283
2284 LastInduction = cast<Instruction>(
2285 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2286 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2287 }
2288
2289 // Move the last step to the end of the latch block. This ensures consistent
2290 // placement of all induction updates.
2291 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2292 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2293 auto *ICmp = cast<Instruction>(Br->getCondition());
2294 LastInduction->moveBefore(ICmp);
2295 LastInduction->setName("vec.ind.next");
2296
2297 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2298 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2299}
2300
2301bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2302 return Cost->isScalarAfterVectorization(I, VF) ||
2303 Cost->isProfitableToScalarize(I, VF);
2304}
2305
2306bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2307 if (shouldScalarizeInstruction(IV))
2308 return true;
2309 auto isScalarInst = [&](User *U) -> bool {
2310 auto *I = cast<Instruction>(U);
2311 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2312 };
2313 return llvm::any_of(IV->users(), isScalarInst);
2314}
2315
2316void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2317 const InductionDescriptor &ID, const Instruction *EntryVal,
2318 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2319 unsigned Part, unsigned Lane) {
2320 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2321, __PRETTY_FUNCTION__))
2321 "Expected either an induction phi-node or a truncate of it!")(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2321, __PRETTY_FUNCTION__))
;
2322
2323 // This induction variable is not the phi from the original loop but the
2324 // newly-created IV based on the proof that casted Phi is equal to the
2325 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
2326 // re-uses the same InductionDescriptor that original IV uses but we don't
2327 // have to do any recording in this case - that is done when original IV is
2328 // processed.
2329 if (isa<TruncInst>(EntryVal))
2330 return;
2331
2332 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2333 if (Casts.empty())
2334 return;
2335 // Only the first Cast instruction in the Casts vector is of interest.
2336 // The rest of the Casts (if exist) have no uses outside the
2337 // induction update chain itself.
2338 if (Lane < UINT_MAX(2147483647 *2U +1U))
2339 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2340 else
2341 State.set(CastDef, VectorLoopVal, Part);
2342}
2343
2344void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2345 TruncInst *Trunc, VPValue *Def,
2346 VPValue *CastDef,
2347 VPTransformState &State) {
2348 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&(((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type") ? static_cast
<void> (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2349, __PRETTY_FUNCTION__))
2349 "Primary induction variable must have an integer type")(((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type") ? static_cast
<void> (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2349, __PRETTY_FUNCTION__))
;
2350
2351 auto II = Legal->getInductionVars().find(IV);
2352 assert(II != Legal->getInductionVars().end() && "IV is not an induction")((II != Legal->getInductionVars().end() && "IV is not an induction"
) ? static_cast<void> (0) : __assert_fail ("II != Legal->getInductionVars().end() && \"IV is not an induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2352, __PRETTY_FUNCTION__))
;
2353
2354 auto ID = II->second;
2355 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match")((IV->getType() == ID.getStartValue()->getType() &&
"Types must match") ? static_cast<void> (0) : __assert_fail
("IV->getType() == ID.getStartValue()->getType() && \"Types must match\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2355, __PRETTY_FUNCTION__))
;
2356
2357 // The value from the original loop to which we are mapping the new induction
2358 // variable.
2359 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2360
2361 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2362
2363 // Generate code for the induction step. Note that induction steps are
2364 // required to be loop-invariant
2365 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2366 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&((PSE.getSE()->isLoopInvariant(Step, OrigLoop) && "Induction step should be loop invariant"
) ? static_cast<void> (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(Step, OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2367, __PRETTY_FUNCTION__))
2367 "Induction step should be loop invariant")((PSE.getSE()->isLoopInvariant(Step, OrigLoop) && "Induction step should be loop invariant"
) ? static_cast<void> (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(Step, OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2367, __PRETTY_FUNCTION__))
;
2368 if (PSE.getSE()->isSCEVable(IV->getType())) {
2369 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2370 return Exp.expandCodeFor(Step, Step->getType(),
2371 LoopVectorPreHeader->getTerminator());
2372 }
2373 return cast<SCEVUnknown>(Step)->getValue();
2374 };
2375
2376 // The scalar value to broadcast. This is derived from the canonical
2377 // induction variable. If a truncation type is given, truncate the canonical
2378 // induction variable and step. Otherwise, derive these values from the
2379 // induction descriptor.
2380 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2381 Value *ScalarIV = Induction;
2382 if (IV != OldInduction) {
2383 ScalarIV = IV->getType()->isIntegerTy()
2384 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2385 : Builder.CreateCast(Instruction::SIToFP, Induction,
2386 IV->getType());
2387 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2388 ScalarIV->setName("offset.idx");
2389 }
2390 if (Trunc) {
2391 auto *TruncType = cast<IntegerType>(Trunc->getType());
2392 assert(Step->getType()->isIntegerTy() &&((Step->getType()->isIntegerTy() && "Truncation requires an integer step"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2393, __PRETTY_FUNCTION__))
2393 "Truncation requires an integer step")((Step->getType()->isIntegerTy() && "Truncation requires an integer step"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2393, __PRETTY_FUNCTION__))
;
2394 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2395 Step = Builder.CreateTrunc(Step, TruncType);
2396 }
2397 return ScalarIV;
2398 };
2399
2400 // Create the vector values from the scalar IV, in the absence of creating a
2401 // vector IV.
2402 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2403 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2404 for (unsigned Part = 0; Part < UF; ++Part) {
2405 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2405, __PRETTY_FUNCTION__))
;
2406 Value *EntryPart =
2407 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2408 ID.getInductionOpcode());
2409 State.set(Def, EntryPart, Part);
2410 if (Trunc)
2411 addMetadata(EntryPart, Trunc);
2412 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2413 State, Part);
2414 }
2415 };
2416
2417 // Fast-math-flags propagate from the original induction instruction.
2418 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2419 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2420 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2421
2422 // Now do the actual transformations, and start with creating the step value.
2423 Value *Step = CreateStepValue(ID.getStep());
2424 if (VF.isZero() || VF.isScalar()) {
2425 Value *ScalarIV = CreateScalarIV(Step);
2426 CreateSplatIV(ScalarIV, Step);
2427 return;
2428 }
2429
2430 // Determine if we want a scalar version of the induction variable. This is
2431 // true if the induction variable itself is not widened, or if it has at
2432 // least one user in the loop that is not widened.
2433 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2434 if (!NeedsScalarIV) {
2435 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2436 State);
2437 return;
2438 }
2439
2440 // Try to create a new independent vector induction variable. If we can't
2441 // create the phi node, we will splat the scalar induction variable in each
2442 // loop iteration.
2443 if (!shouldScalarizeInstruction(EntryVal)) {
2444 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2445 State);
2446 Value *ScalarIV = CreateScalarIV(Step);
2447 // Create scalar steps that can be used by instructions we will later
2448 // scalarize. Note that the addition of the scalar steps will not increase
2449 // the number of instructions in the loop in the common case prior to
2450 // InstCombine. We will be trading one vector extract for each scalar step.
2451 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2452 return;
2453 }
2454
2455 // All IV users are scalar instructions, so only emit a scalar IV, not a
2456 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2457 // predicate used by the masked loads/stores.
2458 Value *ScalarIV = CreateScalarIV(Step);
2459 if (!Cost->isScalarEpilogueAllowed())
2460 CreateSplatIV(ScalarIV, Step);
2461 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2462}
2463
2464Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2465 Instruction::BinaryOps BinOp) {
2466 // Create and check the types.
2467 auto *ValVTy = cast<VectorType>(Val->getType());
2468 ElementCount VLen = ValVTy->getElementCount();
2469
2470 Type *STy = Val->getType()->getScalarType();
2471 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&(((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
"Induction Step must be an integer or FP") ? static_cast<
void> (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2472, __PRETTY_FUNCTION__))
2472 "Induction Step must be an integer or FP")(((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
"Induction Step must be an integer or FP") ? static_cast<
void> (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2472, __PRETTY_FUNCTION__))
;
2473 assert(Step->getType() == STy && "Step has wrong type")((Step->getType() == STy && "Step has wrong type")
? static_cast<void> (0) : __assert_fail ("Step->getType() == STy && \"Step has wrong type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2473, __PRETTY_FUNCTION__))
;
2474
2475 SmallVector<Constant *, 8> Indices;
2476
2477 // Create a vector of consecutive numbers from zero to VF.
2478 VectorType *InitVecValVTy = ValVTy;
2479 Type *InitVecValSTy = STy;
2480 if (STy->isFloatingPointTy()) {
2481 InitVecValSTy =
2482 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2483 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2484 }
2485 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2486
2487 // Add on StartIdx
2488 Value *StartIdxSplat = Builder.CreateVectorSplat(
2489 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2490 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2491
2492 if (STy->isIntegerTy()) {
2493 Step = Builder.CreateVectorSplat(VLen, Step);
2494 assert(Step->getType() == Val->getType() && "Invalid step vec")((Step->getType() == Val->getType() && "Invalid step vec"
) ? static_cast<void> (0) : __assert_fail ("Step->getType() == Val->getType() && \"Invalid step vec\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2494, __PRETTY_FUNCTION__))
;
2495 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2496 // which can be found from the original scalar operations.
2497 Step = Builder.CreateMul(InitVec, Step);
2498 return Builder.CreateAdd(Val, Step, "induction");
2499 }
2500
2501 // Floating point induction.
2502 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&(((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction") ? static_cast
<void> (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2503, __PRETTY_FUNCTION__))
2503 "Binary Opcode should be specified for FP induction")(((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction") ? static_cast
<void> (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2503, __PRETTY_FUNCTION__))
;
2504 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2505 Step = Builder.CreateVectorSplat(VLen, Step);
2506 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2507 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2508}
2509
2510void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2511 Instruction *EntryVal,
2512 const InductionDescriptor &ID,
2513 VPValue *Def, VPValue *CastDef,
2514 VPTransformState &State) {
2515 // We shouldn't have to build scalar steps if we aren't vectorizing.
2516 assert(VF.isVector() && "VF should be greater than one")((VF.isVector() && "VF should be greater than one") ?
static_cast<void> (0) : __assert_fail ("VF.isVector() && \"VF should be greater than one\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2516, __PRETTY_FUNCTION__))
;
2517 // Get the value type and ensure it and the step have the same integer type.
2518 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2519 assert(ScalarIVTy == Step->getType() &&((ScalarIVTy == Step->getType() && "Val and Step should have the same type"
) ? static_cast<void> (0) : __assert_fail ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2520, __PRETTY_FUNCTION__))
2520 "Val and Step should have the same type")((ScalarIVTy == Step->getType() && "Val and Step should have the same type"
) ? static_cast<void> (0) : __assert_fail ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2520, __PRETTY_FUNCTION__))
;
2521
2522 // We build scalar steps for both integer and floating-point induction
2523 // variables. Here, we determine the kind of arithmetic we will perform.
2524 Instruction::BinaryOps AddOp;
2525 Instruction::BinaryOps MulOp;
2526 if (ScalarIVTy->isIntegerTy()) {
2527 AddOp = Instruction::Add;
2528 MulOp = Instruction::Mul;
2529 } else {
2530 AddOp = ID.getInductionOpcode();
2531 MulOp = Instruction::FMul;
2532 }
2533
2534 // Determine the number of scalars we need to generate for each unroll
2535 // iteration. If EntryVal is uniform, we only need to generate the first
2536 // lane. Otherwise, we generate all VF values.
2537 bool IsUniform =
2538 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2539 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2540 // Compute the scalar steps and save the results in State.
2541 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2542 ScalarIVTy->getScalarSizeInBits());
2543 Type *VecIVTy = nullptr;
2544 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2545 if (!IsUniform && VF.isScalable()) {
2546 VecIVTy = VectorType::get(ScalarIVTy, VF);
2547 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2548 SplatStep = Builder.CreateVectorSplat(VF, Step);
2549 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2550 }
2551
2552 for (unsigned Part = 0; Part < UF; ++Part) {
2553 Value *StartIdx0 =
2554 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2555
2556 if (!IsUniform && VF.isScalable()) {
2557 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2558 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2559 if (ScalarIVTy->isFloatingPointTy())
2560 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2561 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2562 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2563 State.set(Def, Add, Part);
2564 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2565 Part);
2566 // It's useful to record the lane values too for the known minimum number
2567 // of elements so we do those below. This improves the code quality when
2568 // trying to extract the first element, for example.
2569 }
2570
2571 if (ScalarIVTy->isFloatingPointTy())
2572 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2573
2574 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2575 Value *StartIdx = Builder.CreateBinOp(
2576 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2577 // The step returned by `createStepForVF` is a runtime-evaluated value
2578 // when VF is scalable. Otherwise, it should be folded into a Constant.
2579 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&(((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? static_cast<void> (0) : __assert_fail ("(VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2581, __PRETTY_FUNCTION__))
2580 "Expected StartIdx to be folded to a constant when VF is not "(((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? static_cast<void> (0) : __assert_fail ("(VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2581, __PRETTY_FUNCTION__))
2581 "scalable")(((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable") ? static_cast<void> (0) : __assert_fail ("(VF.isScalable() || isa<Constant>(StartIdx)) && \"Expected StartIdx to be folded to a constant when VF is not \" \"scalable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2581, __PRETTY_FUNCTION__))
;
2582 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2583 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2584 State.set(Def, Add, VPIteration(Part, Lane));
2585 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2586 Part, Lane);
2587 }
2588 }
2589}
2590
2591void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2592 const VPIteration &Instance,
2593 VPTransformState &State) {
2594 Value *ScalarInst = State.get(Def, Instance);
2595 Value *VectorValue = State.get(Def, Instance.Part);
2596 VectorValue = Builder.CreateInsertElement(
2597 VectorValue, ScalarInst,
2598 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2599 State.set(Def, VectorValue, Instance.Part);
2600}
2601
2602Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2603 assert(Vec->getType()->isVectorTy() && "Invalid type")((Vec->getType()->isVectorTy() && "Invalid type"
) ? static_cast<void> (0) : __assert_fail ("Vec->getType()->isVectorTy() && \"Invalid type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2603, __PRETTY_FUNCTION__))
;
2604 return Builder.CreateVectorReverse(Vec, "reverse");
2605}
2606
2607// Return whether we allow using masked interleave-groups (for dealing with
2608// strided loads/stores that reside in predicated blocks, or for dealing
2609// with gaps).
2610static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2611 // If an override option has been passed in for interleaved accesses, use it.
2612 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2613 return EnableMaskedInterleavedMemAccesses;
2614
2615 return TTI.enableMaskedInterleavedAccessVectorization();
2616}
2617
2618// Try to vectorize the interleave group that \p Instr belongs to.
2619//
2620// E.g. Translate following interleaved load group (factor = 3):
2621// for (i = 0; i < N; i+=3) {
2622// R = Pic[i]; // Member of index 0
2623// G = Pic[i+1]; // Member of index 1
2624// B = Pic[i+2]; // Member of index 2
2625// ... // do something to R, G, B
2626// }
2627// To:
2628// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2629// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2630// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2631// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2632//
2633// Or translate following interleaved store group (factor = 3):
2634// for (i = 0; i < N; i+=3) {
2635// ... do something to R, G, B
2636// Pic[i] = R; // Member of index 0
2637// Pic[i+1] = G; // Member of index 1
2638// Pic[i+2] = B; // Member of index 2
2639// }
2640// To:
2641// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2642// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2643// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2644// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2645// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2646void InnerLoopVectorizer::vectorizeInterleaveGroup(
2647 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2648 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2649 VPValue *BlockInMask) {
2650 Instruction *Instr = Group->getInsertPos();
2651 const DataLayout &DL = Instr->getModule()->getDataLayout();
2652
2653 // Prepare for the vector type of the interleaved load/store.
2654 Type *ScalarTy = getMemInstValueType(Instr);
2655 unsigned InterleaveFactor = Group->getFactor();
2656 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2656, __PRETTY_FUNCTION__))
;
2657 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2658
2659 // Prepare for the new pointers.
2660 SmallVector<Value *, 2> AddrParts;
2661 unsigned Index = Group->getIndex(Instr);
2662
2663 // TODO: extend the masked interleaved-group support to reversed access.
2664 assert((!BlockInMask || !Group->isReverse()) &&(((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."
) ? static_cast<void> (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2665, __PRETTY_FUNCTION__))
2665 "Reversed masked interleave-group not supported.")(((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."
) ? static_cast<void> (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2665, __PRETTY_FUNCTION__))
;
2666
2667 // If the group is reverse, adjust the index to refer to the last vector lane
2668 // instead of the first. We adjust the index from the first vector lane,
2669 // rather than directly getting the pointer for lane VF - 1, because the
2670 // pointer operand of the interleaved access is supposed to be uniform. For
2671 // uniform instructions, we're only required to generate a value for the
2672 // first vector lane in each unroll iteration.
2673 assert(!VF.isScalable() &&((!VF.isScalable() && "scalable vector reverse operation is not implemented"
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vector reverse operation is not implemented\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2674, __PRETTY_FUNCTION__))
2674 "scalable vector reverse operation is not implemented")((!VF.isScalable() && "scalable vector reverse operation is not implemented"
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vector reverse operation is not implemented\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2674, __PRETTY_FUNCTION__))
;
2675 if (Group->isReverse())
2676 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2677
2678 for (unsigned Part = 0; Part < UF; Part++) {
2679 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2680 setDebugLocFromInst(Builder, AddrPart);
2681
2682 // Notice current instruction could be any index. Need to adjust the address
2683 // to the member of index 0.
2684 //
2685 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2686 // b = A[i]; // Member of index 0
2687 // Current pointer is pointed to A[i+1], adjust it to A[i].
2688 //
2689 // E.g. A[i+1] = a; // Member of index 1
2690 // A[i] = b; // Member of index 0
2691 // A[i+2] = c; // Member of index 2 (Current instruction)
2692 // Current pointer is pointed to A[i+2], adjust it to A[i].
2693
2694 bool InBounds = false;
2695 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2696 InBounds = gep->isInBounds();
2697 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2698 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2699
2700 // Cast to the vector pointer type.
2701 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2702 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2703 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2704 }
2705
2706 setDebugLocFromInst(Builder, Instr);
2707 Value *PoisonVec = PoisonValue::get(VecTy);
2708
2709 Value *MaskForGaps = nullptr;
2710 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2711 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2711, __PRETTY_FUNCTION__))
;
2712 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2713 assert(MaskForGaps && "Mask for Gaps is required but it is null")((MaskForGaps && "Mask for Gaps is required but it is null"
) ? static_cast<void> (0) : __assert_fail ("MaskForGaps && \"Mask for Gaps is required but it is null\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2713, __PRETTY_FUNCTION__))
;
2714 }
2715
2716 // Vectorize the interleaved load group.
2717 if (isa<LoadInst>(Instr)) {
2718 // For each unroll part, create a wide load for the group.
2719 SmallVector<Value *, 2> NewLoads;
2720 for (unsigned Part = 0; Part < UF; Part++) {
2721 Instruction *NewLoad;
2722 if (BlockInMask || MaskForGaps) {
2723 assert(useMaskedInterleavedAccesses(*TTI) &&((useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2724, __PRETTY_FUNCTION__))
2724 "masked interleaved groups are not allowed.")((useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2724, __PRETTY_FUNCTION__))
;
2725 Value *GroupMask = MaskForGaps;
2726 if (BlockInMask) {
2727 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2728 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2728, __PRETTY_FUNCTION__))
;
2729 Value *ShuffledMask = Builder.CreateShuffleVector(
2730 BlockInMaskPart,
2731 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2732 "interleaved.mask");
2733 GroupMask = MaskForGaps
2734 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2735 MaskForGaps)
2736 : ShuffledMask;
2737 }
2738 NewLoad =
2739 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2740 GroupMask, PoisonVec, "wide.masked.vec");
2741 }
2742 else
2743 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2744 Group->getAlign(), "wide.vec");
2745 Group->addMetadata(NewLoad);
2746 NewLoads.push_back(NewLoad);
2747 }
2748
2749 // For each member in the group, shuffle out the appropriate data from the
2750 // wide loads.
2751 unsigned J = 0;
2752 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2753 Instruction *Member = Group->getMember(I);
2754
2755 // Skip the gaps in the group.
2756 if (!Member)
2757 continue;
2758
2759 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2759, __PRETTY_FUNCTION__))
;
2760 auto StrideMask =
2761 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2762 for (unsigned Part = 0; Part < UF; Part++) {
2763 Value *StridedVec = Builder.CreateShuffleVector(
2764 NewLoads[Part], StrideMask, "strided.vec");
2765
2766 // If this member has different type, cast the result type.
2767 if (Member->getType() != ScalarTy) {
2768 assert(!VF.isScalable() && "VF is assumed to be non scalable.")((!VF.isScalable() && "VF is assumed to be non scalable."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"VF is assumed to be non scalable.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2768, __PRETTY_FUNCTION__))
;
2769 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2770 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2771 }
2772
2773 if (Group->isReverse())
2774 StridedVec = reverseVector(StridedVec);
2775
2776 State.set(VPDefs[J], StridedVec, Part);
2777 }
2778 ++J;
2779 }
2780 return;
2781 }
2782
2783 // The sub vector type for current instruction.
2784 assert(!VF.isScalable() && "VF is assumed to be non scalable.")((!VF.isScalable() && "VF is assumed to be non scalable."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"VF is assumed to be non scalable.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2784, __PRETTY_FUNCTION__))
;
2785 auto *SubVT = VectorType::get(ScalarTy, VF);
2786
2787 // Vectorize the interleaved store group.
2788 for (unsigned Part = 0; Part < UF; Part++) {
2789 // Collect the stored vector from each member.
2790 SmallVector<Value *, 4> StoredVecs;
2791 for (unsigned i = 0; i < InterleaveFactor; i++) {
2792 // Interleaved store group doesn't allow a gap, so each index has a member
2793 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group")((Group->getMember(i) && "Fail to get a member from an interleaved store group"
) ? static_cast<void> (0) : __assert_fail ("Group->getMember(i) && \"Fail to get a member from an interleaved store group\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2793, __PRETTY_FUNCTION__))
;
2794
2795 Value *StoredVec = State.get(StoredValues[i], Part);
2796
2797 if (Group->isReverse())
2798 StoredVec = reverseVector(StoredVec);
2799
2800 // If this member has different type, cast it to a unified type.
2801
2802 if (StoredVec->getType() != SubVT)
2803 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2804
2805 StoredVecs.push_back(StoredVec);
2806 }
2807
2808 // Concatenate all vectors into a wide vector.
2809 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2810
2811 // Interleave the elements in the wide vector.
2812 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2812, __PRETTY_FUNCTION__))
;
2813 Value *IVec = Builder.CreateShuffleVector(
2814 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2815 "interleaved.vec");
2816
2817 Instruction *NewStoreInstr;
2818 if (BlockInMask) {
2819 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2820 Value *ShuffledMask = Builder.CreateShuffleVector(
2821 BlockInMaskPart,
2822 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2823 "interleaved.mask");
2824 NewStoreInstr = Builder.CreateMaskedStore(
2825 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2826 }
2827 else
2828 NewStoreInstr =
2829 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2830
2831 Group->addMetadata(NewStoreInstr);
2832 }
2833}
2834
2835void InnerLoopVectorizer::vectorizeMemoryInstruction(
2836 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2837 VPValue *StoredValue, VPValue *BlockInMask) {
2838 // Attempt to issue a wide load.
2839 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2840 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2841
2842 assert((LI || SI) && "Invalid Load/Store instruction")(((LI || SI) && "Invalid Load/Store instruction") ? static_cast
<void> (0) : __assert_fail ("(LI || SI) && \"Invalid Load/Store instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2842, __PRETTY_FUNCTION__))
;
2843 assert((!SI || StoredValue) && "No stored value provided for widened store")(((!SI || StoredValue) && "No stored value provided for widened store"
) ? static_cast<void> (0) : __assert_fail ("(!SI || StoredValue) && \"No stored value provided for widened store\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2843, __PRETTY_FUNCTION__))
;
2844 assert((!LI || !StoredValue) && "Stored value provided for widened load")(((!LI || !StoredValue) && "Stored value provided for widened load"
) ? static_cast<void> (0) : __assert_fail ("(!LI || !StoredValue) && \"Stored value provided for widened load\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2844, __PRETTY_FUNCTION__))
;
2845
2846 LoopVectorizationCostModel::InstWidening Decision =
2847 Cost->getWideningDecision(Instr, VF);
2848 assert((Decision == LoopVectorizationCostModel::CM_Widen ||(((Decision == LoopVectorizationCostModel::CM_Widen || Decision
== LoopVectorizationCostModel::CM_Widen_Reverse || Decision ==
LoopVectorizationCostModel::CM_GatherScatter) && "CM decision is not to widen the memory instruction"
) ? static_cast<void> (0) : __assert_fail ("(Decision == LoopVectorizationCostModel::CM_Widen || Decision == LoopVectorizationCostModel::CM_Widen_Reverse || Decision == LoopVectorizationCostModel::CM_GatherScatter) && \"CM decision is not to widen the memory instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__))
2849 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||(((Decision == LoopVectorizationCostModel::CM_Widen || Decision
== LoopVectorizationCostModel::CM_Widen_Reverse || Decision ==
LoopVectorizationCostModel::CM_GatherScatter) && "CM decision is not to widen the memory instruction"
) ? static_cast<void> (0) : __assert_fail ("(Decision == LoopVectorizationCostModel::CM_Widen || Decision == LoopVectorizationCostModel::CM_Widen_Reverse || Decision == LoopVectorizationCostModel::CM_GatherScatter) && \"CM decision is not to widen the memory instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__))
2850 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&(((Decision == LoopVectorizationCostModel::CM_Widen || Decision
== LoopVectorizationCostModel::CM_Widen_Reverse || Decision ==
LoopVectorizationCostModel::CM_GatherScatter) && "CM decision is not to widen the memory instruction"
) ? static_cast<void> (0) : __assert_fail ("(Decision == LoopVectorizationCostModel::CM_Widen || Decision == LoopVectorizationCostModel::CM_Widen_Reverse || Decision == LoopVectorizationCostModel::CM_GatherScatter) && \"CM decision is not to widen the memory instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__))
2851 "CM decision is not to widen the memory instruction")(((Decision == LoopVectorizationCostModel::CM_Widen || Decision
== LoopVectorizationCostModel::CM_Widen_Reverse || Decision ==
LoopVectorizationCostModel::CM_GatherScatter) && "CM decision is not to widen the memory instruction"
) ? static_cast<void> (0) : __assert_fail ("(Decision == LoopVectorizationCostModel::CM_Widen || Decision == LoopVectorizationCostModel::CM_Widen_Reverse || Decision == LoopVectorizationCostModel::CM_GatherScatter) && \"CM decision is not to widen the memory instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__))
;
2852
2853 Type *ScalarDataTy = getMemInstValueType(Instr);
2854
2855 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2856 const Align Alignment = getLoadStoreAlignment(Instr);
2857
2858 // Determine if the pointer operand of the access is either consecutive or
2859 // reverse consecutive.
2860 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2861 bool ConsecutiveStride =
2862 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2863 bool CreateGatherScatter =
2864 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2865
2866 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2867 // gather/scatter. Otherwise Decision should have been to Scalarize.
2868 assert((ConsecutiveStride || CreateGatherScatter) &&(((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"
) ? static_cast<void> (0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2869, __PRETTY_FUNCTION__))
2869 "The instruction should be scalarized")(((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"
) ? static_cast<void> (0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2869, __PRETTY_FUNCTION__))
;
2870 (void)ConsecutiveStride;
2871
2872 VectorParts BlockInMaskParts(UF);
2873 bool isMaskRequired = BlockInMask;
2874 if (isMaskRequired)
2875 for (unsigned Part = 0; Part < UF; ++Part)
2876 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2877
2878 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2879 // Calculate the pointer for the specific unroll-part.
2880 GetElementPtrInst *PartPtr = nullptr;
2881
2882 bool InBounds = false;
2883 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2884 InBounds = gep->isInBounds();
2885 if (Reverse) {
2886 // If the address is consecutive but reversed, then the
2887 // wide store needs to start at the last vector element.
2888 // RunTimeVF = VScale * VF.getKnownMinValue()
2889 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
2890 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2891 // NumElt = -Part * RunTimeVF
2892 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2893 // LastLane = 1 - RunTimeVF
2894 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2895 PartPtr =
2896 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2897 PartPtr->setIsInBounds(InBounds);
2898 PartPtr = cast<GetElementPtrInst>(
2899 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2900 PartPtr->setIsInBounds(InBounds);
2901 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2902 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2903 } else {
2904 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2905 PartPtr = cast<GetElementPtrInst>(
2906 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2907 PartPtr->setIsInBounds(InBounds);
2908 }
2909
2910 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2911 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2912 };
2913
2914 // Handle Stores:
2915 if (SI) {
2916 setDebugLocFromInst(Builder, SI);
2917
2918 for (unsigned Part = 0; Part < UF; ++Part) {
2919 Instruction *NewSI = nullptr;
2920 Value *StoredVal = State.get(StoredValue, Part);
2921 if (CreateGatherScatter) {
2922 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2923 Value *VectorGep = State.get(Addr, Part);
2924 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2925 MaskPart);
2926 } else {
2927 if (Reverse) {
2928 // If we store to reverse consecutive memory locations, then we need
2929 // to reverse the order of elements in the stored value.
2930 StoredVal = reverseVector(StoredVal);
2931 // We don't want to update the value in the map as it might be used in
2932 // another expression. So don't call resetVectorValue(StoredVal).
2933 }
2934 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2935 if (isMaskRequired)
2936 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2937 BlockInMaskParts[Part]);
2938 else
2939 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2940 }
2941 addMetadata(NewSI, SI);
2942 }
2943 return;
2944 }
2945
2946 // Handle loads.
2947 assert(LI && "Must have a load instruction")((LI && "Must have a load instruction") ? static_cast
<void> (0) : __assert_fail ("LI && \"Must have a load instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2947, __PRETTY_FUNCTION__))
;
2948 setDebugLocFromInst(Builder, LI);
2949 for (unsigned Part = 0; Part < UF; ++Part) {
2950 Value *NewLI;
2951 if (CreateGatherScatter) {
2952 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2953 Value *VectorGep = State.get(Addr, Part);
2954 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2955 nullptr, "wide.masked.gather");
2956 addMetadata(NewLI, LI);
2957 } else {
2958 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2959 if (isMaskRequired)
2960 NewLI = Builder.CreateMaskedLoad(
2961 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2962 "wide.masked.load");
2963 else
2964 NewLI =
2965 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2966
2967 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2968 addMetadata(NewLI, LI);
2969 if (Reverse)
2970 NewLI = reverseVector(NewLI);
2971 }
2972
2973 State.set(Def, NewLI, Part);
2974 }
2975}
2976
2977void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2978 VPUser &User,
2979 const VPIteration &Instance,
2980 bool IfPredicateInstr,
2981 VPTransformState &State) {
2982 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors"
) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2982, __PRETTY_FUNCTION__))
;
2983
2984 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2985 // the first lane and part.
2986 if (isa<NoAliasScopeDeclInst>(Instr))
2987 if (!Instance.isFirstIteration())
2988 return;
2989
2990 setDebugLocFromInst(Builder, Instr);
2991
2992 // Does this instruction return a value ?
2993 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2994
2995 Instruction *Cloned = Instr->clone();
2996 if (!IsVoidRetTy)
2997 Cloned->setName(Instr->getName() + ".cloned");
2998
2999 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3000 Builder.GetInsertPoint());
3001 // Replace the operands of the cloned instructions with their scalar
3002 // equivalents in the new loop.
3003 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3004 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3005 auto InputInstance = Instance;
3006 if (!Operand || !OrigLoop->contains(Operand) ||
3007 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3008 InputInstance.Lane = VPLane::getFirstLane();
3009 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3010 Cloned->setOperand(op, NewOp);
3011 }
3012 addNewMetadata(Cloned, Instr);
3013
3014 // Place the cloned scalar in the new loop.
3015 Builder.Insert(Cloned);
3016
3017 State.set(Def, Cloned, Instance);
3018
3019 // If we just cloned a new assumption, add it the assumption cache.
3020 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3021 if (II->getIntrinsicID() == Intrinsic::assume)
3022 AC->registerAssumption(II);
3023
3024 // End if-block.
3025 if (IfPredicateInstr)
3026 PredicatedInstructions.push_back(Cloned);
3027}
3028
3029PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3030 Value *End, Value *Step,
3031 Instruction *DL) {
3032 BasicBlock *Header = L->getHeader();
3033 BasicBlock *Latch = L->getLoopLatch();
3034 // As we're just creating this loop, it's possible no latch exists
3035 // yet. If so, use the header as this will be a single block loop.
3036 if (!Latch)
3037 Latch = Header;
3038
3039 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3040 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3041 setDebugLocFromInst(Builder, OldInst);
3042 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3043
3044 Builder.SetInsertPoint(Latch->getTerminator());
3045 setDebugLocFromInst(Builder, OldInst);
3046
3047 // Create i+1 and fill the PHINode.
3048 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3049 Induction->addIncoming(Start, L->getLoopPreheader());
3050 Induction->addIncoming(Next, Latch);
3051 // Create the compare.
3052 Value *ICmp = Builder.CreateICmpEQ(Next, End);
3053 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3054
3055 // Now we have two terminators. Remove the old one from the block.
3056 Latch->getTerminator()->eraseFromParent();
3057
3058 return Induction;
3059}
3060
3061Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3062 if (TripCount)
3063 return TripCount;
3064
3065 assert(L && "Create Trip Count for null loop.")((L && "Create Trip Count for null loop.") ? static_cast
<void> (0) : __assert_fail ("L && \"Create Trip Count for null loop.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3065, __PRETTY_FUNCTION__))
;
3066 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3067 // Find the loop boundaries.
3068 ScalarEvolution *SE = PSE.getSE();
3069 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3070 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&((!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
"Invalid loop count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3071, __PRETTY_FUNCTION__))
3071 "Invalid loop count")((!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
"Invalid loop count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3071, __PRETTY_FUNCTION__))
;
3072
3073 Type *IdxTy = Legal->getWidestInductionType();
3074 assert(IdxTy && "No type for induction")((IdxTy && "No type for induction") ? static_cast<
void> (0) : __assert_fail ("IdxTy && \"No type for induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3074, __PRETTY_FUNCTION__))
;
3075
3076 // The exit count might have the type of i64 while the phi is i32. This can
3077 // happen if we have an induction variable that is sign extended before the
3078 // compare. The only way that we get a backedge taken count is that the
3079 // induction variable was signed and as such will not overflow. In such a case
3080 // truncation is legal.
3081 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3082 IdxTy->getPrimitiveSizeInBits())
3083 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3084 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3085
3086 // Get the total trip count from the count by adding 1.
3087 const SCEV *ExitCount = SE->getAddExpr(
3088 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3089
3090 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3091
3092 // Expand the trip count and place the new instructions in the preheader.
3093 // Notice that the pre-header does not change, only the loop body.
3094 SCEVExpander Exp(*SE, DL, "induction");
3095
3096 // Count holds the overall loop count (N).
3097 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3098 L->getLoopPreheader()->getTerminator());
3099
3100 if (TripCount->getType()->isPointerTy())
3101 TripCount =
3102 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3103 L->getLoopPreheader()->getTerminator());
3104
3105 return TripCount;
3106}
3107
3108Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3109 if (VectorTripCount)
3110 return VectorTripCount;
3111
3112 Value *TC = getOrCreateTripCount(L);
3113 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3114
3115 Type *Ty = TC->getType();
3116 // This is where we can make the step a runtime constant.
3117 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3118
3119 // If the tail is to be folded by masking, round the number of iterations N
3120 // up to a multiple of Step instead of rounding down. This is done by first
3121 // adding Step-1 and then rounding down. Note that it's ok if this addition
3122 // overflows: the vector induction variable will eventually wrap to zero given
3123 // that it starts at zero and its Step is a power of two; the loop will then
3124 // exit, with the last early-exit vector comparison also producing all-true.
3125 if (Cost->foldTailByMasking()) {
3126 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&((isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF.getKnownMinValue() * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3127, __PRETTY_FUNCTION__))
3127 "VF*UF must be a power of 2 when folding tail by masking")((isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF.getKnownMinValue() * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3127, __PRETTY_FUNCTION__))
;
3128 assert(!VF.isScalable() &&((!VF.isScalable() && "Tail folding not yet supported for scalable vectors"
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"Tail folding not yet supported for scalable vectors\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3129, __PRETTY_FUNCTION__))
3129 "Tail folding not yet supported for scalable vectors")((!VF.isScalable() && "Tail folding not yet supported for scalable vectors"
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"Tail folding not yet supported for scalable vectors\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3129, __PRETTY_FUNCTION__))
;
3130 TC = Builder.CreateAdd(
3131 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3132 }
3133
3134 // Now we need to generate the expression for the part of the loop that the
3135 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3136 // iterations are not required for correctness, or N - Step, otherwise. Step
3137 // is equal to the vectorization factor (number of SIMD elements) times the
3138 // unroll factor (number of SIMD instructions).
3139 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3140
3141 // There are two cases where we need to ensure (at least) the last iteration
3142 // runs in the scalar remainder loop. Thus, if the step evenly divides
3143 // the trip count, we set the remainder to be equal to the step. If the step
3144 // does not evenly divide the trip count, no adjustment is necessary since
3145 // there will already be scalar iterations. Note that the minimum iterations
3146 // check ensures that N >= Step. The cases are:
3147 // 1) If there is a non-reversed interleaved group that may speculatively
3148 // access memory out-of-bounds.
3149 // 2) If any instruction may follow a conditionally taken exit. That is, if
3150 // the loop contains multiple exiting blocks, or a single exiting block
3151 // which is not the latch.
3152 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3153 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3154 R = Builder.CreateSelect(IsZero, Step, R);
3155 }
3156
3157 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3158
3159 return VectorTripCount;
3160}
3161
3162Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3163 const DataLayout &DL) {
3164 // Verify that V is a vector type with same number of elements as DstVTy.
3165 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3166 unsigned VF = DstFVTy->getNumElements();
3167 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3168 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match")(((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"
) ? static_cast<void> (0) : __assert_fail ("(VF == SrcVecTy->getNumElements()) && \"Vector dimensions do not match\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3168, __PRETTY_FUNCTION__))
;
3169 Type *SrcElemTy = SrcVecTy->getElementType();
3170 Type *DstElemTy = DstFVTy->getElementType();
3171 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&(((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy
)) && "Vector elements must have same size") ? static_cast
<void> (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3172, __PRETTY_FUNCTION__))
3172 "Vector elements must have same size")(((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy
)) && "Vector elements must have same size") ? static_cast
<void> (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3172, __PRETTY_FUNCTION__))
;
3173
3174 // Do a direct cast if element types are castable.
3175 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3176 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3177 }
3178 // V cannot be directly casted to desired vector type.
3179 // May happen when V is a floating point vector but DstVTy is a vector of
3180 // pointers or vice-versa. Handle this using a two-step bitcast using an
3181 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3182 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&(((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()
) && "Only one type should be a pointer type") ? static_cast
<void> (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3183, __PRETTY_FUNCTION__))
3183 "Only one type should be a pointer type")(((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()
) && "Only one type should be a pointer type") ? static_cast
<void> (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3183, __PRETTY_FUNCTION__))
;
3184 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&(((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy
()) && "Only one type should be a floating point type"
) ? static_cast<void> (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3185, __PRETTY_FUNCTION__))
3185 "Only one type should be a floating point type")(((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy
()) && "Only one type should be a floating point type"
) ? static_cast<void> (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3185, __PRETTY_FUNCTION__))
;
3186 Type *IntTy =
3187 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3188 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3189 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3190 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3191}
3192
3193void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3194 BasicBlock *Bypass) {
3195 Value *Count = getOrCreateTripCount(L);
3196 // Reuse existing vector loop preheader for TC checks.
3197 // Note that new preheader block is generated for vector loop.
3198 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3199 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3200
3201 // Generate code to check if the loop's trip count is less than VF * UF, or
3202 // equal to it in case a scalar epilogue is required; this implies that the
3203 // vector trip count is zero. This check also covers the case where adding one
3204 // to the backedge-taken count overflowed leading to an incorrect trip count
3205 // of zero. In this case we will also jump to the scalar loop.
3206 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3207 : ICmpInst::ICMP_ULT;
3208
3209 // If tail is to be folded, vector loop takes care of all iterations.
3210 Value *CheckMinIters = Builder.getFalse();
3211 if (!Cost->foldTailByMasking()) {
3212 Value *Step =
3213 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3214 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3215 }
3216 // Create new preheader for vector loop.
3217 LoopVectorPreHeader =
3218 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3219 "vector.ph");
3220
3221 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3223, __PRETTY_FUNCTION__))
3222 DT->getNode(Bypass)->getIDom()) &&((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3223, __PRETTY_FUNCTION__))
3223 "TC check is expected to dominate Bypass")((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3223, __PRETTY_FUNCTION__))
;
3224
3225 // Update dominator for Bypass & LoopExit.
3226 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3227 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3228
3229 ReplaceInstWithInst(
3230 TCCheckBlock->getTerminator(),
3231 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3232 LoopBypassBlocks.push_back(TCCheckBlock);
3233}
3234
3235BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3236
3237 BasicBlock *const SCEVCheckBlock =
3238 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3239 if (!SCEVCheckBlock)
3240 return nullptr;
3241
3242 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||((!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile
&& Cost->Hints->getForce() != LoopVectorizeHints
::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3245, __PRETTY_FUNCTION__))
3243 (OptForSizeBasedOnProfile &&((!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile
&& Cost->Hints->getForce() != LoopVectorizeHints
::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3245, __PRETTY_FUNCTION__))
3244 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&((!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile
&& Cost->Hints->getForce() != LoopVectorizeHints
::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3245, __PRETTY_FUNCTION__))
3245 "Cannot SCEV check stride or overflow when optimizing for size")((!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile
&& Cost->Hints->getForce() != LoopVectorizeHints
::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3245, __PRETTY_FUNCTION__))
;
3246
3247
3248 // Update dominator only if this is first RT check.
3249 if (LoopBypassBlocks.empty()) {
3250 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3251 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3252 }
3253
3254 LoopBypassBlocks.push_back(SCEVCheckBlock);
3255 AddedSafetyChecks = true;
3256 return SCEVCheckBlock;
3257}
3258
3259BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3260 BasicBlock *Bypass) {
3261 // VPlan-native path does not do any analysis for runtime checks currently.
3262 if (EnableVPlanNativePath)
3263 return nullptr;
3264
3265 BasicBlock *const MemCheckBlock =
3266 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3267
3268 // Check if we generated code that checks in runtime if arrays overlap. We put
3269 // the checks into a separate block to make the more common case of few
3270 // elements faster.
3271 if (!MemCheckBlock)
3272 return nullptr;
3273
3274 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3275 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
&& "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? static_cast<void> (0) : __assert_fail
("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3277, __PRETTY_FUNCTION__))
3276 "Cannot emit memory checks when optimizing for size, unless forced "((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
&& "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? static_cast<void> (0) : __assert_fail
("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3277, __PRETTY_FUNCTION__))
3277 "to vectorize.")((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
&& "Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.") ? static_cast<void> (0) : __assert_fail
("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3277, __PRETTY_FUNCTION__))
;
3278 ORE->emit([&]() {
3279 return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationCodeSize",
3280 L->getStartLoc(), L->getHeader())
3281 << "Code-size may be reduced by not forcing "
3282 "vectorization, or by source-code modifications "
3283 "eliminating the need for runtime checks "
3284 "(e.g., adding 'restrict').";
3285 });
3286 }
3287
3288 LoopBypassBlocks.push_back(MemCheckBlock);
3289
3290 AddedSafetyChecks = true;
3291
3292 // We currently don't use LoopVersioning for the actual loop cloning but we
3293 // still use it to add the noalias metadata.
3294 LVer = std::make_unique<LoopVersioning>(
3295 *Legal->getLAI(),
3296 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3297 DT, PSE.getSE());
3298 LVer->prepareNoAliasMetadata();
3299 return MemCheckBlock;
3300}
3301
3302Value *InnerLoopVectorizer::emitTransformedIndex(
3303 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3304 const InductionDescriptor &ID) const {
3305
3306 SCEVExpander Exp(*SE, DL, "induction");
3307 auto Step = ID.getStep();
3308 auto StartValue = ID.getStartValue();
3309 assert(Index->getType() == Step->getType() &&((Index->getType() == Step->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == Step->getType() && \"Index type does not match StepValue type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3310, __PRETTY_FUNCTION__))
3310 "Index type does not match StepValue type")((Index->getType() == Step->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == Step->getType() && \"Index type does not match StepValue type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3310, __PRETTY_FUNCTION__))
;
3311
3312 // Note: the IR at this point is broken. We cannot use SE to create any new
3313 // SCEV and then expand it, hoping that SCEV's simplification will give us
3314 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3315 // lead to various SCEV crashes. So all we can do is to use builder and rely
3316 // on InstCombine for future simplifications. Here we handle some trivial
3317 // cases only.
3318 auto CreateAdd = [&B](Value *X, Value *Y) {
3319 assert(X->getType() == Y->getType() && "Types don't match!")((X->getType() == Y->getType() && "Types don't match!"
) ? static_cast<void> (0) : __assert_fail ("X->getType() == Y->getType() && \"Types don't match!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3319, __PRETTY_FUNCTION__))
;
3320 if (auto *CX = dyn_cast<ConstantInt>(X))
3321 if (CX->isZero())
3322 return Y;
3323 if (auto *CY = dyn_cast<ConstantInt>(Y))
3324 if (CY->isZero())
3325 return X;
3326 return B.CreateAdd(X, Y);
3327 };
3328
3329 auto CreateMul = [&B](Value *X, Value *Y) {
3330 assert(X->getType() == Y->getType() && "Types don't match!")((X->getType() == Y->getType() && "Types don't match!"
) ? static_cast<void> (0) : __assert_fail ("X->getType() == Y->getType() && \"Types don't match!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3330, __PRETTY_FUNCTION__))
;
3331 if (auto *CX = dyn_cast<ConstantInt>(X))
3332 if (CX->isOne())
3333 return Y;
3334 if (auto *CY = dyn_cast<ConstantInt>(Y))
3335 if (CY->isOne())
3336 return X;
3337 return B.CreateMul(X, Y);
3338 };
3339
3340 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3341 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3342 // the DomTree is not kept up-to-date for additional blocks generated in the
3343 // vector loop. By using the header as insertion point, we guarantee that the
3344 // expanded instructions dominate all their uses.
3345 auto GetInsertPoint = [this, &B]() {
3346 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3347 if (InsertBB != LoopVectorBody &&
3348 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3349 return LoopVectorBody->getTerminator();
3350 return &*B.GetInsertPoint();
3351 };
3352
3353 switch (ID.getKind()) {
3354 case InductionDescriptor::IK_IntInduction: {
3355 assert(Index->getType() == StartValue->getType() &&((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3356, __PRETTY_FUNCTION__))
3356 "Index type does not match StartValue type")((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3356, __PRETTY_FUNCTION__))
;
3357 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3358 return B.CreateSub(StartValue, Index);
3359 auto *Offset = CreateMul(
3360 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3361 return CreateAdd(StartValue, Offset);
3362 }
3363 case InductionDescriptor::IK_PtrInduction: {
3364 assert(isa<SCEVConstant>(Step) &&((isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(Step) && \"Expected constant step for pointer induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3365, __PRETTY_FUNCTION__))
3365 "Expected constant step for pointer induction")((isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(Step) && \"Expected constant step for pointer induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3365, __PRETTY_FUNCTION__))
;
3366 return B.CreateGEP(
3367 StartValue->getType()->getPointerElementType(), StartValue,
3368 CreateMul(Index,
3369 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3370 }
3371 case InductionDescriptor::IK_FpInduction: {
3372 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value")((Step->getType()->isFloatingPointTy() && "Expected FP Step value"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isFloatingPointTy() && \"Expected FP Step value\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3372, __PRETTY_FUNCTION__))
;
3373 auto InductionBinOp = ID.getInductionBinOp();
3374 assert(InductionBinOp &&((InductionBinOp && (InductionBinOp->getOpcode() ==
Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3377, __PRETTY_FUNCTION__))
3375 (InductionBinOp->getOpcode() == Instruction::FAdd ||((InductionBinOp && (InductionBinOp->getOpcode() ==
Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3377, __PRETTY_FUNCTION__))
3376 InductionBinOp->getOpcode() == Instruction::FSub) &&((InductionBinOp && (InductionBinOp->getOpcode() ==
Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3377, __PRETTY_FUNCTION__))
3377 "Original bin op should be defined for FP induction")((InductionBinOp && (InductionBinOp->getOpcode() ==
Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3377, __PRETTY_FUNCTION__))
;
3378
3379 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3380 Value *MulExp = B.CreateFMul(StepValue, Index);
3381 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3382 "induction");
3383 }
3384 case InductionDescriptor::IK_NoInduction:
3385 return nullptr;
3386 }
3387 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3387)
;
3388}
3389
3390Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3391 LoopScalarBody = OrigLoop->getHeader();
3392 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3393 LoopExitBlock = OrigLoop->getUniqueExitBlock();
3394 assert(LoopExitBlock && "Must have an exit block")((LoopExitBlock && "Must have an exit block") ? static_cast
<void> (0) : __assert_fail ("LoopExitBlock && \"Must have an exit block\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3394, __PRETTY_FUNCTION__))
;
3395 assert(LoopVectorPreHeader && "Invalid loop structure")((LoopVectorPreHeader && "Invalid loop structure") ? static_cast
<void> (0) : __assert_fail ("LoopVectorPreHeader && \"Invalid loop structure\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3395, __PRETTY_FUNCTION__))
;
3396
3397 LoopMiddleBlock =
3398 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3399 LI, nullptr, Twine(Prefix) + "middle.block");
3400 LoopScalarPreHeader =
3401 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3402 nullptr, Twine(Prefix) + "scalar.ph");
3403
3404 // Set up branch from middle block to the exit and scalar preheader blocks.
3405 // completeLoopSkeleton will update the condition to use an iteration check,
3406 // if required to decide whether to execute the remainder.
3407 BranchInst *BrInst =
3408 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3409 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3410 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3411 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3412
3413 // We intentionally don't let SplitBlock to update LoopInfo since
3414 // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3415 // LoopVectorBody is explicitly added to the correct place few lines later.
3416 LoopVectorBody =
3417 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3418 nullptr, nullptr, Twine(Prefix) + "vector.body");
3419
3420 // Update dominator for loop exit.
3421 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3422
3423 // Create and register the new vector loop.
3424 Loop *Lp = LI->AllocateLoop();
3425 Loop *ParentLoop = OrigLoop->getParentLoop();
3426
3427 // Insert the new loop into the loop nest and register the new basic blocks
3428 // before calling any utilities such as SCEV that require valid LoopInfo.
3429 if (ParentLoop) {
3430 ParentLoop->addChildLoop(Lp);
3431 } else {
3432 LI->addTopLevelLoop(Lp);
3433 }
3434 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3435 return Lp;
3436}
3437
3438void InnerLoopVectorizer::createInductionResumeValues(
3439 Loop *L, Value *VectorTripCount,
3440 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3441 assert(VectorTripCount && L && "Expected valid arguments")((VectorTripCount && L && "Expected valid arguments"
) ? static_cast<void> (0) : __assert_fail ("VectorTripCount && L && \"Expected valid arguments\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3441, __PRETTY_FUNCTION__))
;
3442 assert(((AdditionalBypass.first && AdditionalBypass.second) ||((((AdditionalBypass.first && AdditionalBypass.second
) || (!AdditionalBypass.first && !AdditionalBypass.second
)) && "Inconsistent information about additional bypass."
) ? static_cast<void> (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3444, __PRETTY_FUNCTION__))
3443 (!AdditionalBypass.first && !AdditionalBypass.second)) &&((((AdditionalBypass.first && AdditionalBypass.second
) || (!AdditionalBypass.first && !AdditionalBypass.second
)) && "Inconsistent information about additional bypass."
) ? static_cast<void> (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3444, __PRETTY_FUNCTION__))
3444 "Inconsistent information about additional bypass.")((((AdditionalBypass.first && AdditionalBypass.second
) || (!AdditionalBypass.first && !AdditionalBypass.second
)) && "Inconsistent information about additional bypass."
) ? static_cast<void> (0) : __assert_fail ("((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && \"Inconsistent information about additional bypass.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3444, __PRETTY_FUNCTION__))
;
3445 // We are going to resume the execution of the scalar loop.
3446 // Go over all of the induction variables that we found and fix the
3447 // PHIs that are left in the scalar version of the loop.
3448 // The starting values of PHI nodes depend on the counter of the last
3449 // iteration in the vectorized loop.
3450 // If we come from a bypass edge then we need to start from the original
3451 // start value.
3452 for (auto &InductionEntry : Legal->getInductionVars()) {
3453 PHINode *OrigPhi = InductionEntry.first;
3454 InductionDescriptor II = InductionEntry.second;
3455
3456 // Create phi nodes to merge from the backedge-taken check block.
3457 PHINode *BCResumeVal =
3458 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3459 LoopScalarPreHeader->getTerminator());
3460 // Copy original phi DL over to the new one.
3461 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3462 Value *&EndValue = IVEndValues[OrigPhi];
3463 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3464 if (OrigPhi == OldInduction) {
3465 // We know what the end value is.
3466 EndValue = VectorTripCount;
3467 } else {
3468 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3469
3470 // Fast-math-flags propagate from the original induction instruction.
3471 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3472 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3473
3474 Type *StepType = II.getStep()->getType();
3475 Instruction::CastOps CastOp =
3476 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3477 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3478 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3479 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3480 EndValue->setName("ind.end");
3481
3482 // Compute the end value for the additional bypass (if applicable).
3483 if (AdditionalBypass.first) {
3484 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3485 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3486 StepType, true);
3487 CRD =
3488 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3489 EndValueFromAdditionalBypass =
3490 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3491 EndValueFromAdditionalBypass->setName("ind.end");
3492 }
3493 }
3494 // The new PHI merges the original incoming value, in case of a bypass,
3495 // or the value at the end of the vectorized loop.
3496 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3497
3498 // Fix the scalar body counter (PHI node).
3499 // The old induction's phi node in the scalar body needs the truncated
3500 // value.
3501 for (BasicBlock *BB : LoopBypassBlocks)
3502 BCResumeVal->addIncoming(II.getStartValue(), BB);
3503
3504 if (AdditionalBypass.first)
3505 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3506 EndValueFromAdditionalBypass);
3507
3508 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3509 }
3510}
3511
3512BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3513 MDNode *OrigLoopID) {
3514 assert(L && "Expected valid loop.")((L && "Expected valid loop.") ? static_cast<void>
(0) : __assert_fail ("L && \"Expected valid loop.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3514, __PRETTY_FUNCTION__))
;
3515
3516 // The trip counts should be cached by now.
3517 Value *Count = getOrCreateTripCount(L);
3518 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3519
3520 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3521
3522 // Add a check in the middle block to see if we have completed
3523 // all of the iterations in the first vector loop.
3524 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3525 // If tail is to be folded, we know we don't need to run the remainder.
3526 if (!Cost->foldTailByMasking()) {
3527 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3528 Count, VectorTripCount, "cmp.n",
3529 LoopMiddleBlock->getTerminator());
3530
3531 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3532 // of the corresponding compare because they may have ended up with
3533 // different line numbers and we want to avoid awkward line stepping while
3534 // debugging. Eg. if the compare has got a line number inside the loop.
3535 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3536 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3537 }
3538
3539 // Get ready to start creating new instructions into the vectorized body.
3540 assert(LoopVectorPreHeader == L->getLoopPreheader() &&((LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"
) ? static_cast<void> (0) : __assert_fail ("LoopVectorPreHeader == L->getLoopPreheader() && \"Inconsistent vector loop preheader\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3541, __PRETTY_FUNCTION__))
3541 "Inconsistent vector loop preheader")((LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"
) ? static_cast<void> (0) : __assert_fail ("LoopVectorPreHeader == L->getLoopPreheader() && \"Inconsistent vector loop preheader\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3541, __PRETTY_FUNCTION__))
;
3542 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3543
3544 Optional<MDNode *> VectorizedLoopID =
3545 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3546 LLVMLoopVectorizeFollowupVectorized});
3547 if (VectorizedLoopID.hasValue()) {
3548 L->setLoopID(VectorizedLoopID.getValue());
3549
3550 // Do not setAlreadyVectorized if loop attributes have been defined
3551 // explicitly.
3552 return LoopVectorPreHeader;
3553 }
3554
3555 // Keep all loop hints from the original loop on the vector loop (we'll
3556 // replace the vectorizer-specific hints below).
3557 if (MDNode *LID = OrigLoop->getLoopID())
3558 L->setLoopID(LID);
3559
3560 LoopVectorizeHints Hints(L, true, *ORE);
3561 Hints.setAlreadyVectorized();
3562
3563#ifdef EXPENSIVE_CHECKS
3564 assert(DT->verify(DominatorTree::VerificationLevel::Fast))((DT->verify(DominatorTree::VerificationLevel::Fast)) ? static_cast
<void> (0) : __assert_fail ("DT->verify(DominatorTree::VerificationLevel::Fast)"
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3564, __PRETTY_FUNCTION__))
;
3565 LI->verify(*DT);
3566#endif
3567
3568 return LoopVectorPreHeader;
3569}
3570
3571BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3572 /*
3573 In this function we generate a new loop. The new loop will contain
3574 the vectorized instructions while the old loop will continue to run the
3575 scalar remainder.
3576
3577 [ ] <-- loop iteration number check.
3578 / |
3579 / v
3580 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3581 | / |
3582 | / v
3583 || [ ] <-- vector pre header.
3584 |/ |
3585 | v
3586 | [ ] \
3587 | [ ]_| <-- vector loop.
3588 | |
3589 | v
3590 | -[ ] <--- middle-block.
3591 | / |
3592 | / v
3593 -|- >[ ] <--- new preheader.
3594 | |
3595 | v
3596 | [ ] \
3597 | [ ]_| <-- old scalar loop to handle remainder.
3598 \ |
3599 \ v
3600 >[ ] <-- exit block.
3601 ...
3602 */
3603
3604 // Get the metadata of the original loop before it gets modified.
3605 MDNode *OrigLoopID = OrigLoop->getLoopID();
3606
3607 // Create an empty vector loop, and prepare basic blocks for the runtime
3608 // checks.
3609 Loop *Lp = createVectorLoopSkeleton("");
3610
3611 // Now, compare the new count to zero. If it is zero skip the vector loop and
3612 // jump to the scalar loop. This check also covers the case where the
3613 // backedge-taken count is uint##_max: adding one to it will overflow leading
3614 // to an incorrect trip count of zero. In this (rare) case we will also jump
3615 // to the scalar loop.
3616 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3617
3618 // Generate the code to check any assumptions that we've made for SCEV
3619 // expressions.
3620 emitSCEVChecks(Lp, LoopScalarPreHeader);
3621
3622 // Generate the code that checks in runtime if arrays overlap. We put the
3623 // checks into a separate block to make the more common case of few elements
3624 // faster.
3625 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3626
3627 // Some loops have a single integer induction variable, while other loops
3628 // don't. One example is c++ iterators that often have multiple pointer
3629 // induction variables. In the code below we also support a case where we
3630 // don't have a single induction variable.
3631 //
3632 // We try to obtain an induction variable from the original loop as hard
3633 // as possible. However if we don't find one that:
3634 // - is an integer
3635 // - counts from zero, stepping by one
3636 // - is the size of the widest induction variable type
3637 // then we create a new one.
3638 OldInduction = Legal->getPrimaryInduction();
3639 Type *IdxTy = Legal->getWidestInductionType();
3640 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3641 // The loop step is equal to the vectorization factor (num of SIMD elements)
3642 // times the unroll factor (num of SIMD instructions).
3643 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3644 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3645 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3646 Induction =
3647 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3648 getDebugLocFromInstOrOperands(OldInduction));
3649
3650 // Emit phis for the new starting index of the scalar loop.
3651 createInductionResumeValues(Lp, CountRoundDown);
3652
3653 return completeLoopSkeleton(Lp, OrigLoopID);
3654}
3655
3656// Fix up external users of the induction variable. At this point, we are
3657// in LCSSA form, with all external PHIs that use the IV having one input value,
3658// coming from the remainder loop. We need those PHIs to also have a correct
3659// value for the IV when arriving directly from the middle block.
3660void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3661 const InductionDescriptor &II,
3662 Value *CountRoundDown, Value *EndValue,
3663 BasicBlock *MiddleBlock) {
3664 // There are two kinds of external IV usages - those that use the value
3665 // computed in the last iteration (the PHI) and those that use the penultimate
3666 // value (the value that feeds into the phi from the loop latch).
3667 // We allow both, but they, obviously, have different values.
3668
3669 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block")((OrigLoop->getUniqueExitBlock() && "Expected a single exit block"
) ? static_cast<void> (0) : __assert_fail ("OrigLoop->getUniqueExitBlock() && \"Expected a single exit block\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3669, __PRETTY_FUNCTION__))
;
3670
3671 DenseMap<Value *, Value *> MissingVals;
3672
3673 // An external user of the last iteration's value should see the value that
3674 // the remainder loop uses to initialize its own IV.
3675 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3676 for (User *U : PostInc->users()) {
3677 Instruction *UI = cast<Instruction>(U);
3678 if (!OrigLoop->contains(UI)) {
3679 assert(isa<PHINode>(UI) && "Expected LCSSA form")((isa<PHINode>(UI) && "Expected LCSSA form") ? static_cast
<void> (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3679, __PRETTY_FUNCTION__))
;
3680 MissingVals[UI] = EndValue;
3681 }
3682 }
3683
3684 // An external user of the penultimate value need to see EndValue - Step.
3685 // The simplest way to get this is to recompute it from the constituent SCEVs,
3686 // that is Start + (Step * (CRD - 1)).
3687 for (User *U : OrigPhi->users()) {
3688 auto *UI = cast<Instruction>(U);
3689 if (!OrigLoop->contains(UI)) {
3690 const DataLayout &DL =
3691 OrigLoop->getHeader()->getModule()->getDataLayout();
3692 assert(isa<PHINode>(UI) && "Expected LCSSA form")((isa<PHINode>(UI) && "Expected LCSSA form") ? static_cast
<void> (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3692, __PRETTY_FUNCTION__))
;
3693
3694 IRBuilder<> B(MiddleBlock->getTerminator());
3695
3696 // Fast-math-flags propagate from the original induction instruction.
3697 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3698 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3699
3700 Value *CountMinusOne = B.CreateSub(
3701 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3702 Value *CMO =
3703 !II.getStep()->getType()->isIntegerTy()
3704 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3705 II.getStep()->getType())
3706 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3707 CMO->setName("cast.cmo");
3708 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3709 Escape->setName("ind.escape");
3710 MissingVals[UI] = Escape;
3711 }
3712 }
3713
3714 for (auto &I : MissingVals) {
3715 PHINode *PHI = cast<PHINode>(I.first);
3716 // One corner case we have to handle is two IVs "chasing" each-other,
3717 // that is %IV2 = phi [...], [ %IV1, %latch ]
3718 // In this case, if IV1 has an external use, we need to avoid adding both
3719 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3720 // don't already have an incoming value for the middle block.
3721 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3722 PHI->addIncoming(I.second, MiddleBlock);
3723 }
3724}
3725
3726namespace {
3727
3728struct CSEDenseMapInfo {
3729 static bool canHandle(const Instruction *I) {
3730 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3731 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3732 }
3733
3734 static inline Instruction *getEmptyKey() {
3735 return DenseMapInfo<Instruction *>::getEmptyKey();
3736 }
3737
3738 static inline Instruction *getTombstoneKey() {
3739 return DenseMapInfo<Instruction *>::getTombstoneKey();
3740 }
3741
3742 static unsigned getHashValue(const Instruction *I) {
3743 assert(canHandle(I) && "Unknown instruction!")((canHandle(I) && "Unknown instruction!") ? static_cast
<void> (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3743, __PRETTY_FUNCTION__))
;
3744 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3745 I->value_op_end()));
3746 }
3747
3748 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3749 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3750 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3751 return LHS == RHS;
3752 return LHS->isIdenticalTo(RHS);
3753 }
3754};
3755
3756} // end anonymous namespace
3757
3758///Perform cse of induction variable instructions.
3759static void cse(BasicBlock *BB) {
3760 // Perform simple cse.
3761 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3762 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3763 Instruction *In = &*I++;
3764
3765 if (!CSEDenseMapInfo::canHandle(In))
3766 continue;
3767
3768 // Check if we can replace this instruction with any of the
3769 // visited instructions.
3770 if (Instruction *V = CSEMap.lookup(In)) {
3771 In->replaceAllUsesWith(V);
3772 In->eraseFromParent();
3773 continue;
3774 }
3775
3776 CSEMap[In] = In;
3777 }
3778}
3779
3780InstructionCost
3781LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3782 bool &NeedToScalarize) const {
3783 Function *F = CI->getCalledFunction();
3784 Type *ScalarRetTy = CI->getType();
3785 SmallVector<Type *, 4> Tys, ScalarTys;
3786 for (auto &ArgOp : CI->arg_operands())
3787 ScalarTys.push_back(ArgOp->getType());
3788
3789 // Estimate cost of scalarized vector call. The source operands are assumed
3790 // to be vectors, so we need to extract individual elements from there,
3791 // execute VF scalar calls, and then gather the result into the vector return
3792 // value.
3793 InstructionCost ScalarCallCost =
3794 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3795 if (VF.isScalar())
3796 return ScalarCallCost;
3797
3798 // Compute corresponding vector type for return value and arguments.
3799 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3800 for (Type *ScalarTy : ScalarTys)
3801 Tys.push_back(ToVectorTy(ScalarTy, VF));
3802
3803 // Compute costs of unpacking argument values for the scalar calls and
3804 // packing the return values to a vector.
3805 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3806
3807 InstructionCost Cost =
3808 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3809
3810 // If we can't emit a vector call for this function, then the currently found
3811 // cost is the cost we need to return.
3812 NeedToScalarize = true;
3813 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3814 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3815
3816 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3817 return Cost;
3818
3819 // If the corresponding vector cost is cheaper, return its cost.
3820 InstructionCost VectorCallCost =
3821 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3822 if (VectorCallCost < Cost) {
3823 NeedToScalarize = false;
3824 Cost = VectorCallCost;
3825 }
3826 return Cost;
3827}
3828
3829static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3830 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3831 return Elt;
3832 return VectorType::get(Elt, VF);
3833}
3834
3835InstructionCost
3836LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3837 ElementCount VF) const {
3838 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3839 assert(ID && "Expected intrinsic call!")((ID && "Expected intrinsic call!") ? static_cast<
void> (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3839, __PRETTY_FUNCTION__))
;
3840 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3841 FastMathFlags FMF;
3842 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3843 FMF = FPMO->getFastMathFlags();
3844
3845 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3846 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3847 SmallVector<Type *> ParamTys;
3848 std::transform(FTy->param_begin(), FTy->param_end(),
3849 std::back_inserter(ParamTys),
3850 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3851
3852 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3853 dyn_cast<IntrinsicInst>(CI));
3854 return TTI.getIntrinsicInstrCost(CostAttrs,
3855 TargetTransformInfo::TCK_RecipThroughput);
3856}
3857
3858static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3859 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3860 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3861 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3862}
3863
3864static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3865 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3866 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3867 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3868}
3869
3870void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3871 // For every instruction `I` in MinBWs, truncate the operands, create a
3872 // truncated version of `I` and reextend its result. InstCombine runs
3873 // later and will remove any ext/trunc pairs.
3874 SmallPtrSet<Value *, 4> Erased;
3875 for (const auto &KV : Cost->getMinimalBitwidths()) {
3876 // If the value wasn't vectorized, we must maintain the original scalar
3877 // type. The absence of the value from State indicates that it
3878 // wasn't vectorized.
3879 VPValue *Def = State.Plan->getVPValue(KV.first);
3880 if (!State.hasAnyVectorValue(Def))
3881 continue;
3882 for (unsigned Part = 0; Part < UF; ++Part) {
3883 Value *I = State.get(Def, Part);
3884 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3885 continue;
3886 Type *OriginalTy = I->getType();
3887 Type *ScalarTruncatedTy =
3888 IntegerType::get(OriginalTy->getContext(), KV.second);
3889 auto *TruncatedTy = FixedVectorType::get(
3890 ScalarTruncatedTy,
3891 cast<FixedVectorType>(OriginalTy)->getNumElements());
3892 if (TruncatedTy == OriginalTy)
3893 continue;
3894
3895 IRBuilder<> B(cast<Instruction>(I));
3896 auto ShrinkOperand = [&](Value *V) -> Value * {
3897 if (auto *ZI = dyn_cast<ZExtInst>(V))
3898 if (ZI->getSrcTy() == TruncatedTy)
3899 return ZI->getOperand(0);
3900 return B.CreateZExtOrTrunc(V, TruncatedTy);
3901 };
3902
3903 // The actual instruction modification depends on the instruction type,
3904 // unfortunately.
3905 Value *NewI = nullptr;
3906 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3907 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3908 ShrinkOperand(BO->getOperand(1)));
3909
3910 // Any wrapping introduced by shrinking this operation shouldn't be
3911 // considered undefined behavior. So, we can't unconditionally copy
3912 // arithmetic wrapping flags to NewI.
3913 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3914 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3915 NewI =
3916 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3917 ShrinkOperand(CI->getOperand(1)));
3918 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3919 NewI = B.CreateSelect(SI->getCondition(),
3920 ShrinkOperand(SI->getTrueValue()),
3921 ShrinkOperand(SI->getFalseValue()));
3922 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3923 switch (CI->getOpcode()) {
3924 default:
3925 llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3925)
;
3926 case Instruction::Trunc:
3927 NewI = ShrinkOperand(CI->getOperand(0));
3928 break;
3929 case Instruction::SExt:
3930 NewI = B.CreateSExtOrTrunc(
3931 CI->getOperand(0),
3932 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3933 break;
3934 case Instruction::ZExt:
3935 NewI = B.CreateZExtOrTrunc(
3936 CI->getOperand(0),
3937 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3938 break;
3939 }
3940 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3941 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3942 ->getNumElements();
3943 auto *O0 = B.CreateZExtOrTrunc(
3944 SI->getOperand(0),
3945 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3946 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3947 ->getNumElements();
3948 auto *O1 = B.CreateZExtOrTrunc(
3949 SI->getOperand(1),
3950 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3951
3952 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3953 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3954 // Don't do anything with the operands, just extend the result.
3955 continue;
3956 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3957 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3958 ->getNumElements();
3959 auto *O0 = B.CreateZExtOrTrunc(
3960 IE->getOperand(0),
3961 FixedVectorType::get(ScalarTruncatedTy, Elements));
3962 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3963 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3964 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3965 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3966 ->getNumElements();
3967 auto *O0 = B.CreateZExtOrTrunc(
3968 EE->getOperand(0),
3969 FixedVectorType::get(ScalarTruncatedTy, Elements));
3970 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3971 } else {
3972 // If we don't know what to do, be conservative and don't do anything.
3973 continue;
3974 }
3975
3976 // Lastly, extend the result.
3977 NewI->takeName(cast<Instruction>(I));
3978 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3979 I->replaceAllUsesWith(Res);
3980 cast<Instruction>(I)->eraseFromParent();
3981 Erased.insert(I);
3982 State.reset(Def, Res, Part);
3983 }
3984 }
3985
3986 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3987 for (const auto &KV : Cost->getMinimalBitwidths()) {
3988 // If the value wasn't vectorized, we must maintain the original scalar
3989 // type. The absence of the value from State indicates that it
3990 // wasn't vectorized.
3991 VPValue *Def = State.Plan->getVPValue(KV.first);
3992 if (!State.hasAnyVectorValue(Def))
3993 continue;
3994 for (unsigned Part = 0; Part < UF; ++Part) {
3995 Value *I = State.get(Def, Part);
3996 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3997 if (Inst && Inst->use_empty()) {
3998 Value *NewI = Inst->getOperand(0);
3999 Inst->eraseFromParent();
4000 State.reset(Def, NewI, Part);
4001 }
4002 }
4003 }
4004}
4005
4006void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4007 // Insert truncates and extends for any truncated instructions as hints to
4008 // InstCombine.
4009 if (VF.isVector())
4010 truncateToMinimalBitwidths(State);
4011
4012 // Fix widened non-induction PHIs by setting up the PHI operands.
4013 if (OrigPHIsToFix.size()) {
4014 assert(EnableVPlanNativePath &&((EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"Unexpected non-induction PHIs for fixup in non VPlan-native path\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4015, __PRETTY_FUNCTION__))
4015 "Unexpected non-induction PHIs for fixup in non VPlan-native path")((EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"Unexpected non-induction PHIs for fixup in non VPlan-native path\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4015, __PRETTY_FUNCTION__))
;
4016 fixNonInductionPHIs(State);
4017 }
4018
4019 // At this point every instruction in the original loop is widened to a
4020 // vector form. Now we need to fix the recurrences in the loop. These PHI
4021 // nodes are currently empty because we did not want to introduce cycles.
4022 // This is the second stage of vectorizing recurrences.
4023 fixCrossIterationPHIs(State);
4024
4025 // Forget the original basic block.
4026 PSE.getSE()->forgetLoop(OrigLoop);
4027
4028 // Fix-up external users of the induction variables.
4029 for (auto &Entry : Legal->getInductionVars())
4030 fixupIVUsers(Entry.first, Entry.second,
4031 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4032 IVEndValues[Entry.first], LoopMiddleBlock);
4033
4034 fixLCSSAPHIs(State);
4035 for (Instruction *PI : PredicatedInstructions)
4036 sinkScalarOperands(&*PI);
4037
4038 // Remove redundant induction instructions.
4039 cse(LoopVectorBody);
4040
4041 // Set/update profile weights for the vector and remainder loops as original
4042 // loop iterations are now distributed among them. Note that original loop
4043 // represented by LoopScalarBody becomes remainder loop after vectorization.
4044 //
4045 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
4046 // end up getting slightly roughened result but that should be OK since
4047 // profile is not inherently precise anyway. Note also possible bypass of
4048 // vector code caused by legality checks is ignored, assigning all the weight
4049 // to the vector loop, optimistically.
4050 //
4051 // For scalable vectorization we can't know at compile time how many iterations
4052 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4053 // vscale of '1'.
4054 setProfileInfoAfterUnrolling(
4055 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4056 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4057}
4058
4059void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4060 // In order to support recurrences we need to be able to vectorize Phi nodes.
4061 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4062 // stage #2: We now need to fix the recurrences by adding incoming edges to
4063 // the currently empty PHI nodes. At this point every instruction in the
4064 // original loop is widened to a vector form so we can use them to construct
4065 // the incoming edges.
4066 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4067 // Handle first-order recurrences and reductions that need to be fixed.
4068 if (Legal->isFirstOrderRecurrence(&Phi))
4069 fixFirstOrderRecurrence(&Phi, State);
4070 else if (Legal->isReductionVariable(&Phi))
4071 fixReduction(&Phi, State);
4072 }
4073}
4074
4075void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4076 VPTransformState &State) {
4077 // This is the second phase of vectorizing first-order recurrences. An
4078 // overview of the transformation is described below. Suppose we have the
4079 // following loop.
4080 //
4081 // for (int i = 0; i < n; ++i)
4082 // b[i] = a[i] - a[i - 1];
4083 //
4084 // There is a first-order recurrence on "a". For this loop, the shorthand
4085 // scalar IR looks like:
4086 //
4087 // scalar.ph:
4088 // s_init = a[-1]
4089 // br scalar.body
4090 //
4091 // scalar.body:
4092 // i = phi [0, scalar.ph], [i+1, scalar.body]
4093 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4094 // s2 = a[i]
4095 // b[i] = s2 - s1
4096 // br cond, scalar.body, ...
4097 //
4098 // In this example, s1 is a recurrence because it's value depends on the
4099 // previous iteration. In the first phase of vectorization, we created a
4100 // temporary value for s1. We now complete the vectorization and produce the
4101 // shorthand vector IR shown below (for VF = 4, UF = 1).
4102 //
4103 // vector.ph:
4104 // v_init = vector(..., ..., ..., a[-1])
4105 // br vector.body
4106 //
4107 // vector.body
4108 // i = phi [0, vector.ph], [i+4, vector.body]
4109 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4110 // v2 = a[i, i+1, i+2, i+3];
4111 // v3 = vector(v1(3), v2(0, 1, 2))
4112 // b[i, i+1, i+2, i+3] = v2 - v3
4113 // br cond, vector.body, middle.block
4114 //
4115 // middle.block:
4116 // x = v2(3)
4117 // br scalar.ph
4118 //
4119 // scalar.ph:
4120 // s_init = phi [x, middle.block], [a[-1], otherwise]
4121 // br scalar.body
4122 //
4123 // After execution completes the vector loop, we extract the next value of
4124 // the recurrence (x) to use as the initial value in the scalar loop.
4125
4126 // Get the original loop preheader and single loop latch.
4127 auto *Preheader = OrigLoop->getLoopPreheader();
4128 auto *Latch = OrigLoop->getLoopLatch();
4129
4130 // Get the initial and previous values of the scalar recurrence.
4131 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4132 auto *Previous = Phi->getIncomingValueForBlock(Latch);
4133
4134 // Create a vector from the initial value.
4135 auto *VectorInit = ScalarInit;
4136 if (VF.isVector()) {
4137 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4138 assert(!VF.isScalable() && "VF is assumed to be non scalable.")((!VF.isScalable() && "VF is assumed to be non scalable."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"VF is assumed to be non scalable.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4138, __PRETTY_FUNCTION__))
;
4139 VectorInit = Builder.CreateInsertElement(
4140 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4141 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4142 }
4143
4144 VPValue *PhiDef = State.Plan->getVPValue(Phi);
4145 VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4146 // We constructed a temporary phi node in the first phase of vectorization.
4147 // This phi node will eventually be deleted.
4148 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4149
4150 // Create a phi node for the new recurrence. The current value will either be
4151 // the initial value inserted into a vector or loop-varying vector value.
4152 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4153 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4154
4155 // Get the vectorized previous value of the last part UF - 1. It appears last
4156 // among all unrolled iterations, due to the order of their construction.
4157 Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4158
4159 // Find and set the insertion point after the previous value if it is an
4160 // instruction.
4161 BasicBlock::iterator InsertPt;
4162 // Note that the previous value may have been constant-folded so it is not
4163 // guaranteed to be an instruction in the vector loop.
4164 // FIXME: Loop invariant values do not form recurrences. We should deal with
4165 // them earlier.
4166 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4167 InsertPt = LoopVectorBody->getFirstInsertionPt();
4168 else {
4169 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4170 if (isa<PHINode>(PreviousLastPart))
4171 // If the previous value is a phi node, we should insert after all the phi
4172 // nodes in the block containing the PHI to avoid breaking basic block
4173 // verification. Note that the basic block may be different to
4174 // LoopVectorBody, in case we predicate the loop.
4175 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4176 else
4177 InsertPt = ++PreviousInst->getIterator();
4178 }
4179 Builder.SetInsertPoint(&*InsertPt);
4180
4181 // We will construct a vector for the recurrence by combining the values for
4182 // the current and previous iterations. This is the required shuffle mask.
4183 assert(!VF.isScalable())((!VF.isScalable()) ? static_cast<void> (0) : __assert_fail
("!VF.isScalable()", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4183, __PRETTY_FUNCTION__))
;
4184 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4185 ShuffleMask[0] = VF.getKnownMinValue() - 1;
4186 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4187 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
4188
4189 // The vector from which to take the initial value for the current iteration
4190 // (actual or unrolled). Initially, this is the vector phi node.
4191 Value *Incoming = VecPhi;
4192
4193 // Shuffle the current and previous vector and update the vector parts.
4194 for (unsigned Part = 0; Part < UF; ++Part) {
4195 Value *PreviousPart = State.get(PreviousDef, Part);
4196 Value *PhiPart = State.get(PhiDef, Part);
4197 auto *Shuffle =
4198 VF.isVector()
4199 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4200 : Incoming;
4201 PhiPart->replaceAllUsesWith(Shuffle);
4202 cast<Instruction>(PhiPart)->eraseFromParent();
4203 State.reset(PhiDef, Shuffle, Part);
4204 Incoming = PreviousPart;
4205 }
4206
4207 // Fix the latch value of the new recurrence in the vector loop.
4208 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4209
4210 // Extract the last vector element in the middle block. This will be the
4211 // initial value for the recurrence when jumping to the scalar loop.
4212 auto *ExtractForScalar = Incoming;
4213 if (VF.isVector()) {
4214 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4215 ExtractForScalar = Builder.CreateExtractElement(
4216 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4217 "vector.recur.extract");
4218 }
4219 // Extract the second last element in the middle block if the
4220 // Phi is used outside the loop. We need to extract the phi itself
4221 // and not the last element (the phi update in the current iteration). This
4222 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4223 // when the scalar loop is not run at all.
4224 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4225 if (VF.isVector())
4226 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4227 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4228 "vector.recur.extract.for.phi");
4229 // When loop is unrolled without vectorizing, initialize
4230 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
4231 // `Incoming`. This is analogous to the vectorized case above: extracting the
4232 // second last element when VF > 1.
4233 else if (UF > 1)
4234 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4235
4236 // Fix the initial value of the original recurrence in the scalar loop.
4237 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4238 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4239 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4240 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4241 Start->addIncoming(Incoming, BB);
4242 }
4243
4244 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4245 Phi->setName("scalar.recur");
4246
4247 // Finally, fix users of the recurrence outside the loop. The users will need
4248 // either the last value of the scalar recurrence or the last value of the
4249 // vector recurrence we extracted in the middle block. Since the loop is in
4250 // LCSSA form, we just need to find all the phi nodes for the original scalar
4251 // recurrence in the exit block, and then add an edge for the middle block.
4252 // Note that LCSSA does not imply single entry when the original scalar loop
4253 // had multiple exiting edges (as we always run the last iteration in the
4254 // scalar epilogue); in that case, the exiting path through middle will be
4255 // dynamically dead and the value picked for the phi doesn't matter.
4256 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4257 if (any_of(LCSSAPhi.incoming_values(),
4258 [Phi](Value *V) { return V == Phi; }))
4259 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4260}
4261
4262void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4263 // Get it's reduction variable descriptor.
4264 assert(Legal->isReductionVariable(Phi) &&((Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4265, __PRETTY_FUNCTION__))
4265 "Unable to find the reduction variable")((Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4265, __PRETTY_FUNCTION__))
;
4266 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4267
4268 RecurKind RK = RdxDesc.getRecurrenceKind();
4269 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4270 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4271 setDebugLocFromInst(Builder, ReductionStartValue);
4272 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4273
4274 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4275 // This is the vector-clone of the value that leaves the loop.
4276 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4277
4278 // Wrap flags are in general invalid after vectorization, clear them.
4279 clearReductionWrapFlags(RdxDesc, State);
4280
4281 // Fix the vector-loop phi.
4282
4283 // Reductions do not have to start at zero. They can start with
4284 // any loop invariant values.
4285 BasicBlock *Latch = OrigLoop->getLoopLatch();
4286 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4287
4288 for (unsigned Part = 0; Part < UF; ++Part) {
4289 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4290 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4291 cast<PHINode>(VecRdxPhi)
4292 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4293 }
4294
4295 // Before each round, move the insertion point right between
4296 // the PHIs and the values we are going to write.
4297 // This allows us to write both PHINodes and the extractelement
4298 // instructions.
4299 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4300
4301 setDebugLocFromInst(Builder, LoopExitInst);
4302
4303 Type *PhiTy = Phi->getType();
4304 // If tail is folded by masking, the vector value to leave the loop should be
4305 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4306 // instead of the former. For an inloop reduction the reduction will already
4307 // be predicated, and does not need to be handled here.
4308 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4309 for (unsigned Part = 0; Part < UF; ++Part) {
4310 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4311 Value *Sel = nullptr;
4312 for (User *U : VecLoopExitInst->users()) {
4313 if (isa<SelectInst>(U)) {
4314 assert(!Sel && "Reduction exit feeding two selects")((!Sel && "Reduction exit feeding two selects") ? static_cast
<void> (0) : __assert_fail ("!Sel && \"Reduction exit feeding two selects\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4314, __PRETTY_FUNCTION__))
;
4315 Sel = U;
4316 } else
4317 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select")((isa<PHINode>(U) && "Reduction exit must feed Phi's or select"
) ? static_cast<void> (0) : __assert_fail ("isa<PHINode>(U) && \"Reduction exit must feed Phi's or select\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4317, __PRETTY_FUNCTION__))
;
4318 }
4319 assert(Sel && "Reduction exit feeds no select")((Sel && "Reduction exit feeds no select") ? static_cast
<void> (0) : __assert_fail ("Sel && \"Reduction exit feeds no select\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4319, __PRETTY_FUNCTION__))
;
4320 State.reset(LoopExitInstDef, Sel, Part);
4321
4322 // If the target can create a predicated operator for the reduction at no
4323 // extra cost in the loop (for example a predicated vadd), it can be
4324 // cheaper for the select to remain in the loop than be sunk out of it,
4325 // and so use the select value for the phi instead of the old
4326 // LoopExitValue.
4327 if (PreferPredicatedReductionSelect ||
4328 TTI->preferPredicatedReductionSelect(
4329 RdxDesc.getOpcode(), PhiTy,
4330 TargetTransformInfo::ReductionFlags())) {
4331 auto *VecRdxPhi =
4332 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4333 VecRdxPhi->setIncomingValueForBlock(
4334 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4335 }
4336 }
4337 }
4338
4339 // If the vector reduction can be performed in a smaller type, we truncate
4340 // then extend the loop exit value to enable InstCombine to evaluate the
4341 // entire expression in the smaller type.
4342 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4343 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!")((!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"
) ? static_cast<void> (0) : __assert_fail ("!IsInLoopReductionPhi && \"Unexpected truncated inloop reduction!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4343, __PRETTY_FUNCTION__))
;
4344 assert(!VF.isScalable() && "scalable vectors not yet supported.")((!VF.isScalable() && "scalable vectors not yet supported."
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"scalable vectors not yet supported.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4344, __PRETTY_FUNCTION__))
;
4345 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4346 Builder.SetInsertPoint(
4347 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4348 VectorParts RdxParts(UF);
4349 for (unsigned Part = 0; Part < UF; ++Part) {
4350 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4351 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4352 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4353 : Builder.CreateZExt(Trunc, VecTy);
4354 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4355 UI != RdxParts[Part]->user_end();)
4356 if (*UI != Trunc) {
4357 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4358 RdxParts[Part] = Extnd;
4359 } else {
4360 ++UI;
4361 }
4362 }
4363 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4364 for (unsigned Part = 0; Part < UF; ++Part) {
4365 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4366 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4367 }
4368 }
4369
4370 // Reduce all of the unrolled parts into a single vector.
4371 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4372 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4373
4374 // The middle block terminator has already been assigned a DebugLoc here (the
4375 // OrigLoop's single latch terminator). We want the whole middle block to
4376 // appear to execute on this line because: (a) it is all compiler generated,
4377 // (b) these instructions are always executed after evaluating the latch
4378 // conditional branch, and (c) other passes may add new predecessors which
4379 // terminate on this line. This is the easiest way to ensure we don't
4380 // accidentally cause an extra step back into the loop while debugging.
4381 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4382 {
4383 // Floating-point operations should have some FMF to enable the reduction.
4384 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4385 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4386 for (unsigned Part = 1; Part < UF; ++Part) {
4387 Value *RdxPart = State.get(LoopExitInstDef, Part);
4388 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4389 ReducedPartRdx = Builder.CreateBinOp(
4390 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4391 } else {
4392 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4393 }
4394 }
4395 }
4396
4397 // Create the reduction after the loop. Note that inloop reductions create the
4398 // target reduction in the loop using a Reduction recipe.
4399 if (VF.isVector() && !IsInLoopReductionPhi) {
4400 ReducedPartRdx =
4401 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4402 // If the reduction can be performed in a smaller type, we need to extend
4403 // the reduction to the wider type before we branch to the original loop.
4404 if (PhiTy != RdxDesc.getRecurrenceType())
4405 ReducedPartRdx = RdxDesc.isSigned()
4406 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4407 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4408 }
4409
4410 // Create a phi node that merges control-flow from the backedge-taken check
4411 // block and the middle block.
4412 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4413 LoopScalarPreHeader->getTerminator());
4414 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4415 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4416 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4417
4418 // Now, we need to fix the users of the reduction variable
4419 // inside and outside of the scalar remainder loop.
4420
4421 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4422 // in the exit blocks. See comment on analogous loop in
4423 // fixFirstOrderRecurrence for a more complete explaination of the logic.
4424 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4425 if (any_of(LCSSAPhi.incoming_values(),
4426 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4427 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4428
4429 // Fix the scalar loop reduction variable with the incoming reduction sum
4430 // from the vector body and from the backedge value.
4431 int IncomingEdgeBlockIdx =
4432 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4433 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")((IncomingEdgeBlockIdx >= 0 && "Invalid block index"
) ? static_cast<void> (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4433, __PRETTY_FUNCTION__))
;
4434 // Pick the other block.
4435 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4436 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4437 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4438}
4439
4440void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4441 VPTransformState &State) {
4442 RecurKind RK = RdxDesc.getRecurrenceKind();
4443 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4444 return;
4445
4446 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4447 assert(LoopExitInstr && "null loop exit instruction")((LoopExitInstr && "null loop exit instruction") ? static_cast
<void> (0) : __assert_fail ("LoopExitInstr && \"null loop exit instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4447, __PRETTY_FUNCTION__))
;
4448 SmallVector<Instruction *, 8> Worklist;
4449 SmallPtrSet<Instruction *, 8> Visited;
4450 Worklist.push_back(LoopExitInstr);
4451 Visited.insert(LoopExitInstr);
4452
4453 while (!Worklist.empty()) {
4454 Instruction *Cur = Worklist.pop_back_val();
4455 if (isa<OverflowingBinaryOperator>(Cur))
4456 for (unsigned Part = 0; Part < UF; ++Part) {
4457 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4458 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4459 }
4460
4461 for (User *U : Cur->users()) {
4462 Instruction *UI = cast<Instruction>(U);
4463 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4464 Visited.insert(UI).second)
4465 Worklist.push_back(UI);
4466 }
4467 }
4468}
4469
4470void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4471 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4472 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4473 // Some phis were already hand updated by the reduction and recurrence
4474 // code above, leave them alone.
4475 continue;
4476
4477 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4478 // Non-instruction incoming values will have only one value.
4479
4480 VPLane Lane = VPLane::getFirstLane();
4481 if (isa<Instruction>(IncomingValue) &&
4482 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4483 VF))
4484 Lane = VPLane::getLastLaneForVF(VF);
4485
4486 // Can be a loop invariant incoming value or the last scalar value to be
4487 // extracted from the vectorized loop.
4488 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4489 Value *lastIncomingValue =
4490 OrigLoop->isLoopInvariant(IncomingValue)
4491 ? IncomingValue
4492 : State.get(State.Plan->getVPValue(IncomingValue),
4493 VPIteration(UF - 1, Lane));
4494 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4495 }
4496}
4497
4498void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4499 // The basic block and loop containing the predicated instruction.
4500 auto *PredBB = PredInst->getParent();
4501 auto *VectorLoop = LI->getLoopFor(PredBB);
4502
4503 // Initialize a worklist with the operands of the predicated instruction.
4504 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4505
4506 // Holds instructions that we need to analyze again. An instruction may be
4507 // reanalyzed if we don't yet know if we can sink it or not.
4508 SmallVector<Instruction *, 8> InstsToReanalyze;
4509
4510 // Returns true if a given use occurs in the predicated block. Phi nodes use
4511 // their operands in their corresponding predecessor blocks.
4512 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4513 auto *I = cast<Instruction>(U.getUser());
4514 BasicBlock *BB = I->getParent();
4515 if (auto *Phi = dyn_cast<PHINode>(I))
4516 BB = Phi->getIncomingBlock(
4517 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4518 return BB == PredBB;
4519 };
4520
4521 // Iteratively sink the scalarized operands of the predicated instruction
4522 // into the block we created for it. When an instruction is sunk, it's
4523 // operands are then added to the worklist. The algorithm ends after one pass
4524 // through the worklist doesn't sink a single instruction.
4525 bool Changed;
4526 do {
4527 // Add the instructions that need to be reanalyzed to the worklist, and
4528 // reset the changed indicator.
4529 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4530 InstsToReanalyze.clear();
4531 Changed = false;
4532
4533 while (!Worklist.empty()) {
4534 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4535
4536 // We can't sink an instruction if it is a phi node, is already in the
4537 // predicated block, is not in the loop, or may have side effects.
4538 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4539 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4540 continue;
4541
4542 // It's legal to sink the instruction if all its uses occur in the
4543 // predicated block. Otherwise, there's nothing to do yet, and we may
4544 // need to reanalyze the instruction.
4545 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4546 InstsToReanalyze.push_back(I);
4547 continue;
4548 }
4549
4550 // Move the instruction to the beginning of the predicated block, and add
4551 // it's operands to the worklist.
4552 I->moveBefore(&*PredBB->getFirstInsertionPt());
4553 Worklist.insert(I->op_begin(), I->op_end());
4554
4555 // The sinking may have enabled other instructions to be sunk, so we will
4556 // need to iterate.
4557 Changed = true;
4558 }
4559 } while (Changed);
4560}
4561
4562void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4563 for (PHINode *OrigPhi : OrigPHIsToFix) {
4564 VPWidenPHIRecipe *VPPhi =
4565 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4566 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4567 // Make sure the builder has a valid insert point.
4568 Builder.SetInsertPoint(NewPhi);
4569 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4570 VPValue *Inc = VPPhi->getIncomingValue(i);
4571 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4572 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4573 }
4574 }
4575}
4576
4577void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4578 VPUser &Operands, unsigned UF,
4579 ElementCount VF, bool IsPtrLoopInvariant,
4580 SmallBitVector &IsIndexLoopInvariant,
4581 VPTransformState &State) {
4582 // Construct a vector GEP by widening the operands of the scalar GEP as
4583 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4584 // results in a vector of pointers when at least one operand of the GEP
4585 // is vector-typed. Thus, to keep the representation compact, we only use
4586 // vector-typed operands for loop-varying values.
4587
4588 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4589 // If we are vectorizing, but the GEP has only loop-invariant operands,
4590 // the GEP we build (by only using vector-typed operands for
4591 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4592 // produce a vector of pointers, we need to either arbitrarily pick an
4593 // operand to broadcast, or broadcast a clone of the original GEP.
4594 // Here, we broadcast a clone of the original.
4595 //
4596 // TODO: If at some point we decide to scalarize instructions having
4597 // loop-invariant operands, this special case will no longer be
4598 // required. We would add the scalarization decision to
4599 // collectLoopScalars() and teach getVectorValue() to broadcast
4600 // the lane-zero scalar value.
4601 auto *Clone = Builder.Insert(GEP->clone());
4602 for (unsigned Part = 0; Part < UF; ++Part) {
4603 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4604 State.set(VPDef, EntryPart, Part);
4605 addMetadata(EntryPart, GEP);
4606 }
4607 } else {
4608 // If the GEP has at least one loop-varying operand, we are sure to
4609 // produce a vector of pointers. But if we are only unrolling, we want
4610 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4611 // produce with the code below will be scalar (if VF == 1) or vector
4612 // (otherwise). Note that for the unroll-only case, we still maintain
4613 // values in the vector mapping with initVector, as we do for other
4614 // instructions.
4615 for (unsigned Part = 0; Part < UF; ++Part) {
4616 // The pointer operand of the new GEP. If it's loop-invariant, we
4617 // won't broadcast it.
4618 auto *Ptr = IsPtrLoopInvariant
4619 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4620 : State.get(Operands.getOperand(0), Part);
4621
4622 // Collect all the indices for the new GEP. If any index is
4623 // loop-invariant, we won't broadcast it.
4624 SmallVector<Value *, 4> Indices;
4625 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4626 VPValue *Operand = Operands.getOperand(I);
4627 if (IsIndexLoopInvariant[I - 1])
4628 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4629 else
4630 Indices.push_back(State.get(Operand, Part));
4631 }
4632
4633 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4634 // but it should be a vector, otherwise.
4635 auto *NewGEP =
4636 GEP->isInBounds()
4637 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4638 Indices)
4639 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4640 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&(((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector") ? static_cast<void> (
0) : __assert_fail ("(VF.isScalar() || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4641, __PRETTY_FUNCTION__))
4641 "NewGEP is not a pointer vector")(((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
"NewGEP is not a pointer vector") ? static_cast<void> (
0) : __assert_fail ("(VF.isScalar() || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4641, __PRETTY_FUNCTION__))
;
4642 State.set(VPDef, NewGEP, Part);
4643 addMetadata(NewGEP, GEP);
4644 }
4645 }
4646}
4647
4648void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4649 RecurrenceDescriptor *RdxDesc,
4650 VPValue *StartVPV, VPValue *Def,
4651 VPTransformState &State) {
4652 PHINode *P = cast<PHINode>(PN);
4653 if (EnableVPlanNativePath) {
4654 // Currently we enter here in the VPlan-native path for non-induction
4655 // PHIs where all control flow is uniform. We simply widen these PHIs.
4656 // Create a vector phi with no operands - the vector phi operands will be
4657 // set at the end of vector code generation.
4658 Type *VecTy = (State.VF.isScalar())
4659 ? PN->getType()
4660 : VectorType::get(PN->getType(), State.VF);
4661 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4662 State.set(Def, VecPhi, 0);
4663 OrigPHIsToFix.push_back(P);
4664
4665 return;
4666 }
4667
4668 assert(PN->getParent() == OrigLoop->getHeader() &&((PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"
) ? static_cast<void> (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4669, __PRETTY_FUNCTION__))
4669 "Non-header phis should have been handled elsewhere")((PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"
) ? static_cast<void> (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4669, __PRETTY_FUNCTION__))
;
4670
4671 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4672 // In order to support recurrences we need to be able to vectorize Phi nodes.
4673 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4674 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4675 // this value when we vectorize all of the instructions that use the PHI.
4676 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4677 Value *Iden = nullptr;
4678 bool ScalarPHI =
4679 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4680 Type *VecTy =
4681 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4682
4683 if (RdxDesc) {
4684 assert(Legal->isReductionVariable(P) && StartV &&((Legal->isReductionVariable(P) && StartV &&
"RdxDesc should only be set for reduction variables; in that case "
"a StartV is also required") ? static_cast<void> (0) :
__assert_fail ("Legal->isReductionVariable(P) && StartV && \"RdxDesc should only be set for reduction variables; in that case \" \"a StartV is also required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4686, __PRETTY_FUNCTION__))
4685 "RdxDesc should only be set for reduction variables; in that case "((Legal->isReductionVariable(P) && StartV &&
"RdxDesc should only be set for reduction variables; in that case "
"a StartV is also required") ? static_cast<void> (0) :
__assert_fail ("Legal->isReductionVariable(P) && StartV && \"RdxDesc should only be set for reduction variables; in that case \" \"a StartV is also required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4686, __PRETTY_FUNCTION__))
4686 "a StartV is also required")((Legal->isReductionVariable(P) && StartV &&
"RdxDesc should only be set for reduction variables; in that case "
"a StartV is also required") ? static_cast<void> (0) :
__assert_fail ("Legal->isReductionVariable(P) && StartV && \"RdxDesc should only be set for reduction variables; in that case \" \"a StartV is also required\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4686, __PRETTY_FUNCTION__))
;
4687 RecurKind RK = RdxDesc->getRecurrenceKind();
4688 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4689 // MinMax reduction have the start value as their identify.
4690 if (ScalarPHI) {
4691 Iden = StartV;
4692 } else {
4693 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4694 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4695 StartV = Iden =
4696 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4697 }
4698 } else {
4699 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4700 RK, VecTy->getScalarType());
4701 Iden = IdenC;
4702
4703 if (!ScalarPHI) {
4704 Iden = ConstantVector::getSplat(State.VF, IdenC);
4705 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4706 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4707 Constant *Zero = Builder.getInt32(0);
4708 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4709 }
4710 }
4711 }
4712
4713 for (unsigned Part = 0; Part < State.UF; ++Part) {
4714 // This is phase one of vectorizing PHIs.
4715 Value *EntryPart = PHINode::Create(
4716 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4717 State.set(Def, EntryPart, Part);
4718 if (StartV) {
4719 // Make sure to add the reduction start value only to the
4720 // first unroll part.
4721 Value *StartVal = (Part == 0) ? StartV : Iden;
4722 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4723 }
4724 }
4725 return;
4726 }
4727
4728 assert(!Legal->isReductionVariable(P) &&((!Legal->isReductionVariable(P) && "reductions should be handled above"
) ? static_cast<void> (0) : __assert_fail ("!Legal->isReductionVariable(P) && \"reductions should be handled above\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4729, __PRETTY_FUNCTION__))
4729 "reductions should be handled above")((!Legal->isReductionVariable(P) && "reductions should be handled above"
) ? static_cast<void> (0) : __assert_fail ("!Legal->isReductionVariable(P) && \"reductions should be handled above\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4729, __PRETTY_FUNCTION__))
;
4730
4731 setDebugLocFromInst(Builder, P);
4732
4733 // This PHINode must be an induction variable.
4734 // Make sure that we know about it.
4735 assert(Legal->getInductionVars().count(P) && "Not an induction variable")((Legal->getInductionVars().count(P) && "Not an induction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars().count(P) && \"Not an induction variable\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4735, __PRETTY_FUNCTION__))
;
4736
4737 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4738 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4739
4740 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4741 // which can be found from the original scalar operations.
4742 switch (II.getKind()) {
4743 case InductionDescriptor::IK_NoInduction:
4744 llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4744)
;
4745 case InductionDescriptor::IK_IntInduction:
4746 case InductionDescriptor::IK_FpInduction:
4747 llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4747)
;
4748 case InductionDescriptor::IK_PtrInduction: {
4749 // Handle the pointer induction variable case.
4750 assert(P->getType()->isPointerTy() && "Unexpected type.")((P->getType()->isPointerTy() && "Unexpected type."
) ? static_cast<void> (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4750, __PRETTY_FUNCTION__))
;
4751 assert(!VF.isScalable() && "Currently unsupported for scalable vectors")((!VF.isScalable() && "Currently unsupported for scalable vectors"
) ? static_cast<void> (0) : __assert_fail ("!VF.isScalable() && \"Currently unsupported for scalable vectors\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4751, __PRETTY_FUNCTION__))
;
4752
4753 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4754 // This is the normalized GEP that starts counting at zero.
4755 Value *PtrInd =
4756 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4757 // Determine the number of scalars we need to generate for each unroll
4758 // iteration. If the instruction is uniform, we only need to generate the
4759 // first lane. Otherwise, we generate all VF values.
4760 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4761 ? 1
4762 : State.VF.getKnownMinValue();
4763 for (unsigned Part = 0; Part < UF; ++Part) {
4764 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4765 Constant *Idx = ConstantInt::get(
4766 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4767 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4768 Value *SclrGep =
4769 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4770 SclrGep->setName("next.gep");
4771 State.set(Def, SclrGep, VPIteration(Part, Lane));
4772 }
4773 }
4774 return;
4775 }
4776 assert(isa<SCEVConstant>(II.getStep()) &&((isa<SCEVConstant>(II.getStep()) && "Induction step not a SCEV constant!"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(II.getStep()) && \"Induction step not a SCEV constant!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4777, __PRETTY_FUNCTION__))
4777 "Induction step not a SCEV constant!")((isa<SCEVConstant>(II.getStep()) && "Induction step not a SCEV constant!"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(II.getStep()) && \"Induction step not a SCEV constant!\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4777, __PRETTY_FUNCTION__))
;
4778 Type *PhiType = II.getStep()->getType();
4779
4780 // Build a pointer phi
4781 Value *ScalarStartValue = II.getStartValue();
4782 Type *ScStValueType = ScalarStartValue->getType();
4783 PHINode *NewPointerPhi =
4784 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4785 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4786
4787 // A pointer induction, performed by using a gep
4788 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4789 Instruction *InductionLoc = LoopLatch->getTerminator();
4790 const SCEV *ScalarStep = II.getStep();
4791 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4792 Value *ScalarStepValue =
4793 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4794 Value *InductionGEP = GetElementPtrInst::Create(
4795 ScStValueType->getPointerElementType(), NewPointerPhi,
4796 Builder.CreateMul(
4797 ScalarStepValue,
4798 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4799 "ptr.ind", InductionLoc);
4800 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4801
4802 // Create UF many actual address geps that use the pointer
4803 // phi as base and a vectorized version of the step value
4804 // (<step*0, ..., step*N>) as offset.
4805 for (unsigned Part = 0; Part < State.UF; ++Part) {
4806 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4807 Value *StartOffset =
4808 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue());
4809 // Create a vector of consecutive numbers from zero to VF.
4810 StartOffset =
4811 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4812
4813 Value *GEP = Builder.CreateGEP(
4814 ScStValueType->getPointerElementType(), NewPointerPhi,
4815 Builder.CreateMul(StartOffset,
4816 Builder.CreateVectorSplat(
4817 State.VF.getKnownMinValue(), ScalarStepValue),
4818 "vector.gep"));
4819 State.set(Def, GEP, Part);
4820 }
4821 }
4822 }
4823}
4824
4825/// A helper function for checking whether an integer division-related
4826/// instruction may divide by zero (in which case it must be predicated if
4827/// executed conditionally in the scalar code).
4828/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4829/// Non-zero divisors that are non compile-time constants will not be
4830/// converted into multiplication, so we will still end up scalarizing
4831/// the division, but can do so w/o predication.
4832static bool mayDivideByZero(Instruction &I) {
4833 assert((I.getOpcode() == Instruction::UDiv ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
== Instruction::SRem) && "Unexpected instruction") ?
static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4837, __PRETTY_FUNCTION__))
4834 I.getOpcode() == Instruction::SDiv ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
== Instruction::SRem) && "Unexpected instruction") ?
static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4837, __PRETTY_FUNCTION__))
4835 I.getOpcode() == Instruction::URem ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
== Instruction::SRem) && "Unexpected instruction") ?
static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4837, __PRETTY_FUNCTION__))
4836 I.getOpcode() == Instruction::SRem) &&(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
== Instruction::SRem) && "Unexpected instruction") ?
static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4837, __PRETTY_FUNCTION__))
4837 "Unexpected instruction")(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
== Instruction::SRem) && "Unexpected instruction") ?
static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4837, __PRETTY_FUNCTION__))
;
4838 Value *Divisor = I.getOperand(1);
4839 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4840 return !CInt || CInt->isZero();
4841}
4842
4843void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4844 VPUser &User,
4845 VPTransformState &State) {
4846 switch (I.getOpcode()) {
4847 case Instruction::Call:
4848 case Instruction::Br:
4849 case Instruction::PHI:
4850 case Instruction::GetElementPtr:
4851 case Instruction::Select:
4852 llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4852)
;
4853 case Instruction::UDiv:
4854 case Instruction::SDiv:
4855 case Instruction::SRem:
4856 case Instruction::URem:
4857 case Instruction::Add:
4858 case Instruction::FAdd:
4859 case Instruction::Sub:
4860 case Instruction::FSub:
4861 case Instruction::FNeg:
4862 case Instruction::Mul:
4863 case Instruction::FMul:
4864 case Instruction::FDiv:
4865 case Instruction::FRem:
4866 case Instruction::Shl:
4867 case Instruction::LShr:
4868 case Instruction::AShr:
4869 case Instruction::And:
4870 case Instruction::Or:
4871 case Instruction::Xor: {
4872 // Just widen unops and binops.
4873 setDebugLocFromInst(Builder, &I);
4874
4875 for (unsigned Part = 0; Part < UF; ++Part) {
4876 SmallVector<Value *, 2> Ops;
4877 for (VPValue *VPOp : User.operands())
4878 Ops.push_back(State.get(VPOp, Part));
4879
4880 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4881
4882 if (auto *VecOp = dyn_cast<Instruction>(V))
4883 VecOp->copyIRFlags(&I);
4884
4885 // Use this vector value for all users of the original instruction.
4886 State.set(Def, V, Part);
4887 addMetadata(V, &I);
4888 }
4889
4890 break;
4891 }
4892 case Instruction::ICmp:
4893 case Instruction::FCmp: {
4894 // Widen compares. Generate vector compares.
4895 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4896 auto *Cmp = cast<CmpInst>(&I);
4897 setDebugLocFromInst(Builder, Cmp);
4898 for (unsigned Part = 0; Part < UF; ++Part) {
4899 Value *A = State.get(User.getOperand(0), Part);
4900 Value *B = State.get(User.getOperand(1), Part);
4901 Value *C = nullptr;
4902 if (FCmp) {
4903 // Propagate fast math flags.
4904 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4905 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4906 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4907 } else {
4908 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4909 }
4910 State.set(Def, C, Part);
4911 addMetadata(C, &I);
4912 }
4913
4914 break;
4915 }
4916
4917 case Instruction::ZExt:
4918 case Instruction::SExt:
4919 case Instruction::FPToUI:
4920 case Instruction::FPToSI:
4921 case Instruction::FPExt:
4922 case Instruction::PtrToInt:
4923 case Instruction::IntToPtr:
4924 case Instruction::SIToFP:
4925 case Instruction::UIToFP:
4926 case Instruction::Trunc:
4927 case Instruction::FPTrunc:
4928 case Instruction::BitCast: {
4929 auto *CI = cast<CastInst>(&I);
4930 setDebugLocFromInst(Builder, CI);
4931
4932 /// Vectorize casts.
4933 Type *DestTy =
4934 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4935
4936 for (unsigned Part = 0; Part < UF; ++Part) {
4937 Value *A = State.get(User.getOperand(0), Part);
4938 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4939 State.set(Def, Cast, Part);
4940 addMetadata(Cast, &I);
4941 }
4942 break;
4943 }
4944 default:
4945 // This instruction is not vectorized by simple widening.
4946 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an unhandled instruction: "
<< I; } } while (false)
;
4947 llvm_unreachable("Unhandled instruction!")::llvm::llvm_unreachable_internal("Unhandled instruction!", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4947)
;
4948 } // end of switch.
4949}
4950
4951void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4952 VPUser &ArgOperands,
4953 VPTransformState &State) {
4954 assert(!isa<DbgInfoIntrinsic>(I) &&((!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"
) ? static_cast<void> (0) : __assert_fail ("!isa<DbgInfoIntrinsic>(I) && \"DbgInfoIntrinsic should have been dropped during VPlan construction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4955, __PRETTY_FUNCTION__))
4955 "DbgInfoIntrinsic should have been dropped during VPlan construction")((!isa<DbgInfoIntrinsic>(I) && "DbgInfoIntrinsic should have been dropped during VPlan construction"
) ? static_cast<void> (0) : __assert_fail ("!isa<DbgInfoIntrinsic>(I) && \"DbgInfoIntrinsic should have been dropped during VPlan construction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4955, __PRETTY_FUNCTION__))
;
4956 setDebugLocFromInst(Builder, &I);
4957
4958 Module *M = I.getParent()->getParent()->getParent();
4959 auto *CI = cast<CallInst>(&I);
4960
4961 SmallVector<Type *, 4> Tys;
4962 for (Value *ArgOperand : CI->arg_operands())
4963 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4964
4965 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4966
4967 // The flag shows whether we use Intrinsic or a usual Call for vectorized
4968 // version of the instruction.
4969 // Is it beneficial to perform intrinsic call compared to lib call?
4970 bool NeedToScalarize = false;
4971 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4972 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4973 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4974 assert((UseVectorIntrinsic || !NeedToScalarize) &&(((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."
) ? static_cast<void> (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4975, __PRETTY_FUNCTION__))
4975 "Instruction should be scalarized elsewhere.")(((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."
) ? static_cast<void> (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4975, __PRETTY_FUNCTION__))
;
4976 assert((IntrinsicCost.isValid() || CallCost.isValid()) &&(((IntrinsicCost.isValid() || CallCost.isValid()) && "Either the intrinsic cost or vector call cost must be valid"
) ? static_cast<void> (0) : __assert_fail ("(IntrinsicCost.isValid() || CallCost.isValid()) && \"Either the intrinsic cost or vector call cost must be valid\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4977, __PRETTY_FUNCTION__))
4977 "Either the intrinsic cost or vector call cost must be valid")(((IntrinsicCost.isValid() || CallCost.isValid()) && "Either the intrinsic cost or vector call cost must be valid"
) ? static_cast<void> (0) : __assert_fail ("(IntrinsicCost.isValid() || CallCost.isValid()) && \"Either the intrinsic cost or vector call cost must be valid\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4977, __PRETTY_FUNCTION__))
;
4978
4979 for (unsigned Part = 0; Part < UF; ++Part) {
4980 SmallVector<Value *, 4> Args;
4981 for (auto &I : enumerate(ArgOperands.operands())) {
4982 // Some intrinsics have a scalar argument - don't replace it with a
4983 // vector.
4984 Value *Arg;
4985 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4986 Arg = State.get(I.value(), Part);
4987 else
4988 Arg = State.get(I.value(), VPIteration(0, 0));
4989 Args.push_back(Arg);
4990 }
4991
4992 Function *VectorF;
4993 if (UseVectorIntrinsic) {
4994 // Use vector version of the intrinsic.
4995 Type *TysForDecl[] = {CI->getType()};
4996 if (VF.isVector())
4997 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4998 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4999 assert(VectorF && "Can't retrieve vector intrinsic.")((VectorF && "Can't retrieve vector intrinsic.") ? static_cast
<void> (0) : __assert_fail ("VectorF && \"Can't retrieve vector intrinsic.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4999, __PRETTY_FUNCTION__))
;
5000 } else {
5001 // Use vector version of the function call.
5002 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5003#ifndef NDEBUG
5004 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&((VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.") ? static_cast<void> (
0) : __assert_fail ("VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && \"Can't create vector function.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5005, __PRETTY_FUNCTION__))
5005 "Can't create vector function.")((VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.") ? static_cast<void> (
0) : __assert_fail ("VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && \"Can't create vector function.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5005, __PRETTY_FUNCTION__))
;
5006#endif
5007 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5008 }
5009 SmallVector<OperandBundleDef, 1> OpBundles;
5010 CI->getOperandBundlesAsDefs(OpBundles);
5011 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5012
5013 if (isa<FPMathOperator>(V))
5014 V->copyFastMathFlags(CI);
5015
5016 State.set(Def, V, Part);
5017 addMetadata(V, &I);
5018 }
5019}
5020
5021void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5022 VPUser &Operands,
5023 bool InvariantCond,
5024 VPTransformState &State) {
5025 setDebugLocFromInst(Builder, &I);
5026
5027 // The condition can be loop invariant but still defined inside the
5028 // loop. This means that we can't just use the original 'cond' value.
5029 // We have to take the 'vectorized' value and pick the first lane.
5030 // Instcombine will make this a no-op.
5031 auto *InvarCond = InvariantCond
5032 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5033 : nullptr;
5034
5035 for (unsigned Part = 0; Part < UF; ++Part) {
5036 Value *Cond =
5037 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5038 Value *Op0 = State.get(Operands.getOperand(1), Part);
5039 Value *Op1 = State.get(Operands.getOperand(2), Part);
5040 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5041 State.set(VPDef, Sel, Part);
5042 addMetadata(Sel, &I);
5043 }
5044}
5045
5046void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5047 // We should not collect Scalars more than once per VF. Right now, this
5048 // function is called from collectUniformsAndScalars(), which already does
5049 // this check. Collecting Scalars for VF=1 does not make any sense.
5050 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&((VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF")
? static_cast<void> (0) : __assert_fail ("VF.isVector() && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5051, __PRETTY_FUNCTION__))
5051 "This function should not be visited twice for the same VF")((VF.isVector() && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF")
? static_cast<void> (0) : __assert_fail ("VF.isVector() && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5051, __PRETTY_FUNCTION__))
;
5052
5053 SmallSetVector<Instruction *, 8> Worklist;
5054
5055 // These sets are used to seed the analysis with pointers used by memory
5056 // accesses that will remain scalar.
5057 SmallSetVector<Instruction *, 8> ScalarPtrs;
5058 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5059 auto *Latch = TheLoop->getLoopLatch();
5060
5061 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5062 // The pointer operands of loads and stores will be scalar as long as the
5063 // memory access is not a gather or scatter operation. The value operand of a
5064 // store will remain scalar if the store is scalarized.
5065 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5066 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5067 assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5068, __PRETTY_FUNCTION__))
5068 "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5068, __PRETTY_FUNCTION__))
;
5069 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5070 if (Ptr == Store->getValueOperand())
5071 return WideningDecision == CM_Scalarize;
5072 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&((Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"
) ? static_cast<void> (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5073, __PRETTY_FUNCTION__))
5073 "Ptr is neither a value or pointer operand")((Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"
) ? static_cast<void> (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5073, __PRETTY_FUNCTION__))
;
5074 return WideningDecision != CM_GatherScatter;
5075 };
5076
5077 // A helper that returns true if the given value is a bitcast or
5078 // getelementptr instruction contained in the loop.
5079 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5080 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5081 isa<GetElementPtrInst>(V)) &&
5082 !TheLoop->isLoopInvariant(V);
5083 };
5084
5085 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5086 if (!isa<PHINode>(Ptr) ||
5087 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5088 return false;
5089 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5090 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5091 return false;
5092 return isScalarUse(MemAccess, Ptr);
5093 };
5094
5095 // A helper that evaluates a memory access's use of a pointer. If the
5096 // pointer is actually the pointer induction of a loop, it is being
5097 // inserted into Worklist. If the use will be a scalar use, and the
5098 // pointer is only used by memory accesses, we place the pointer in
5099 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5100 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5101 if (isScalarPtrInduction(MemAccess, Ptr)) {
5102 Worklist.insert(cast<Instruction>(Ptr));
5103 Instruction *Update = cast<Instruction>(
5104 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5105 Worklist.insert(Update);
5106 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptrdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found new scalar instruction: "
<< *Ptr << "\n"; } } while (false)
5107 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found new scalar instruction: "
<< *Ptr << "\n"; } } while (false)
;
5108 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Updatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found new scalar instruction: "
<< *Update << "\n"; } } while (false)
5109 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found new scalar instruction: "
<< *Update << "\n"; } } while (false)
;
5110 return;
5111 }
5112 // We only care about bitcast and getelementptr instructions contained in
5113 // the loop.
5114 if (!isLoopVaryingBitCastOrGEP(Ptr))
5115 return;
5116
5117 // If the pointer has already been identified as scalar (e.g., if it was
5118 // also identified as uniform), there's nothing to do.
5119 auto *I = cast<Instruction>(Ptr);
5120 if (Worklist.count(I))
5121 return;
5122
5123 // If the use of the pointer will be a scalar use, and all users of the
5124 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5125 // place the pointer in PossibleNonScalarPtrs.
5126 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5127 return isa<LoadInst>(U) || isa<StoreInst>(U);
5128 }))
5129 ScalarPtrs.insert(I);
5130 else
5131 PossibleNonScalarPtrs.insert(I);
5132 };
5133
5134 // We seed the scalars analysis with three classes of instructions: (1)
5135 // instructions marked uniform-after-vectorization and (2) bitcast,
5136 // getelementptr and (pointer) phi instructions used by memory accesses
5137 // requiring a scalar use.
5138 //
5139 // (1) Add to the worklist all instructions that have been identified as
5140 // uniform-after-vectorization.
5141 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5142
5143 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5144 // memory accesses requiring a scalar use. The pointer operands of loads and
5145 // stores will be scalar as long as the memory accesses is not a gather or
5146 // scatter operation. The value operand of a store will remain scalar if the
5147 // store is scalarized.
5148 for (auto *BB : TheLoop->blocks())
5149 for (auto &I : *BB) {
5150 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5151 evaluatePtrUse(Load, Load->getPointerOperand());
5152 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5153 evaluatePtrUse(Store, Store->getPointerOperand());
5154 evaluatePtrUse(Store, Store->getValueOperand());
5155 }
5156 }
5157 for (auto *I : ScalarPtrs)
5158 if (!PossibleNonScalarPtrs.count(I)) {
5159 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *I << "\n"; } } while (false)
;
5160 Worklist.insert(I);
5161 }
5162
5163 // Insert the forced scalars.
5164 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5165 // induction variable when the PHI user is scalarized.
5166 auto ForcedScalar = ForcedScalars.find(VF);
5167 if (ForcedScalar != ForcedScalars.end())
5168 for (auto *I : ForcedScalar->second)
5169 Worklist.insert(I);
5170
5171 // Expand the worklist by looking through any bitcasts and getelementptr
5172 // instructions we've already identified as scalar. This is similar to the
5173 // expansion step in collectLoopUniforms(); however, here we're only
5174 // expanding to include additional bitcasts and getelementptr instructions.
5175 unsigned Idx = 0;
5176 while (Idx != Worklist.size()) {
5177 Instruction *Dst = Worklist[Idx++];
5178 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5179 continue;
5180 auto *Src = cast<Instruction>(Dst->getOperand(0));
5181 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5182 auto *J = cast<Instruction>(U);
5183 return !TheLoop->contains(J) || Worklist.count(J) ||
5184 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5185 isScalarUse(J, Src));
5186 })) {
5187 Worklist.insert(Src);
5188 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *Src << "\n"; } } while (false)
;
5189 }
5190 }
5191
5192 // An induction variable will remain scalar if all users of the induction
5193 // variable and induction variable update remain scalar.
5194 for (auto &Induction : Legal->getInductionVars()) {
5195 auto *Ind = Induction.first;
5196 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5197
5198 // If tail-folding is applied, the primary induction variable will be used
5199 // to feed a vector compare.
5200 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5201 continue;
5202
5203 // Determine if all users of the induction variable are scalar after
5204 // vectorization.
5205 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5206 auto *I = cast<Instruction>(U);
5207 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5208 });
5209 if (!ScalarInd)
5210 continue;
5211
5212 // Determine if all users of the induction variable update instruction are
5213 // scalar after vectorization.
5214 auto ScalarIndUpdate =
5215 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5216 auto *I = cast<Instruction>(U);
5217 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5218 });
5219 if (!ScalarIndUpdate)
5220 continue;
5221
5222 // The induction variable and its update instruction will remain scalar.
5223 Worklist.insert(Ind);
5224 Worklist.insert(IndUpdate);
5225 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *Ind << "\n"; } } while (false)
;
5226 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *IndUpdate << "\n"; } } while (false)
5227 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
<< *IndUpdate << "\n"; } } while (false)
;
5228 }
5229
5230 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5231}
5232
5233bool LoopVectorizationCostModel::isScalarWithPredication(
5234 Instruction *I, ElementCount VF) const {
5235 if (!blockNeedsPredication(I->getParent()))
5236 return false;
5237 switch(I->getOpcode()) {
5238 default:
5239 break;
5240 case Instruction::Load:
5241 case Instruction::Store: {
5242 if (!Legal->isMaskRequired(I))
5243 return false;
5244 auto *Ptr = getLoadStorePointerOperand(I);
5245 auto *Ty = getMemInstValueType(I);
5246 // We have already decided how to vectorize this instruction, get that
5247 // result.
5248 if (VF.isVector()) {
5249 InstWidening WideningDecision = getWideningDecision(I, VF);
5250 assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5251, __PRETTY_FUNCTION__))
5251 "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5251, __PRETTY_FUNCTION__))
;
5252 return WideningDecision == CM_Scalarize;
5253 }
5254 const Align Alignment = getLoadStoreAlignment(I);
5255 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5256 isLegalMaskedGather(Ty, Alignment))
5257 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5258 isLegalMaskedScatter(Ty, Alignment));
5259 }
5260 case Instruction::UDiv:
5261 case Instruction::SDiv:
5262 case Instruction::SRem:
5263 case Instruction::URem:
5264 return mayDivideByZero(*I);
5265 }
5266 return false;
5267}
5268
5269bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5270 Instruction *I, ElementCount VF) {
5271 assert(isAccessInterleaved(I) && "Expecting interleaved access.")((isAccessInterleaved(I) && "Expecting interleaved access."
) ? static_cast<void> (0) : __assert_fail ("isAccessInterleaved(I) && \"Expecting interleaved access.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5271, __PRETTY_FUNCTION__))
;
5272 assert(getWideningDecision(I, VF) == CM_Unknown &&((getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."
) ? static_cast<void> (0) : __assert_fail ("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5273, __PRETTY_FUNCTION__))
5273 "Decision should not be set yet.")((getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."
) ? static_cast<void> (0) : __assert_fail ("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5273, __PRETTY_FUNCTION__))
;
5274 auto *Group = getInterleavedAccessGroup(I);
5275 assert(Group && "Must have a group.")((Group && "Must have a group.") ? static_cast<void
> (0) : __assert_fail ("Group && \"Must have a group.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5275, __PRETTY_FUNCTION__))
;
5276
5277 // If the instruction's allocated size doesn't equal it's type size, it
5278 // requires padding and will be scalarized.
5279 auto &DL = I->getModule()->getDataLayout();
5280 auto *ScalarTy = getMemInstValueType(I);
5281 if (hasIrregularType(ScalarTy, DL))
5282 return false;
5283
5284 // Check if masking is required.
5285 // A Group may need masking for one of two reasons: it resides in a block that
5286 // needs predication, or it was decided to use masking to deal with gaps.
5287 bool PredicatedAccessRequiresMasking =
5288 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5289 bool AccessWithGapsRequiresMasking =
5290 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5291 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5292 return true;
5293
5294 // If masked interleaving is required, we expect that the user/target had
5295 // enabled it, because otherwise it either wouldn't have been created or
5296 // it should have been invalidated by the CostModel.
5297 assert(useMaskedInterleavedAccesses(TTI) &&((useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5298, __PRETTY_FUNCTION__))
5298 "Masked interleave-groups for predicated accesses are not enabled.")((useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5298, __PRETTY_FUNCTION__))
;
5299
5300 auto *Ty = getMemInstValueType(I);
5301 const Align Alignment = getLoadStoreAlignment(I);
5302 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5303 : TTI.isLegalMaskedStore(Ty, Alignment);
5304}
5305
5306bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5307 Instruction *I, ElementCount VF) {
5308 // Get and ensure we have a valid memory instruction.
5309 LoadInst *LI = dyn_cast<LoadInst>(I);
5310 StoreInst *SI = dyn_cast<StoreInst>(I);
5311 assert((LI || SI) && "Invalid memory instruction")(((LI || SI) && "Invalid memory instruction") ? static_cast
<void> (0) : __assert_fail ("(LI || SI) && \"Invalid memory instruction\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5311, __PRETTY_FUNCTION__))
;
5312
5313 auto *Ptr = getLoadStorePointerOperand(I);
5314
5315 // In order to be widened, the pointer should be consecutive, first of all.
5316 if (!Legal->isConsecutivePtr(Ptr))
5317 return false;
5318
5319 // If the instruction is a store located in a predicated block, it will be
5320 // scalarized.
5321 if (isScalarWithPredication(I))
5322 return false;
5323
5324 // If the instruction's allocated size doesn't equal it's type size, it
5325 // requires padding and will be scalarized.
5326 auto &DL = I->getModule()->getDataLayout();
5327 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5328 if (hasIrregularType(ScalarTy, DL))
5329 return false;
5330
5331 return true;
5332}
5333
5334void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5335 // We should not collect Uniforms more than once per VF. Right now,
5336 // this function is called from collectUniformsAndScalars(), which
5337 // already does this check. Collecting Uniforms for VF=1 does not make any
5338 // sense.
5339
5340 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&((VF.isVector() && Uniforms.find(VF) == Uniforms.end(
) && "This function should not be visited twice for the same VF"
) ? static_cast<void> (0) : __assert_fail ("VF.isVector() && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5341, __PRETTY_FUNCTION__))
5341 "This function should not be visited twice for the same VF")((VF.isVector() && Uniforms.find(VF) == Uniforms.end(
) && "This function should not be visited twice for the same VF"
) ? static_cast<void> (0) : __assert_fail ("VF.isVector() && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5341, __PRETTY_FUNCTION__))
;
5342
5343 // Visit the list of Uniforms. If we'll not find any uniform value, we'll
5344 // not analyze again. Uniforms.count(VF) will return 1.
5345 Uniforms[VF].clear();
5346
5347 // We now know that the loop is vectorizable!
5348 // Collect instructions inside the loop that will remain uniform after
5349 // vectorization.
5350
5351 // Global values, params and instructions outside of current loop are out of
5352 // scope.
5353 auto isOutOfScope = [&](Value *V) -> bool {
5354 Instruction *I = dyn_cast<Instruction>(V);
5355 return (!I || !TheLoop->contains(I));
5356 };
5357
5358 SetVector<Instruction *> Worklist;
5359 BasicBlock *Latch = TheLoop->getLoopLatch();
5360
5361 // Instructions that are scalar with predication must not be considered
5362 // uniform after vectorization, because that would create an erroneous
5363 // replicating region where only a single instance out of VF should be formed.
5364 // TODO: optimize such seldom cases if found important, see PR40816.
5365 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5366 if (isOutOfScope(I)) {
5367 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n"; } } while (false)
5368 << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n"; } } while (false)
;
5369 return;
5370 }
5371 if (isScalarWithPredication(I, VF)) {
5372 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n"; } } while (false)
5373 << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n"; } } while (false)
;
5374 return;
5375 }
5376 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
<< *I << "\n"; } } while (false)
;
5377 Worklist.insert(I);
5378 };
5379
5380 // Start with the conditional branch. If the branch condition is an
5381 // instruction contained in the loop that is only used by the branch, it is
5382 // uniform.
5383 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5384 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5385 addToWorklistIfAllowed(Cmp);
5386
5387 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5388 InstWidening WideningDecision = getWideningDecision(I, VF);
5389 assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5390, __PRETTY_FUNCTION__))
5390 "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5390, __PRETTY_FUNCTION__))
;
5391
5392 // A uniform memory op is itself uniform. We exclude uniform stores
5393 // here as they demand the last lane, not the first one.
5394 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5395 assert(WideningDecision == CM_Scalarize)((WideningDecision == CM_Scalarize) ? static_cast<void>
(0) : __assert_fail ("WideningDecision == CM_Scalarize", "/build/llvm-toolchain-snapshot-13~++20210405022414+5f57793c4fe4/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5395, __PRETTY_FUNCTION__))
;
5396 return true;
5397 }
5398
5399 return (WideningDecision == CM_Widen ||
5400 WideningDecision == CM_Widen_Reverse ||
5401 WideningDecision == CM_Interleave);
5402 };
5403
5404
5405 // Returns true if Ptr is the pointer operand of a memory access instruction
5406 // I, and I is known to not require scalarization.
5407 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5408 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5409 };
5410
5411 // Holds a list of values which are known to have at least one uniform use.
5412 // Note that there may be other uses which aren't uniform. A "uniform use"
5413 // here is something which only demands lane 0 of the unrolled iterations;
5414 // it does not imply that all lanes produce the same value (e.g. this is not
5415 // the usual meaning of uniform)
5416 SetVector<Value *> HasUniformUse;
5417
5418 // Scan the loop for instructions which are either a) known to have only
5419 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5420 for (auto *BB : TheLoop->blocks())
5421 for (auto &I : *BB) {
5422 // If there's no pointer operand, there's nothing to do.
5423 auto *Ptr = getLoadStorePointerOperand(&I);
5424 if (!Ptr)
5425 continue;
5426
5427 // A uniform memory op is itself uniform. We exclude uniform stores
5428 // here as they demand the last lane, not the first one.
5429 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5430 addToWorklistIfAllowed(&I);
5431
5432 if (isUniformDecision(&I, VF)) {
5433 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check")((isVectorizedMemAccessUse(&I, Ptr) && "consistency check"
) ? static_cast<void> (0) : __assert_fail ("isVec