Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8682, column 5
Called C++ object pointer is null
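
The "Called C++ object pointer is null" warning is emitted when the analyzer finds an execution path on which a member function is invoked through an object pointer that is null at the call site. Below is a minimal, self-contained sketch of the general pattern (hypothetical types and names for illustration only, not the code at LoopVectorize.cpp:8682):

    struct Widget {
      void draw() const {}
    };

    void render(Widget *W, bool HaveWidget) {
      Widget *Active = nullptr;
      if (HaveWidget)
        Active = W;     // Active stays null when HaveWidget is false.
      Active->draw();   // Member call through a possibly-null object pointer;
                        // this is the situation the checker reports.
    }

Guarding the call (for example, if (Active) Active->draw();) or otherwise establishing the non-null invariant before the call removes the report.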

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-04-14-063029-18377-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/MemorySSA.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/LLVMContext.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/PatternMatch.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/InitializePasses.h"
127#include "llvm/Pass.h"
128#include "llvm/Support/Casting.h"
129#include "llvm/Support/CommandLine.h"
130#include "llvm/Support/Compiler.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/InjectTLIMappings.h"
138#include "llvm/Transforms/Utils/LoopSimplify.h"
139#include "llvm/Transforms/Utils/LoopUtils.h"
140#include "llvm/Transforms/Utils/LoopVersioning.h"
141#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142#include "llvm/Transforms/Utils/SizeOpts.h"
143#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144#include <algorithm>
145#include <cassert>
146#include <cstdint>
147#include <cstdlib>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
178static cl::opt<bool> EnableEpilogueVectorization(
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
182static cl::opt<unsigned> EpilogueVectorizationForceVF(
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
188static cl::opt<unsigned> EpilogueVectorizationMinVF(
189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
195static cl::opt<unsigned> TinyTripCountVectorThreshold(
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
201static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks with a "
204 "vectorize(enable) pragma."));
205
206// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207// that predication is preferred, and this lists all options. I.e., the
208// vectorizer will try to fold the tail-loop (epilogue) into the vector body
209// and predicate the instructions accordingly. If tail-folding fails, there are
210// different fallback strategies depending on these values:
211namespace PreferPredicateTy {
212 enum Option {
213 ScalarEpilogue = 0,
214 PredicateElseScalarEpilogue,
215 PredicateOrDontVectorize
216 };
217} // namespace PreferPredicateTy
218
219static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220 "prefer-predicate-over-epilogue",
221 cl::init(PreferPredicateTy::ScalarEpilogue),
222 cl::Hidden,
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
237static cl::opt<bool> MaximizeBandwidth(
238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239 cl::desc("Maximize bandwidth when selecting vectorization factor which "
240 "will be determined by the smallest type in loop."));
241
242static cl::opt<bool> EnableInterleavedMemAccesses(
243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245
246/// An interleave-group may need masking if it resides in a block that needs
247/// predication, or in order to mask away gaps.
248static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251
252static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254 cl::desc("We don't interleave loops with a estimated constant trip count "
255 "below this number"));
256
257static cl::opt<unsigned> ForceTargetNumScalarRegs(
258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259 cl::desc("A flag that overrides the target's number of scalar registers."));
260
261static cl::opt<unsigned> ForceTargetNumVectorRegs(
262 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263 cl::desc("A flag that overrides the target's number of vector registers."));
264
265static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's max interleave factor for "
268 "scalar loops."));
269
270static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272 cl::desc("A flag that overrides the target's max interleave factor for "
273 "vectorized loops."));
274
275static cl::opt<unsigned> ForceTargetInstructionCost(
276 "force-target-instruction-cost", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's expected cost for "
278 "an instruction to a single constant value. Mostly "
279 "useful for getting consistent testing."));
280
281static cl::opt<bool> ForceTargetSupportsScalableVectors(
282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283 cl::desc(
284 "Pretend that scalable vectors are supported, even if the target does "
285 "not support them. This flag should only be used for testing."));
286
287static cl::opt<unsigned> SmallLoopCost(
288 "small-loop-cost", cl::init(20), cl::Hidden,
289 cl::desc(
290 "The cost of a loop that is considered 'small' by the interleaver."));
291
292static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294 cl::desc("Enable the use of the block frequency analysis to access PGO "
295 "heuristics minimizing code growth in cold regions and being more "
296 "aggressive in hot regions."));
297
298// Runtime interleave loops for load/store throughput.
299static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301 cl::desc(
302 "Enable runtime interleaving until load/store ports are saturated"));
303
304/// Interleave small loops with scalar reductions.
305static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307 cl::desc("Enable interleaving for loops with small iteration counts that "
308 "contain scalar reductions to expose ILP."));
309
310/// The number of stores in a loop that are allowed to need predication.
311static cl::opt<unsigned> NumberOfStoresToPredicate(
312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313 cl::desc("Max number of stores to be predicated behind an if."));
314
315static cl::opt<bool> EnableIndVarRegisterHeur(
316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317 cl::desc("Count the induction variable only once when interleaving"));
318
319static cl::opt<bool> EnableCondStoresVectorization(
320 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321 cl::desc("Enable if predication of stores during vectorization."));
322
323static cl::opt<unsigned> MaxNestedScalarReductionIC(
324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325 cl::desc("The maximum interleave count to use when interleaving a scalar "
326 "reduction in a nested loop."));
327
328static cl::opt<bool>
329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330 cl::Hidden,
331 cl::desc("Prefer in-loop vector reductions, "
332 "overriding the targets preference."));
333
334cl::opt<bool> EnableStrictReductions(
335 "enable-strict-reductions", cl::init(false), cl::Hidden,
336 cl::desc("Enable the vectorisation of loops with in-order (strict) "
337 "FP reductions"));
338
339static cl::opt<bool> PreferPredicatedReductionSelect(
340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341 cl::desc(
342 "Prefer predicating a reduction operation over an after loop select."));
343
344cl::opt<bool> EnableVPlanNativePath(
345 "enable-vplan-native-path", cl::init(false), cl::Hidden,
346 cl::desc("Enable VPlan-native vectorization path with "
347 "support for outer loop vectorization."));
348
349// FIXME: Remove this switch once we have divergence analysis. Currently we
350// assume divergent non-backedge branches when this switch is true.
351cl::opt<bool> EnableVPlanPredication(
352 "enable-vplan-predication", cl::init(false), cl::Hidden,
353 cl::desc("Enable VPlan-native vectorization path predicator with "
354 "support for outer loop vectorization."));
355
356// This flag enables the stress testing of the VPlan H-CFG construction in the
357// VPlan-native vectorization path. It must be used in conjunction with
358// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359// verification of the H-CFGs built.
360static cl::opt<bool> VPlanBuildStressTest(
361 "vplan-build-stress-test", cl::init(false), cl::Hidden,
362 cl::desc(
363 "Build VPlan for every supported loop nest in the function and bail "
364 "out right after the build (stress test the VPlan H-CFG construction "
365 "in the VPlan-native vectorization path)."));
366
367cl::opt<bool> llvm::EnableLoopInterleaving(
368 "interleave-loops", cl::init(true), cl::Hidden,
369 cl::desc("Enable loop interleaving in Loop vectorization passes"));
370cl::opt<bool> llvm::EnableLoopVectorization(
371 "vectorize-loops", cl::init(true), cl::Hidden,
372 cl::desc("Run the Loop vectorization passes"));
373
374cl::opt<bool> PrintVPlansInDotFormat(
375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376 cl::desc("Use dot format instead of plain text when dumping VPlans"));
377
378/// A helper function that returns the type of loaded or stored value.
379static Type *getMemInstValueType(Value *I) {
380 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
381 "Expected Load or Store instruction");
382 if (auto *LI = dyn_cast<LoadInst>(I))
383 return LI->getType();
384 return cast<StoreInst>(I)->getValueOperand()->getType();
385}
386
387/// A helper function that returns true if the given type is irregular. The
388/// type is irregular if its allocated size doesn't equal the store size of an
389/// element of the corresponding vector type.
390static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
391 // Determine if an array of N elements of type Ty is "bitcast compatible"
392 // with a <N x Ty> vector.
393 // This is only true if there is no padding between the array elements.
394 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
395}
396
397/// A helper function that returns the reciprocal of the block probability of
398/// predicated blocks. If we return X, we are assuming the predicated block
399/// will execute once for every X iterations of the loop header.
400///
401/// TODO: We should use actual block probability here, if available. Currently,
402/// we always assume predicated blocks have a 50% chance of executing.
403static unsigned getReciprocalPredBlockProb() { return 2; }
404
405/// A helper function that returns an integer or floating-point constant with
406/// value C.
407static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409 : ConstantFP::get(Ty, C);
410}
411
412/// Returns "best known" trip count for the specified loop \p L as defined by
413/// the following procedure:
414/// 1) Returns exact trip count if it is known.
415/// 2) Returns expected trip count according to profile data if any.
416/// 3) Returns upper bound estimate if it is known.
417/// 4) Returns None if all of the above failed.
418static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419 // Check if exact trip count is known.
420 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421 return ExpectedTC;
422
423 // Check if there is an expected trip count available from profile data.
424 if (LoopVectorizeWithBlockFrequency)
425 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426 return EstimatedTC;
427
428 // Check if upper bound estimate is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430 return ExpectedTC;
431
432 return None;
433}
434
435// Forward declare GeneratedRTChecks.
436class GeneratedRTChecks;
437
438namespace llvm {
439
440/// InnerLoopVectorizer vectorizes loops which contain only one basic
441/// block to a specified vectorization factor (VF).
442/// This class performs the widening of scalars into vectors, or multiple
443/// scalars. This class also implements the following features:
444/// * It inserts an epilogue loop for handling loops that don't have iteration
445/// counts that are known to be a multiple of the vectorization factor.
446/// * It handles the code generation for reduction variables.
447/// * Scalarization (implementation using scalars) of un-vectorizable
448/// instructions.
449/// InnerLoopVectorizer does not perform any vectorization-legality
450/// checks, and relies on the caller to check for the different legality
451/// aspects. The InnerLoopVectorizer relies on the
452/// LoopVectorizationLegality class to provide information about the induction
453/// and reduction variables that were found to a given vectorization factor.
454class InnerLoopVectorizer {
455public:
456 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
457 LoopInfo *LI, DominatorTree *DT,
458 const TargetLibraryInfo *TLI,
459 const TargetTransformInfo *TTI, AssumptionCache *AC,
460 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
461 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
462 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
463 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
464 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
465 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
466 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
467 PSI(PSI), RTChecks(RTChecks) {
468 // Query this against the original loop and save it here because the profile
469 // of the original loop header may change as the transformation happens.
470 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
471 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
472 }
473
474 virtual ~InnerLoopVectorizer() = default;
475
476 /// Create a new empty loop that will contain vectorized instructions later
477 /// on, while the old loop will be used as the scalar remainder. Control flow
478 /// is generated around the vectorized (and scalar epilogue) loops consisting
479 /// of various checks and bypasses. Return the pre-header block of the new
480 /// loop.
481 /// In the case of epilogue vectorization, this function is overridden to
482 /// handle the more complex control flow around the loops.
483 virtual BasicBlock *createVectorizedLoopSkeleton();
484
485 /// Widen a single instruction within the innermost loop.
486 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
487 VPTransformState &State);
488
489 /// Widen a single call instruction within the innermost loop.
490 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
491 VPTransformState &State);
492
493 /// Widen a single select instruction within the innermost loop.
494 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
495 bool InvariantCond, VPTransformState &State);
496
497 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
498 void fixVectorizedLoop(VPTransformState &State);
499
500 // Return true if any runtime check is added.
501 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
502
503 /// A type for vectorized values in the new loop. Each value from the
504 /// original loop, when vectorized, is represented by UF vector values in the
505 /// new unrolled loop, where UF is the unroll factor.
506 using VectorParts = SmallVector<Value *, 2>;
507
508 /// Vectorize a single GetElementPtrInst based on information gathered and
509 /// decisions taken during planning.
510 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
511 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
512 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
513
514 /// Vectorize a single PHINode in a block. This method handles the induction
515 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
516 /// arbitrary length vectors.
517 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
518 VPWidenPHIRecipe *PhiR, VPTransformState &State);
519
520 /// A helper function to scalarize a single Instruction in the innermost loop.
521 /// Generates a sequence of scalar instances for each lane between \p MinLane
522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
523 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
524 /// Instr's operands.
525 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
526 const VPIteration &Instance, bool IfPredicateInstr,
527 VPTransformState &State);
528
529 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
530 /// is provided, the integer induction variable will first be truncated to
531 /// the corresponding type.
532 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
533 VPValue *Def, VPValue *CastDef,
534 VPTransformState &State);
535
536 /// Construct the vector value of a scalarized value \p V one lane at a time.
537 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
538 VPTransformState &State);
539
540 /// Try to vectorize interleaved access group \p Group with the base address
541 /// given in \p Addr, optionally masking the vector operations if \p
542 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
543 /// values in the vectorized loop.
544 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
545 ArrayRef<VPValue *> VPDefs,
546 VPTransformState &State, VPValue *Addr,
547 ArrayRef<VPValue *> StoredValues,
548 VPValue *BlockInMask = nullptr);
549
550 /// Vectorize Load and Store instructions with the base address given in \p
551 /// Addr, optionally masking the vector operations if \p BlockInMask is
552 /// non-null. Use \p State to translate given VPValues to IR values in the
553 /// vectorized loop.
554 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
555 VPValue *Def, VPValue *Addr,
556 VPValue *StoredValue, VPValue *BlockInMask);
557
558 /// Set the debug location in the builder using the debug location in
559 /// the instruction.
560 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
561
562 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
563 void fixNonInductionPHIs(VPTransformState &State);
564
565 /// Create a broadcast instruction. This method generates a broadcast
566 /// instruction (shuffle) for loop invariant values and for the induction
567 /// value. If this is the induction variable then we extend it to N, N+1, ...
568 /// this is needed because each iteration in the loop corresponds to a SIMD
569 /// element.
570 virtual Value *getBroadcastInstrs(Value *V);
571
572protected:
573 friend class LoopVectorizationPlanner;
574
575 /// A small list of PHINodes.
576 using PhiVector = SmallVector<PHINode *, 4>;
577
578 /// A type for scalarized values in the new loop. Each value from the
579 /// original loop, when scalarized, is represented by UF x VF scalar values
580 /// in the new unrolled loop, where UF is the unroll factor and VF is the
581 /// vectorization factor.
582 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
583
584 /// Set up the values of the IVs correctly when exiting the vector loop.
585 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
586 Value *CountRoundDown, Value *EndValue,
587 BasicBlock *MiddleBlock);
588
589 /// Create a new induction variable inside L.
590 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
591 Value *Step, Instruction *DL);
592
593 /// Handle all cross-iteration phis in the header.
594 void fixCrossIterationPHIs(VPTransformState &State);
595
596 /// Fix a first-order recurrence. This is the second phase of vectorizing
597 /// this phi node.
598 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
599
600 /// Fix a reduction cross-iteration phi. This is the second phase of
601 /// vectorizing this phi node.
602 void fixReduction(PHINode *Phi, VPTransformState &State);
603
604 /// Clear NSW/NUW flags from reduction instructions if necessary.
605 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
606 VPTransformState &State);
607
608 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
609 /// means we need to add the appropriate incoming value from the middle
610 /// block as exiting edges from the scalar epilogue loop (if present) are
611 /// already in place, and we exit the vector loop exclusively to the middle
612 /// block.
613 void fixLCSSAPHIs(VPTransformState &State);
614
615 /// Iteratively sink the scalarized operands of a predicated instruction into
616 /// the block that was created for it.
617 void sinkScalarOperands(Instruction *PredInst);
618
619 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
620 /// represented as.
621 void truncateToMinimalBitwidths(VPTransformState &State);
622
623 /// This function adds
624 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
625 /// to each vector element of Val. The sequence starts at StartIndex.
626 /// \p Opcode is relevant for FP induction variable.
627 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
628 Instruction::BinaryOps Opcode =
629 Instruction::BinaryOpsEnd);
630
631 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
632 /// variable on which to base the steps, \p Step is the size of the step, and
633 /// \p EntryVal is the value from the original loop that maps to the steps.
634 /// Note that \p EntryVal doesn't have to be an induction variable - it
635 /// can also be a truncate instruction.
636 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
637 const InductionDescriptor &ID, VPValue *Def,
638 VPValue *CastDef, VPTransformState &State);
639
640 /// Create a vector induction phi node based on an existing scalar one. \p
641 /// EntryVal is the value from the original loop that maps to the vector phi
642 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
643 /// truncate instruction, instead of widening the original IV, we widen a
644 /// version of the IV truncated to \p EntryVal's type.
645 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
646 Value *Step, Value *Start,
647 Instruction *EntryVal, VPValue *Def,
648 VPValue *CastDef,
649 VPTransformState &State);
650
651 /// Returns true if an instruction \p I should be scalarized instead of
652 /// vectorized for the chosen vectorization factor.
653 bool shouldScalarizeInstruction(Instruction *I) const;
654
655 /// Returns true if we should generate a scalar version of \p IV.
656 bool needsScalarInduction(Instruction *IV) const;
657
658 /// If there is a cast involved in the induction variable \p ID, which should
659 /// be ignored in the vectorized loop body, this function records the
660 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
661 /// cast. We had already proved that the casted Phi is equal to the uncasted
662 /// Phi in the vectorized loop (under a runtime guard), and therefore
663 /// there is no need to vectorize the cast - the same value can be used in the
664 /// vector loop for both the Phi and the cast.
665 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
666 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
667 ///
668 /// \p EntryVal is the value from the original loop that maps to the vector
669 /// phi node and is used to distinguish what is the IV currently being
670 /// processed - original one (if \p EntryVal is a phi corresponding to the
671 /// original IV) or the "newly-created" one based on the proof mentioned above
672 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
673 /// latter case \p EntryVal is a TruncInst and we must not record anything for
674 /// that IV, but it's error-prone to expect callers of this routine to care
675 /// about that, hence this explicit parameter.
676 void recordVectorLoopValueForInductionCast(
677 const InductionDescriptor &ID, const Instruction *EntryVal,
678 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
679 unsigned Part, unsigned Lane = UINT_MAX);
680
681 /// Generate a shuffle sequence that will reverse the vector Vec.
682 virtual Value *reverseVector(Value *Vec);
683
684 /// Returns (and creates if needed) the original loop trip count.
685 Value *getOrCreateTripCount(Loop *NewLoop);
686
687 /// Returns (and creates if needed) the trip count of the widened loop.
688 Value *getOrCreateVectorTripCount(Loop *NewLoop);
689
690 /// Returns a bitcasted value to the requested vector type.
691 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
692 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
693 const DataLayout &DL);
694
695 /// Emit a bypass check to see if the vector trip count is zero, including if
696 /// it overflows.
697 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
698
699 /// Emit a bypass check to see if all of the SCEV assumptions we've
700 /// had to make are correct. Returns the block containing the checks or
701 /// nullptr if no checks have been added.
702 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
703
704 /// Emit bypass checks to check any memory assumptions we may have made.
705 /// Returns the block containing the checks or nullptr if no checks have been
706 /// added.
707 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
708
709 /// Compute the transformed value of Index at offset StartValue using step
710 /// StepValue.
711 /// For integer induction, returns StartValue + Index * StepValue.
712 /// For pointer induction, returns StartValue[Index * StepValue].
713 /// FIXME: The newly created binary instructions should contain nsw/nuw
714 /// flags, which can be found from the original scalar operations.
715 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
716 const DataLayout &DL,
717 const InductionDescriptor &ID) const;
718
719 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
720 /// vector loop preheader, middle block and scalar preheader. Also
721 /// allocate a loop object for the new vector loop and return it.
722 Loop *createVectorLoopSkeleton(StringRef Prefix);
723
724 /// Create new phi nodes for the induction variables to resume iteration count
725 /// in the scalar epilogue, from where the vectorized loop left off (given by
726 /// \p VectorTripCount).
727 /// In cases where the loop skeleton is more complicated (eg. epilogue
728 /// vectorization) and the resume values can come from an additional bypass
729 /// block, the \p AdditionalBypass pair provides information about the bypass
730 /// block and the end value on the edge from bypass to this loop.
731 void createInductionResumeValues(
732 Loop *L, Value *VectorTripCount,
733 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
734
735 /// Complete the loop skeleton by adding debug MDs, creating appropriate
736 /// conditional branches in the middle block, preparing the builder and
737 /// running the verifier. Take in the vector loop \p L as argument, and return
738 /// the preheader of the completed vector loop.
739 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
740
741 /// Add additional metadata to \p To that was not present on \p Orig.
742 ///
743 /// Currently this is used to add the noalias annotations based on the
744 /// inserted memchecks. Use this for instructions that are *cloned* into the
745 /// vector loop.
746 void addNewMetadata(Instruction *To, const Instruction *Orig);
747
748 /// Add metadata from one instruction to another.
749 ///
750 /// This includes both the original MDs from \p From and additional ones (\see
751 /// addNewMetadata). Use this for *newly created* instructions in the vector
752 /// loop.
753 void addMetadata(Instruction *To, Instruction *From);
754
755 /// Similar to the previous function but it adds the metadata to a
756 /// vector of instructions.
757 void addMetadata(ArrayRef<Value *> To, Instruction *From);
758
759 /// Allow subclasses to override and print debug traces before/after vplan
760 /// execution, when trace information is requested.
761 virtual void printDebugTracesAtStart(){};
762 virtual void printDebugTracesAtEnd(){};
763
764 /// The original loop.
765 Loop *OrigLoop;
766
767 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
768 /// dynamic knowledge to simplify SCEV expressions and converts them to a
769 /// more usable form.
770 PredicatedScalarEvolution &PSE;
771
772 /// Loop Info.
773 LoopInfo *LI;
774
775 /// Dominator Tree.
776 DominatorTree *DT;
777
778 /// Alias Analysis.
779 AAResults *AA;
780
781 /// Target Library Info.
782 const TargetLibraryInfo *TLI;
783
784 /// Target Transform Info.
785 const TargetTransformInfo *TTI;
786
787 /// Assumption Cache.
788 AssumptionCache *AC;
789
790 /// Interface to emit optimization remarks.
791 OptimizationRemarkEmitter *ORE;
792
793 /// LoopVersioning. It's only set up (non-null) if memchecks were
794 /// used.
795 ///
796 /// This is currently only used to add no-alias metadata based on the
797 /// memchecks. The actual versioning is performed manually.
798 std::unique_ptr<LoopVersioning> LVer;
799
800 /// The vectorization SIMD factor to use. Each vector will have this many
801 /// vector elements.
802 ElementCount VF;
803
804 /// The vectorization unroll factor to use. Each scalar is vectorized to this
805 /// many different vector instructions.
806 unsigned UF;
807
808 /// The builder that we use
809 IRBuilder<> Builder;
810
811 // --- Vectorization state ---
812
813 /// The vector-loop preheader.
814 BasicBlock *LoopVectorPreHeader;
815
816 /// The scalar-loop preheader.
817 BasicBlock *LoopScalarPreHeader;
818
819 /// Middle Block between the vector and the scalar.
820 BasicBlock *LoopMiddleBlock;
821
822 /// The (unique) ExitBlock of the scalar loop. Note that
823 /// there can be multiple exiting edges reaching this block.
824 BasicBlock *LoopExitBlock;
825
826 /// The vector loop body.
827 BasicBlock *LoopVectorBody;
828
829 /// The scalar loop body.
830 BasicBlock *LoopScalarBody;
831
832 /// A list of all bypass blocks. The first block is the entry of the loop.
833 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
834
835 /// The new Induction variable which was added to the new block.
836 PHINode *Induction = nullptr;
837
838 /// The induction variable of the old basic block.
839 PHINode *OldInduction = nullptr;
840
841 /// Store instructions that were predicated.
842 SmallVector<Instruction *, 4> PredicatedInstructions;
843
844 /// Trip count of the original loop.
845 Value *TripCount = nullptr;
846
847 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
848 Value *VectorTripCount = nullptr;
849
850 /// The legality analysis.
851 LoopVectorizationLegality *Legal;
852
853 /// The profitability analysis.
854 LoopVectorizationCostModel *Cost;
855
856 // Record whether runtime checks are added.
857 bool AddedSafetyChecks = false;
858
859 // Holds the end values for each induction variable. We save the end values
860 // so we can later fix-up the external users of the induction variables.
861 DenseMap<PHINode *, Value *> IVEndValues;
862
863 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
864 // fixed up at the end of vector code generation.
865 SmallVector<PHINode *, 8> OrigPHIsToFix;
866
867 /// BFI and PSI are used to check for profile guided size optimizations.
868 BlockFrequencyInfo *BFI;
869 ProfileSummaryInfo *PSI;
870
871 // Whether this loop should be optimized for size based on profile guided size
872 // optimizations.
873 bool OptForSizeBasedOnProfile;
874
875 /// Structure to hold information about generated runtime checks, responsible
876 /// for cleaning the checks, if vectorization turns out unprofitable.
877 GeneratedRTChecks &RTChecks;
878};
879
880class InnerLoopUnroller : public InnerLoopVectorizer {
881public:
882 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
883 LoopInfo *LI, DominatorTree *DT,
884 const TargetLibraryInfo *TLI,
885 const TargetTransformInfo *TTI, AssumptionCache *AC,
886 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
887 LoopVectorizationLegality *LVL,
888 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
889 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
890 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
892 BFI, PSI, Check) {}
893
894private:
895 Value *getBroadcastInstrs(Value *V) override;
896 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
897 Instruction::BinaryOps Opcode =
898 Instruction::BinaryOpsEnd) override;
899 Value *reverseVector(Value *Vec) override;
900};
901
902/// Encapsulate information regarding vectorization of a loop and its epilogue.
903/// This information is meant to be updated and used across two stages of
904/// epilogue vectorization.
905struct EpilogueLoopVectorizationInfo {
906 ElementCount MainLoopVF = ElementCount::getFixed(0);
907 unsigned MainLoopUF = 0;
908 ElementCount EpilogueVF = ElementCount::getFixed(0);
909 unsigned EpilogueUF = 0;
910 BasicBlock *MainLoopIterationCountCheck = nullptr;
911 BasicBlock *EpilogueIterationCountCheck = nullptr;
912 BasicBlock *SCEVSafetyCheck = nullptr;
913 BasicBlock *MemSafetyCheck = nullptr;
914 Value *TripCount = nullptr;
915 Value *VectorTripCount = nullptr;
916
917 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
918 unsigned EUF)
919 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
920 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
921 assert(EUF == 1 &&
922 "A high UF for the epilogue loop is likely not beneficial.");
923 }
924};
925
926/// An extension of the inner loop vectorizer that creates a skeleton for a
927/// vectorized loop that has its epilogue (residual) also vectorized.
928/// The idea is to run the vplan on a given loop twice, first to set up the
929/// skeleton and vectorize the main loop, and second to complete the skeleton
930/// from the first step and vectorize the epilogue. This is achieved by
931/// deriving two concrete strategy classes from this base class and invoking
932/// them in succession from the loop vectorizer planner.
933class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
934public:
935 InnerLoopAndEpilogueVectorizer(
936 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
937 DominatorTree *DT, const TargetLibraryInfo *TLI,
938 const TargetTransformInfo *TTI, AssumptionCache *AC,
939 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
940 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
941 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
942 GeneratedRTChecks &Checks)
943 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
944 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
945 Checks),
946 EPI(EPI) {}
947
948 // Override this function to handle the more complex control flow around the
949 // three loops.
950 BasicBlock *createVectorizedLoopSkeleton() final override {
951 return createEpilogueVectorizedLoopSkeleton();
952 }
953
954 /// The interface for creating a vectorized skeleton using one of two
955 /// different strategies, each corresponding to one execution of the vplan
956 /// as described above.
957 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
958
959 /// Holds and updates state information required to vectorize the main loop
960 /// and its epilogue in two separate passes. This setup helps us avoid
961 /// regenerating and recomputing runtime safety checks. It also helps us to
962 /// shorten the iteration-count-check path length for the cases where the
963 /// iteration count of the loop is so small that the main vector loop is
964 /// completely skipped.
965 EpilogueLoopVectorizationInfo &EPI;
966};
967
968/// A specialized derived class of inner loop vectorizer that performs
969/// vectorization of *main* loops in the process of vectorizing loops and their
970/// epilogues.
971class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
972public:
973 EpilogueVectorizerMainLoop(
974 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
975 DominatorTree *DT, const TargetLibraryInfo *TLI,
976 const TargetTransformInfo *TTI, AssumptionCache *AC,
977 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
978 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
979 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
980 GeneratedRTChecks &Check)
981 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
982 EPI, LVL, CM, BFI, PSI, Check) {}
983 /// Implements the interface for creating a vectorized skeleton using the
984 /// *main loop* strategy (ie the first pass of vplan execution).
985 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
986
987protected:
988 /// Emits an iteration count bypass check once for the main loop (when \p
989 /// ForEpilogue is false) and once for the epilogue loop (when \p
990 /// ForEpilogue is true).
991 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
992 bool ForEpilogue);
993 void printDebugTracesAtStart() override;
994 void printDebugTracesAtEnd() override;
995};
996
997// A specialized derived class of inner loop vectorizer that performs
998// vectorization of *epilogue* loops in the process of vectorizing loops and
999// their epilogues.
1000class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1001public:
1002 EpilogueVectorizerEpilogueLoop(
1003 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1004 DominatorTree *DT, const TargetLibraryInfo *TLI,
1005 const TargetTransformInfo *TTI, AssumptionCache *AC,
1006 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1007 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1008 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1009 GeneratedRTChecks &Checks)
1010 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1011 EPI, LVL, CM, BFI, PSI, Checks) {}
1012 /// Implements the interface for creating a vectorized skeleton using the
1013 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1014 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1015
1016protected:
1017 /// Emits an iteration count bypass check after the main vector loop has
1018 /// finished to see if there are any iterations left to execute by either
1019 /// the vector epilogue or the scalar epilogue.
1020 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1021 BasicBlock *Bypass,
1022 BasicBlock *Insert);
1023 void printDebugTracesAtStart() override;
1024 void printDebugTracesAtEnd() override;
1025};
1026} // end namespace llvm
1027
1028/// Look for a meaningful debug location on the instruction or its
1029/// operands.
1030static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1031 if (!I)
1032 return I;
1033
1034 DebugLoc Empty;
1035 if (I->getDebugLoc() != Empty)
1036 return I;
1037
1038 for (Use &Op : I->operands()) {
1039 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1040 if (OpInst->getDebugLoc() != Empty)
1041 return OpInst;
1042 }
1043
1044 return I;
1045}
1046
1047void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1048 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1049 const DILocation *DIL = Inst->getDebugLoc();
1050 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1051 !isa<DbgInfoIntrinsic>(Inst)) {
1052 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1053 auto NewDIL =
1054 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1055 if (NewDIL)
1056 B.SetCurrentDebugLocation(NewDIL.getValue());
1057 else
1058 LLVM_DEBUG(dbgs()
1059 << "Failed to create new discriminator: "
1060 << DIL->getFilename() << " Line: " << DIL->getLine());
1061 }
1062 else
1063 B.SetCurrentDebugLocation(DIL);
1064 } else
1065 B.SetCurrentDebugLocation(DebugLoc());
1066}
1067
1068/// Write a record \p DebugMsg about vectorization failure to the debug
1069/// output stream. If \p I is passed, it is an instruction that prevents
1070/// vectorization.
1071#ifndef NDEBUG
1072static void debugVectorizationFailure(const StringRef DebugMsg,
1073 Instruction *I) {
1074 dbgs() << "LV: Not vectorizing: " << DebugMsg;
1075 if (I != nullptr)
1076 dbgs() << " " << *I;
1077 else
1078 dbgs() << '.';
1079 dbgs() << '\n';
1080}
1081#endif
1082
1083/// Create an analysis remark that explains why vectorization failed
1084///
1085/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1086/// RemarkName is the identifier for the remark. If \p I is passed it is an
1087/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1088/// the location of the remark. \return the remark object that can be
1089/// streamed to.
1090static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1091 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1092 Value *CodeRegion = TheLoop->getHeader();
1093 DebugLoc DL = TheLoop->getStartLoc();
1094
1095 if (I) {
1096 CodeRegion = I->getParent();
1097 // If there is no debug location attached to the instruction, fall back to
1098 // using the loop's.
1099 if (I->getDebugLoc())
1100 DL = I->getDebugLoc();
1101 }
1102
1103 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1104 R << "loop not vectorized: ";
1105 return R;
1106}
1107
1108/// Return a value for Step multiplied by VF.
1109static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1110 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1111 Constant *StepVal = ConstantInt::get(
1112 Step->getType(),
1113 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1114 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1115}
1116
1117namespace llvm {
1118
1119/// Return the runtime value for VF.
1120Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1121 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1122 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1123}
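// A minimal sketch of how the two helpers above behave (illustrative only,
// not part of the original file), assuming an IRBuilder `B` already
// positioned at the desired insertion point; the element counts are examples.
// \code
//   // Fixed VF <4 x ...>: both helpers fold to plain constants.
//   Value *FixedStep =
//       createStepForVF(B, B.getInt64(1), ElementCount::getFixed(4)); // i64 4
//   // Scalable VF <vscale x 4 x ...>: the value is materialized at runtime
//   // as vscale * 4 via IRBuilder::CreateVScale.
//   Value *ScalableEC =
//       getRuntimeVF(B, B.getInt64Ty(), ElementCount::getScalable(4));
// \endcode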
1124
1125void reportVectorizationFailure(const StringRef DebugMsg,
1126 const StringRef OREMsg, const StringRef ORETag,
1127 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1128 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1129 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1130 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1131 ORETag, TheLoop, I) << OREMsg);
1132}
1133
1134} // end namespace llvm
1135
1136#ifndef NDEBUG
1137/// \return string containing a file name and a line # for the given loop.
1138static std::string getDebugLocString(const Loop *L) {
1139 std::string Result;
1140 if (L) {
1141 raw_string_ostream OS(Result);
1142 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1143 LoopDbgLoc.print(OS);
1144 else
1145 // Just print the module name.
1146 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1147 OS.flush();
1148 }
1149 return Result;
1150}
1151#endif
1152
1153void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1154 const Instruction *Orig) {
1155 // If the loop was versioned with memchecks, add the corresponding no-alias
1156 // metadata.
1157 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1158 LVer->annotateInstWithNoAlias(To, Orig);
1159}
1160
1161void InnerLoopVectorizer::addMetadata(Instruction *To,
1162 Instruction *From) {
1163 propagateMetadata(To, From);
1164 addNewMetadata(To, From);
1165}
1166
1167void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1168 Instruction *From) {
1169 for (Value *V : To) {
1170 if (Instruction *I = dyn_cast<Instruction>(V))
1171 addMetadata(I, From);
1172 }
1173}
1174
1175namespace llvm {
1176
1177// Loop vectorization cost-model hints how the scalar epilogue loop should be
1178// lowered.
1179enum ScalarEpilogueLowering {
1180
1181 // The default: allowing scalar epilogues.
1182 CM_ScalarEpilogueAllowed,
1183
1184 // Vectorization with OptForSize: don't allow epilogues.
1185 CM_ScalarEpilogueNotAllowedOptSize,
1186
1187 // A special case of vectorization with OptForSize: loops with a very small
1188 // trip count are considered for vectorization under OptForSize, thereby
1189 // making sure the cost of their loop body is dominant, free of runtime
1190 // guards and scalar iteration overheads.
1191 CM_ScalarEpilogueNotAllowedLowTripLoop,
1192
1193 // Loop hint predicate indicating an epilogue is undesired.
1194 CM_ScalarEpilogueNotNeededUsePredicate,
1195
1196 // Directive indicating we must either tail fold or not vectorize
1197 CM_ScalarEpilogueNotAllowedUsePredicate
1198};
1199
1200/// LoopVectorizationCostModel - estimates the expected speedups due to
1201/// vectorization.
1202/// In many cases vectorization is not profitable. This can happen because of
1203/// a number of reasons. In this class we mainly attempt to predict the
1204/// expected speedup/slowdowns due to the supported instruction set. We use the
1205/// TargetTransformInfo to query the different backends for the cost of
1206/// different operations.
1207class LoopVectorizationCostModel {
1208public:
1209 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1210 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1211 LoopVectorizationLegality *Legal,
1212 const TargetTransformInfo &TTI,
1213 const TargetLibraryInfo *TLI, DemandedBits *DB,
1214 AssumptionCache *AC,
1215 OptimizationRemarkEmitter *ORE, const Function *F,
1216 const LoopVectorizeHints *Hints,
1217 InterleavedAccessInfo &IAI)
1218 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1219 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1220 Hints(Hints), InterleaveInfo(IAI) {}
1221
1222 /// \return An upper bound for the vectorization factor, or None if
1223 /// vectorization and interleaving should be avoided up front.
1224 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1225
1226 /// \return True if runtime checks are required for vectorization, and false
1227 /// otherwise.
1228 bool runtimeChecksRequired();
1229
1230 /// \return The most profitable vectorization factor and the cost of that VF.
1231 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1232 /// then this vectorization factor will be selected if vectorization is
1233 /// possible.
1234 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1235 VectorizationFactor
1236 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1237 const LoopVectorizationPlanner &LVP);
1238
1239 /// Setup cost-based decisions for user vectorization factor.
1240 void selectUserVectorizationFactor(ElementCount UserVF) {
1241 collectUniformsAndScalars(UserVF);
1242 collectInstsToScalarize(UserVF);
1243 }
1244
1245 /// \return The size (in bits) of the smallest and widest types in the code
1246 /// that needs to be vectorized. We ignore values that remain scalar such as
1247 /// 64 bit loop indices.
1248 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1249
1250 /// \return The desired interleave count.
1251 /// If interleave count has been specified by metadata it will be returned.
1252 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1253 /// are the selected vectorization factor and the cost of the selected VF.
1254 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1255
1256 /// Memory access instruction may be vectorized in more than one way.
1257 /// Form of instruction after vectorization depends on cost.
1258 /// This function takes cost-based decisions for Load/Store instructions
1259 /// and collects them in a map. This decisions map is used for building
1260 /// the lists of loop-uniform and loop-scalar instructions.
1261 /// The calculated cost is saved with widening decision in order to
1262 /// avoid redundant calculations.
1263 void setCostBasedWideningDecision(ElementCount VF);
1264
1265 /// A struct that represents some properties of the register usage
1266 /// of a loop.
1267 struct RegisterUsage {
1268 /// Holds the number of loop invariant values that are used in the loop.
1269 /// The key is ClassID of target-provided register class.
1270 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1271 /// Holds the maximum number of concurrent live intervals in the loop.
1272 /// The key is ClassID of target-provided register class.
1273 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1274 };
1275
1276 /// \return Returns information about the register usages of the loop for the
1277 /// given vectorization factors.
1278 SmallVector<RegisterUsage, 8>
1279 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1280
1281 /// Collect values we want to ignore in the cost model.
1282 void collectValuesToIgnore();
1283
1284 /// Split reductions into those that happen in the loop, and those that happen
1285 /// outside. In-loop reductions are collected into InLoopReductionChains.
1286 void collectInLoopReductions();
1287
1288 /// \returns The smallest bitwidth each instruction can be represented with.
1289 /// The vector equivalents of these instructions should be truncated to this
1290 /// type.
1291 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1292 return MinBWs;
1293 }
1294
1295 /// \returns True if it is more profitable to scalarize instruction \p I for
1296 /// vectorization factor \p VF.
1297 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1298 assert(VF.isVector() &&
1299 "Profitable to scalarize relevant only for VF > 1.");
1300
1301 // Cost model is not run in the VPlan-native path - return conservative
1302 // result until this changes.
1303 if (EnableVPlanNativePath)
1304 return false;
1305
1306 auto Scalars = InstsToScalarize.find(VF);
1307 assert(Scalars != InstsToScalarize.end() &&
1308 "VF not yet analyzed for scalarization profitability");
1309 return Scalars->second.find(I) != Scalars->second.end();
1310 }
1311
1312 /// Returns true if \p I is known to be uniform after vectorization.
1313 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1314 if (VF.isScalar())
1315 return true;
1316
1317 // Cost model is not run in the VPlan-native path - return conservative
1318 // result until this changes.
1319 if (EnableVPlanNativePath)
1320 return false;
1321
1322 auto UniformsPerVF = Uniforms.find(VF);
1323 assert(UniformsPerVF != Uniforms.end() &&
1324 "VF not yet analyzed for uniformity");
1325 return UniformsPerVF->second.count(I);
1326 }
1327
1328 /// Returns true if \p I is known to be scalar after vectorization.
1329 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1330 if (VF.isScalar())
1331 return true;
1332
1333 // Cost model is not run in the VPlan-native path - return conservative
1334 // result until this changes.
1335 if (EnableVPlanNativePath)
1336 return false;
1337
1338 auto ScalarsPerVF = Scalars.find(VF);
1339 assert(ScalarsPerVF != Scalars.end() &&
1340 "Scalar values are not calculated for VF");
1341 return ScalarsPerVF->second.count(I);
1342 }
1343
1344 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1345 /// for vectorization factor \p VF.
1346 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1347 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1348 !isProfitableToScalarize(I, VF) &&
1349 !isScalarAfterVectorization(I, VF);
1350 }
1351
1352 /// Decision that was taken during cost calculation for memory instruction.
1353 enum InstWidening {
1354 CM_Unknown,
1355 CM_Widen, // For consecutive accesses with stride +1.
1356 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1357 CM_Interleave,
1358 CM_GatherScatter,
1359 CM_Scalarize
1360 };
1361
1362 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1363 /// instruction \p I and vector width \p VF.
1364 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1365 InstructionCost Cost) {
1366 assert(VF.isVector() && "Expected VF >=2");
1367 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1368 }
1369
1370 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1371 /// interleaving group \p Grp and vector width \p VF.
1372 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1373 ElementCount VF, InstWidening W,
1374 InstructionCost Cost) {
1375 assert(VF.isVector() && "Expected VF >=2");
1376 /// Broadcast this decision to all instructions inside the group.
1377 /// But the cost will be assigned to one instruction only.
1378 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1379 if (auto *I = Grp->getMember(i)) {
1380 if (Grp->getInsertPos() == I)
1381 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1382 else
1383 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1384 }
1385 }
1386 }
1387
1388 /// Return the cost model decision for the given instruction \p I and vector
1389 /// width \p VF. Return CM_Unknown if this instruction did not pass
1390 /// through the cost modeling.
1391 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1392 assert(VF.isVector() && "Expected VF to be a vector VF");
1393 // Cost model is not run in the VPlan-native path - return conservative
1394 // result until this changes.
1395 if (EnableVPlanNativePath)
1396 return CM_GatherScatter;
1397
1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1399 auto Itr = WideningDecisions.find(InstOnVF);
1400 if (Itr == WideningDecisions.end())
1401 return CM_Unknown;
1402 return Itr->second.first;
1403 }
1404
1405 /// Return the vectorization cost for the given instruction \p I and vector
1406 /// width \p VF.
1407 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1408 assert(VF.isVector() && "Expected VF >=2");
1409 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1411 "The cost is not calculated");
1412 return WideningDecisions[InstOnVF].second;
1413 }
1414
1415 /// Return True if instruction \p I is an optimizable truncate whose operand
1416 /// is an induction variable. Such a truncate will be removed by adding a new
1417 /// induction variable with the destination type.
1418 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1419 // If the instruction is not a truncate, return false.
1420 auto *Trunc = dyn_cast<TruncInst>(I);
1421 if (!Trunc)
1422 return false;
1423
1424 // Get the source and destination types of the truncate.
1425 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1426 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1427
1428 // If the truncate is free for the given types, return false. Replacing a
1429 // free truncate with an induction variable would add an induction variable
1430 // update instruction to each iteration of the loop. We exclude from this
1431 // check the primary induction variable since it will need an update
1432 // instruction regardless.
1433 Value *Op = Trunc->getOperand(0);
1434 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1435 return false;
1436
1437 // If the truncated value is not an induction variable, return false.
1438 return Legal->isInductionPhi(Op);
1439 }
1440
1441 /// Collects the instructions to scalarize for each predicated instruction in
1442 /// the loop.
1443 void collectInstsToScalarize(ElementCount VF);
1444
1445 /// Collect Uniform and Scalar values for the given \p VF.
1446 /// The sets depend on CM decision for Load/Store instructions
1447 /// that may be vectorized as interleave, gather-scatter or scalarized.
1448 void collectUniformsAndScalars(ElementCount VF) {
1449 // Do the analysis once.
1450 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1451 return;
1452 setCostBasedWideningDecision(VF);
1453 collectLoopUniforms(VF);
1454 collectLoopScalars(VF);
1455 }
1456
1457 /// Returns true if the target machine supports masked store operation
1458 /// for the given \p DataType and kind of access to \p Ptr.
1459 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1460 return Legal->isConsecutivePtr(Ptr) &&
1461 TTI.isLegalMaskedStore(DataType, Alignment);
1462 }
1463
1464 /// Returns true if the target machine supports masked load operation
1465 /// for the given \p DataType and kind of access to \p Ptr.
1466 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1467 return Legal->isConsecutivePtr(Ptr) &&
1468 TTI.isLegalMaskedLoad(DataType, Alignment);
1469 }
1470
1471 /// Returns true if the target machine supports masked scatter operation
1472 /// for the given \p DataType.
1473 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
1474 return TTI.isLegalMaskedScatter(DataType, Alignment);
1475 }
1476
1477 /// Returns true if the target machine supports masked gather operation
1478 /// for the given \p DataType.
1479 bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
1480 return TTI.isLegalMaskedGather(DataType, Alignment);
1481 }
1482
1483 /// Returns true if the target machine can represent \p V as a masked gather
1484 /// or scatter operation.
1485 bool isLegalGatherOrScatter(Value *V) {
1486 bool LI = isa<LoadInst>(V);
1487 bool SI = isa<StoreInst>(V);
1488 if (!LI && !SI)
1489 return false;
1490 auto *Ty = getMemInstValueType(V);
1491 Align Align = getLoadStoreAlignment(V);
1492 return (LI && isLegalMaskedGather(Ty, Align)) ||
1493 (SI && isLegalMaskedScatter(Ty, Align));
1494 }
1495
1496 /// Returns true if the target machine supports all of the reduction
1497 /// variables found for the given VF.
1498 bool canVectorizeReductions(ElementCount VF) {
1499 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1500 RecurrenceDescriptor RdxDesc = Reduction.second;
1501 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1502 }));
1503 }
1504
1505 /// Returns true if \p I is an instruction that will be scalarized with
1506 /// predication. Such instructions include conditional stores and
1507 /// instructions that may divide by zero.
1508 /// If a non-zero VF has been calculated, we check if I will be scalarized
1509 /// with predication for that VF.
1510 bool
1511 isScalarWithPredication(Instruction *I,
1512 ElementCount VF = ElementCount::getFixed(1)) const;
1513
1514 // Returns true if \p I is an instruction that will be predicated either
1515 // through scalar predication or masked load/store or masked gather/scatter.
1516 // Superset of instructions that return true for isScalarWithPredication.
1517 bool isPredicatedInst(Instruction *I) {
1518 if (!blockNeedsPredication(I->getParent()))
1519 return false;
1520 // Loads and stores that need some form of masked operation are predicated
1521 // instructions.
1522 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1523 return Legal->isMaskRequired(I);
1524 return isScalarWithPredication(I);
1525 }
1526
1527 /// Returns true if \p I is a memory instruction with consecutive memory
1528 /// access that can be widened.
1529 bool
1530 memoryInstructionCanBeWidened(Instruction *I,
1531 ElementCount VF = ElementCount::getFixed(1));
1532
1533 /// Returns true if \p I is a memory instruction in an interleaved-group
1534 /// of memory accesses that can be vectorized with wide vector loads/stores
1535 /// and shuffles.
1536 bool
1537 interleavedAccessCanBeWidened(Instruction *I,
1538 ElementCount VF = ElementCount::getFixed(1));
1539
1540 /// Check if \p Instr belongs to any interleaved access group.
1541 bool isAccessInterleaved(Instruction *Instr) {
1542 return InterleaveInfo.isInterleaved(Instr);
1543 }
1544
1545 /// Get the interleaved access group that \p Instr belongs to.
1546 const InterleaveGroup<Instruction> *
1547 getInterleavedAccessGroup(Instruction *Instr) {
1548 return InterleaveInfo.getInterleaveGroup(Instr);
1549 }
1550
1551 /// Returns true if we're required to use a scalar epilogue for at least
1552 /// the final iteration of the original loop.
1553 bool requiresScalarEpilogue() const {
1554 if (!isScalarEpilogueAllowed())
1555 return false;
1556 // If we might exit from anywhere but the latch, must run the exiting
1557 // iteration in scalar form.
1558 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1559 return true;
1560 return InterleaveInfo.requiresScalarEpilogue();
1561 }
1562
1563 /// Returns true if a scalar epilogue is allowed, i.e. not disabled due to
1564 /// optsize or a loop hint annotation.
1565 bool isScalarEpilogueAllowed() const {
1566 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1567 }
1568
1569 /// Returns true if all loop blocks should be masked to fold the tail loop.
1570 bool foldTailByMasking() const { return FoldTailByMasking; }
1571
1572 bool blockNeedsPredication(BasicBlock *BB) const {
1573 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1574 }
1575
1576 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1577 /// nodes to the chain of instructions representing the reductions. Uses a
1578 /// MapVector to ensure deterministic iteration order.
1579 using ReductionChainMap =
1580 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1581
1582 /// Return the chain of instructions representing an inloop reduction.
1583 const ReductionChainMap &getInLoopReductionChains() const {
1584 return InLoopReductionChains;
1585 }
1586
1587 /// Returns true if the Phi is part of an inloop reduction.
1588 bool isInLoopReduction(PHINode *Phi) const {
1589 return InLoopReductionChains.count(Phi);
1590 }
1591
1592 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1593 /// with factor VF. Return the cost of the instruction, including
1594 /// scalarization overhead if it's needed.
1595 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1596
1597 /// Estimate cost of a call instruction CI if it were vectorized with factor
1598 /// VF. Return the cost of the instruction, including scalarization overhead
1599 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1600 /// scalarized -
1601 /// i.e. either a vector version isn't available, or it is too expensive.
1602 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1603 bool &NeedToScalarize) const;
1604
1605 /// Invalidates decisions already taken by the cost model.
1606 void invalidateCostModelingDecisions() {
1607 WideningDecisions.clear();
1608 Uniforms.clear();
1609 Scalars.clear();
1610 }
1611
1612private:
1613 unsigned NumPredStores = 0;
1614
1615 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1616 /// than zero. One is returned if vectorization should best be avoided due
1617 /// to cost.
1618 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1619 ElementCount UserVF);
1620
1621 /// The vectorization cost is a combination of the cost itself and a boolean
1622 /// indicating whether any of the contributing operations will actually
1623 /// operate on
1624 /// vector values after type legalization in the backend. If this latter value
1625 /// is
1626 /// false, then all operations will be scalarized (i.e. no vectorization has
1627 /// actually taken place).
1628 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1629
1630 /// Returns the expected execution cost. The unit of the cost does
1631 /// not matter because we use the 'cost' units to compare different
1632 /// vector widths. The cost that is returned is *not* normalized by
1633 /// the factor width.
1634 VectorizationCostTy expectedCost(ElementCount VF);
1635
1636 /// Returns the execution time cost of an instruction for a given vector
1637 /// width. Vector width of one means scalar.
1638 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1639
1640 /// The cost-computation logic from getInstructionCost which provides
1641 /// the vector type as an output parameter.
1642 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1643 Type *&VectorTy);
1644
1645 /// Return the cost of instructions in an inloop reduction pattern, if I is
1646 /// part of that pattern.
1647 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1648 Type *VectorTy,
1649 TTI::TargetCostKind CostKind);
1650
1651 /// Calculate vectorization cost of memory instruction \p I.
1652 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1653
1654 /// The cost computation for scalarized memory instruction.
1655 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1656
1657 /// The cost computation for interleaving group of memory instructions.
1658 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1659
1660 /// The cost computation for Gather/Scatter instruction.
1661 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1662
1663 /// The cost computation for widening instruction \p I with consecutive
1664 /// memory access.
1665 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1668 /// Load: scalar load + broadcast.
1669 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1670 /// element)
1671 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1672
1673 /// Estimate the overhead of scalarizing an instruction. This is a
1674 /// convenience wrapper for the type-based getScalarizationOverhead API.
1675 InstructionCost getScalarizationOverhead(Instruction *I,
1676 ElementCount VF) const;
1677
1678 /// Returns whether the instruction is a load or store and will be emitted
1679 /// as a vector operation.
1680 bool isConsecutiveLoadOrStore(Instruction *I);
1681
1682 /// Returns true if an artificially high cost for emulated masked memrefs
1683 /// should be used.
1684 bool useEmulatedMaskMemRefHack(Instruction *I);
1685
1686 /// Map of scalar integer values to the smallest bitwidth they can be legally
1687 /// represented as. The vector equivalents of these values should be truncated
1688 /// to this type.
1689 MapVector<Instruction *, uint64_t> MinBWs;
1690
1691 /// A type representing the costs for instructions if they were to be
1692 /// scalarized rather than vectorized. The entries are Instruction-Cost
1693 /// pairs.
1694 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1695
1696 /// A set containing all BasicBlocks that are known to be present after
1697 /// vectorization as predicated blocks.
1698 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1699
1700 /// Records whether it is allowed to have the original scalar loop execute at
1701 /// least once. This may be needed as a fallback loop in case runtime
1702 /// aliasing/dependence checks fail, or to handle the tail/remainder
1703 /// iterations when the trip count is unknown or doesn't divide by the VF,
1704 /// or as a peel-loop to handle gaps in interleave-groups.
1705 /// Under optsize and when the trip count is very small we don't allow any
1706 /// iterations to execute in the scalar loop.
1707 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1708
1709 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1710 bool FoldTailByMasking = false;
1711
1712 /// A map holding scalar costs for different vectorization factors. The
1713 /// presence of a cost for an instruction in the mapping indicates that the
1714 /// instruction will be scalarized when vectorizing with the associated
1715 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1716 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1717
1718 /// Holds the instructions known to be uniform after vectorization.
1719 /// The data is collected per VF.
1720 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1721
1722 /// Holds the instructions known to be scalar after vectorization.
1723 /// The data is collected per VF.
1724 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1725
1726 /// Holds the instructions (address computations) that are forced to be
1727 /// scalarized.
1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1729
1730 /// PHINodes of the reductions that should be expanded in-loop along with
1731 /// their associated chains of reduction operations, in program order from top
1732 /// (PHI) to bottom
1733 ReductionChainMap InLoopReductionChains;
1734
1735 /// A Map of inloop reduction operations and their immediate chain operand.
1736 /// FIXME: This can be removed once reductions can be costed correctly in
1737 /// vplan. This was added to allow quick lookup to the inloop operations,
1738 /// without having to loop through InLoopReductionChains.
1739 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1740
1741 /// Returns the expected difference in cost from scalarizing the expression
1742 /// feeding a predicated instruction \p PredInst. The instructions to
1743 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1744 /// non-negative return value implies the expression will be scalarized.
1745 /// Currently, only single-use chains are considered for scalarization.
1746 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1747 ElementCount VF);
1748
1749 /// Collect the instructions that are uniform after vectorization. An
1750 /// instruction is uniform if we represent it with a single scalar value in
1751 /// the vectorized loop corresponding to each vector iteration. Examples of
1752 /// uniform instructions include pointer operands of consecutive or
1753 /// interleaved memory accesses. Note that although uniformity implies an
1754 /// instruction will be scalar, the reverse is not true. In general, a
1755 /// scalarized instruction will be represented by VF scalar values in the
1756 /// vectorized loop, each corresponding to an iteration of the original
1757 /// scalar loop.
1758 void collectLoopUniforms(ElementCount VF);
1759
1760 /// Collect the instructions that are scalar after vectorization. An
1761 /// instruction is scalar if it is known to be uniform or will be scalarized
1762 /// during vectorization. Non-uniform scalarized instructions will be
1763 /// represented by VF values in the vectorized loop, each corresponding to an
1764 /// iteration of the original scalar loop.
1765 void collectLoopScalars(ElementCount VF);
1766
1767 /// Keeps cost model vectorization decision and cost for instructions.
1768 /// Right now it is used for memory instructions only.
1769 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1770 std::pair<InstWidening, InstructionCost>>;
1771
1772 DecisionList WideningDecisions;
1773
1774 /// Returns true if \p V is expected to be vectorized and it needs to be
1775 /// extracted.
1776 bool needsExtract(Value *V, ElementCount VF) const {
1777 Instruction *I = dyn_cast<Instruction>(V);
1778 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1779 TheLoop->isLoopInvariant(I))
1780 return false;
1781
1782 // Assume we can vectorize V (and hence we need extraction) if the
1783 // scalars are not computed yet. This can happen, because it is called
1784 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1785 // the scalars are collected. That should be a safe assumption in most
1786 // cases, because we check if the operands have vectorizable types
1787 // beforehand in LoopVectorizationLegality.
1788 return Scalars.find(VF) == Scalars.end() ||
1789 !isScalarAfterVectorization(I, VF);
1790 };
1791
1792 /// Returns a range containing only operands needing to be extracted.
1793 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1794 ElementCount VF) const {
1795 return SmallVector<Value *, 4>(make_filter_range(
1796 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1797 }
1798
1799 /// Determines if we have the infrastructure to vectorize loop \p L and its
1800 /// epilogue, assuming the main loop is vectorized by \p VF.
1801 bool isCandidateForEpilogueVectorization(const Loop &L,
1802 const ElementCount VF) const;
1803
1804 /// Returns true if epilogue vectorization is considered profitable, and
1805 /// false otherwise.
1806 /// \p VF is the vectorization factor chosen for the original loop.
1807 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1808
1809public:
1810 /// The loop that we evaluate.
1811 Loop *TheLoop;
1812
1813 /// Predicated scalar evolution analysis.
1814 PredicatedScalarEvolution &PSE;
1815
1816 /// Loop Info analysis.
1817 LoopInfo *LI;
1818
1819 /// Vectorization legality.
1820 LoopVectorizationLegality *Legal;
1821
1822 /// Vector target information.
1823 const TargetTransformInfo &TTI;
1824
1825 /// Target Library Info.
1826 const TargetLibraryInfo *TLI;
1827
1828 /// Demanded bits analysis.
1829 DemandedBits *DB;
1830
1831 /// Assumption cache.
1832 AssumptionCache *AC;
1833
1834 /// Interface to emit optimization remarks.
1835 OptimizationRemarkEmitter *ORE;
1836
1837 const Function *TheFunction;
1838
1839 /// Loop Vectorize Hint.
1840 const LoopVectorizeHints *Hints;
1841
1842 /// The interleave access information contains groups of interleaved accesses
1843 /// with the same stride and close to each other.
1844 InterleavedAccessInfo &InterleaveInfo;
1845
1846 /// Values to ignore in the cost model.
1847 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1848
1849 /// Values to ignore in the cost model when VF > 1.
1850 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1851
1852 /// Profitable vector factors.
1853 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1854};
1855} // end namespace llvm
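// A rough sketch of the typical query sequence against an already-constructed
// LoopVectorizationCostModel `CM` (hypothetical caller, not code from this
// file); `UserVF`, `UserIC`, and `LoopCost` stand in for values the real
// driver derives from loop hints and from the selected factor's cost.
// \code
//   if (Optional<ElementCount> MaxVF = CM.computeMaxVF(UserVF, UserIC)) {
//     CM.collectUniformsAndScalars(*MaxVF);           // cache per-VF analyses
//     VectorizationFactor VF = CM.selectVectorizationFactor(*MaxVF);
//     unsigned IC = CM.selectInterleaveCount(VF.Width, LoopCost);
//   }
// \endcode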
1856
1857/// Helper struct to manage generating runtime checks for vectorization.
1858///
1859/// The runtime checks are created up-front in temporary blocks, un-linked from
1860/// the existing IR, to allow better estimation of their cost. After deciding to
1861/// vectorize, the checks are moved back. If deciding not to vectorize, the
1862/// temporary blocks are completely removed.
1863class GeneratedRTChecks {
1864 /// Basic block which contains the generated SCEV checks, if any.
1865 BasicBlock *SCEVCheckBlock = nullptr;
1866
1867 /// The value representing the result of the generated SCEV checks. If it is
1868 /// nullptr, either no SCEV checks have been generated or they have been used.
1869 Value *SCEVCheckCond = nullptr;
1870
1871 /// Basic block which contains the generated memory runtime checks, if any.
1872 BasicBlock *MemCheckBlock = nullptr;
1873
1874 /// The value representing the result of the generated memory runtime checks.
1875 /// If it is nullptr, either no memory runtime checks have been generated or
1876 /// they have been used.
1877 Instruction *MemRuntimeCheckCond = nullptr;
1878
1879 DominatorTree *DT;
1880 LoopInfo *LI;
1881
1882 SCEVExpander SCEVExp;
1883 SCEVExpander MemCheckExp;
1884
1885public:
1886 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1887 const DataLayout &DL)
1888 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1889 MemCheckExp(SE, DL, "scev.check") {}
1890
1891 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1892 /// accurately estimate the cost of the runtime checks. The blocks are
1893 /// un-linked from the IR and added back during vector code generation. If
1894 /// there is no vector code generation, the check blocks are removed
1895 /// completely.
1896 void Create(Loop *L, const LoopAccessInfo &LAI,
1897 const SCEVUnionPredicate &UnionPred) {
1898
1899 BasicBlock *LoopHeader = L->getHeader();
1900 BasicBlock *Preheader = L->getLoopPreheader();
1901
1902 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1903 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1904 // may be used by SCEVExpander. The blocks will be un-linked from their
1905 // predecessors and removed from LI & DT at the end of the function.
1906 if (!UnionPred.isAlwaysTrue()) {
1907 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1908 nullptr, "vector.scevcheck");
1909
1910 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1911 &UnionPred, SCEVCheckBlock->getTerminator());
1912 }
1913
1914 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1915 if (RtPtrChecking.Need) {
1916 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1917 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1918 "vector.memcheck");
1919
1920 std::tie(std::ignore, MemRuntimeCheckCond) =
1921 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1922 RtPtrChecking.getChecks(), MemCheckExp);
1923 assert(MemRuntimeCheckCond &&
1924 "no RT checks generated although RtPtrChecking "
1925 "claimed checks are required");
1926 }
1927
1928 if (!MemCheckBlock && !SCEVCheckBlock)
1929 return;
1930
1931 // Unhook the temporary block with the checks, update various places
1932 // accordingly.
1933 if (SCEVCheckBlock)
1934 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1935 if (MemCheckBlock)
1936 MemCheckBlock->replaceAllUsesWith(Preheader);
1937
1938 if (SCEVCheckBlock) {
1939 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1940 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1941 Preheader->getTerminator()->eraseFromParent();
1942 }
1943 if (MemCheckBlock) {
1944 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1945 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1946 Preheader->getTerminator()->eraseFromParent();
1947 }
1948
1949 DT->changeImmediateDominator(LoopHeader, Preheader);
1950 if (MemCheckBlock) {
1951 DT->eraseNode(MemCheckBlock);
1952 LI->removeBlock(MemCheckBlock);
1953 }
1954 if (SCEVCheckBlock) {
1955 DT->eraseNode(SCEVCheckBlock);
1956 LI->removeBlock(SCEVCheckBlock);
1957 }
1958 }
1959
1960 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1961 /// unused.
1962 ~GeneratedRTChecks() {
1963 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1964 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1965 if (!SCEVCheckCond)
1966 SCEVCleaner.markResultUsed();
1967
1968 if (!MemRuntimeCheckCond)
1969 MemCheckCleaner.markResultUsed();
1970
1971 if (MemRuntimeCheckCond) {
1972 auto &SE = *MemCheckExp.getSE();
1973 // Memory runtime check generation creates compares that use expanded
1974 // values. Remove them before running the SCEVExpanderCleaners.
1975 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1976 if (MemCheckExp.isInsertedInstruction(&I))
1977 continue;
1978 SE.forgetValue(&I);
1979 SE.eraseValueFromMap(&I);
1980 I.eraseFromParent();
1981 }
1982 }
1983 MemCheckCleaner.cleanup();
1984 SCEVCleaner.cleanup();
1985
1986 if (SCEVCheckCond)
1987 SCEVCheckBlock->eraseFromParent();
1988 if (MemRuntimeCheckCond)
1989 MemCheckBlock->eraseFromParent();
1990 }
1991
1992 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
1993 /// adjusts the branches to branch to the vector preheader or \p Bypass,
1994 /// depending on the generated condition.
1995 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
1996 BasicBlock *LoopVectorPreHeader,
1997 BasicBlock *LoopExitBlock) {
1998 if (!SCEVCheckCond)
1999 return nullptr;
2000 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2001 if (C->isZero())
2002 return nullptr;
2003
2004 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2005
2006 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2007 // Create new preheader for vector loop.
2008 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2009 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2010
2011 SCEVCheckBlock->getTerminator()->eraseFromParent();
2012 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2013 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2014 SCEVCheckBlock);
2015
2016 DT->addNewBlock(SCEVCheckBlock, Pred);
2017 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2018
2019 ReplaceInstWithInst(
2020 SCEVCheckBlock->getTerminator(),
2021 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2022 // Mark the check as used, to prevent it from being removed during cleanup.
2023 SCEVCheckCond = nullptr;
2024 return SCEVCheckBlock;
2025 }
2026
2027 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2028 /// the branches to branch to the vector preheader or \p Bypass, depending on
2029 /// the generated condition.
2030 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2031 BasicBlock *LoopVectorPreHeader) {
2032 // Check if we generated code that checks in runtime if arrays overlap.
2033 if (!MemRuntimeCheckCond)
2034 return nullptr;
2035
2036 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2037 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2038 MemCheckBlock);
2039
2040 DT->addNewBlock(MemCheckBlock, Pred);
2041 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2042 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2043
2044 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2045 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2046
2047 ReplaceInstWithInst(
2048 MemCheckBlock->getTerminator(),
2049 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2050 MemCheckBlock->getTerminator()->setDebugLoc(
2051 Pred->getTerminator()->getDebugLoc());
2052
2053 // Mark the check as used, to prevent it from being removed during cleanup.
2054 MemRuntimeCheckCond = nullptr;
2055 return MemCheckBlock;
2056 }
2057};
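// A sketch of the intended GeneratedRTChecks lifecycle (hypothetical caller,
// not code from this file); `SE`, `DT`, `LI`, `DL`, `LAI`, `PSE`, `L`, and the
// blocks passed to the emit calls stand in for the caller's real objects.
// \code
//   GeneratedRTChecks Checks(SE, DT, LI, DL);
//   Checks.Create(L, LAI, PSE.getUnionPredicate()); // built up-front, unlinked
//   // ... estimate the cost of the checks; if vectorizing, wire them back in:
//   Checks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
//   Checks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
//   // If neither emit call is reached, ~GeneratedRTChecks() erases the blocks.
// \endcode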
2058
2059// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2060// vectorization. The loop needs to be annotated with #pragma omp simd
2061 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2062// vector length information is not provided, vectorization is not considered
2063// explicit. Interleave hints are not allowed either. These limitations will be
2064// relaxed in the future.
2065 // Please note that we are currently forced to abuse the pragma 'clang
2066// vectorize' semantics. This pragma provides *auto-vectorization hints*
2067// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2068// provides *explicit vectorization hints* (LV can bypass legal checks and
2069// assume that vectorization is legal). However, both hints are implemented
2070// using the same metadata (llvm.loop.vectorize, processed by
2071// LoopVectorizeHints). This will be fixed in the future when the native IR
2072// representation for pragma 'omp simd' is introduced.
2073static bool isExplicitVecOuterLoop(Loop *OuterLp,
2074 OptimizationRemarkEmitter *ORE) {
2075 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2076 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2077
2078 // Only outer loops with an explicit vectorization hint are supported.
2079 // Unannotated outer loops are ignored.
2080 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2081 return false;
2082
2083 Function *Fn = OuterLp->getHeader()->getParent();
2084 if (!Hints.allowVectorization(Fn, OuterLp,
2085 true /*VectorizeOnlyWhenForced*/)) {
2086 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2087 return false;
2088 }
2089
2090 if (Hints.getInterleave() > 1) {
2091 // TODO: Interleave support is future work.
2092 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2093 "outer loops.\n");
2094 Hints.emitRemarkWithHints();
2095 return false;
2096 }
2097
2098 return true;
2099}
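// For illustration (hypothetical source, not code from this file): an outer
// loop annotated as described above; either pragma spelling would make the
// `i` loop a candidate for this check (names and bounds are placeholders).
// \code
//   #pragma omp simd simdlen(4)
//   // or: #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// \endcode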
2100
2101static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2102 OptimizationRemarkEmitter *ORE,
2103 SmallVectorImpl<Loop *> &V) {
2104 // Collect inner loops and outer loops without irreducible control flow. For
2105 // now, only collect outer loops that have explicit vectorization hints. If we
2106 // are stress testing the VPlan H-CFG construction, we collect the outermost
2107 // loop of every loop nest.
2108 if (L.isInnermost() || VPlanBuildStressTest ||
2109 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2110 LoopBlocksRPO RPOT(&L);
2111 RPOT.perform(LI);
2112 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2113 V.push_back(&L);
2114 // TODO: Collect inner loops inside marked outer loops in case
2115 // vectorization fails for the outer loop. Do not invoke
2116 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2117 // already known to be reducible. We can use an inherited attribute for
2118 // that.
2119 return;
2120 }
2121 }
2122 for (Loop *InnerL : L)
2123 collectSupportedLoops(*InnerL, LI, ORE, V);
2124}
2125
2126namespace {
2127
2128/// The LoopVectorize Pass.
2129struct LoopVectorize : public FunctionPass {
2130 /// Pass identification, replacement for typeid
2131 static char ID;
2132
2133 LoopVectorizePass Impl;
2134
2135 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2136 bool VectorizeOnlyWhenForced = false)
2137 : FunctionPass(ID),
2138 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2139 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2140 }
2141
2142 bool runOnFunction(Function &F) override {
2143 if (skipFunction(F))
2144 return false;
2145
2146 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2147 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2148 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2149 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2150 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2151 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2152 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2153 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2154 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2155 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2156 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2157 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2158 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2159
2160 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2161 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2162
2163 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2164 GetLAA, *ORE, PSI).MadeAnyChange;
2165 }
2166
2167 void getAnalysisUsage(AnalysisUsage &AU) const override {
2168 AU.addRequired<AssumptionCacheTracker>();
2169 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2170 AU.addRequired<DominatorTreeWrapperPass>();
2171 AU.addRequired<LoopInfoWrapperPass>();
2172 AU.addRequired<ScalarEvolutionWrapperPass>();
2173 AU.addRequired<TargetTransformInfoWrapperPass>();
2174 AU.addRequired<AAResultsWrapperPass>();
2175 AU.addRequired<LoopAccessLegacyAnalysis>();
2176 AU.addRequired<DemandedBitsWrapperPass>();
2177 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2178 AU.addRequired<InjectTLIMappingsLegacy>();
2179
2180 // We currently do not preserve loopinfo/dominator analyses with outer loop
2181 // vectorization. Until this is addressed, mark these analyses as preserved
2182 // only for non-VPlan-native path.
2183 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2184 if (!EnableVPlanNativePath) {
2185 AU.addPreserved<LoopInfoWrapperPass>();
2186 AU.addPreserved<DominatorTreeWrapperPass>();
2187 }
2188
2189 AU.addPreserved<BasicAAWrapperPass>();
2190 AU.addPreserved<GlobalsAAWrapperPass>();
2191 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2192 }
2193};
2194
2195} // end anonymous namespace
2196
2197//===----------------------------------------------------------------------===//
2198// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2199// LoopVectorizationCostModel and LoopVectorizationPlanner.
2200//===----------------------------------------------------------------------===//
2201
2202Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2203 // We need to place the broadcast of invariant variables outside the loop,
2204 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2205 // inside the vector loop body.
2206 Instruction *Instr = dyn_cast<Instruction>(V);
2207 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2208 (!Instr ||
2209 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2210 // Place the code for broadcasting invariant variables in the new preheader.
2211 IRBuilder<>::InsertPointGuard Guard(Builder);
2212 if (SafeToHoist)
2213 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2214
2215 // Broadcast the scalar into all locations in the vector.
2216 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2217
2218 return Shuf;
2219}
2220
2221void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2222 const InductionDescriptor &II, Value *Step, Value *Start,
2223 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2224 VPTransformState &State) {
2225 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2226 "Expected either an induction phi-node or a truncate of it!");
2227
2228 // Construct the initial value of the vector IV in the vector loop preheader
2229 auto CurrIP = Builder.saveIP();
2230 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2231 if (isa<TruncInst>(EntryVal)) {
2232 assert(Start->getType()->isIntegerTy() &&
2233 "Truncation requires an integer type");
2234 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2235 Step = Builder.CreateTrunc(Step, TruncType);
2236 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2237 }
2238 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2239 Value *SteppedStart =
2240 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2241
2242 // We create vector phi nodes for both integer and floating-point induction
2243 // variables. Here, we determine the kind of arithmetic we will perform.
2244 Instruction::BinaryOps AddOp;
2245 Instruction::BinaryOps MulOp;
2246 if (Step->getType()->isIntegerTy()) {
2247 AddOp = Instruction::Add;
2248 MulOp = Instruction::Mul;
2249 } else {
2250 AddOp = II.getInductionOpcode();
2251 MulOp = Instruction::FMul;
2252 }
2253
2254 // Multiply the vectorization factor by the step using integer or
2255 // floating-point arithmetic as appropriate.
2256 Type *StepType = Step->getType();
2257 if (Step->getType()->isFloatingPointTy())
2258 StepType = IntegerType::get(StepType->getContext(),
2259 StepType->getScalarSizeInBits());
2260 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2261 if (Step->getType()->isFloatingPointTy())
2262 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2263 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2264
2265 // Create a vector splat to use in the induction update.
2266 //
2267 // FIXME: If the step is non-constant, we create the vector splat with
2268 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2269 // handle a constant vector splat.
2270 Value *SplatVF = isa<Constant>(Mul)
2271 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2272 : Builder.CreateVectorSplat(VF, Mul);
2273 Builder.restoreIP(CurrIP);
2274
2275 // We may need to add the step a number of times, depending on the unroll
2276 // factor. The last of those goes into the PHI.
2277 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2278 &*LoopVectorBody->getFirstInsertionPt());
2279 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2280 Instruction *LastInduction = VecInd;
2281 for (unsigned Part = 0; Part < UF; ++Part) {
2282 State.set(Def, LastInduction, Part);
2283
2284 if (isa<TruncInst>(EntryVal))
2285 addMetadata(LastInduction, EntryVal);
2286 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2287 State, Part);
2288
2289 LastInduction = cast<Instruction>(
2290 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2291 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2292 }
2293
2294 // Move the last step to the end of the latch block. This ensures consistent
2295 // placement of all induction updates.
2296 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2297 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2298 auto *ICmp = cast<Instruction>(Br->getCondition());
2299 LastInduction->moveBefore(ICmp);
2300 LastInduction->setName("vec.ind.next");
2301
2302 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2303 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2304}
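
A hedged scalar model of the widened induction built above, assuming a fixed-width VF, an integer induction, and UF unroll parts (the FP path swaps in FAdd/FMul): part 0 holds Start stepped by Step per lane, and every later part is the previous one plus VF * Step, which is exactly what the step.add chain computes.

// Hedged model of the vector induction: per-part values for Start = 0,
// Step = 3, VF = 4, UF = 2 (all hypothetical). The real code keeps only the
// phi and one add per part; the closed form below is for illustration.
#include <array>
#include <cstdio>
#include <vector>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  const int Start = 0, Step = 3;
  std::vector<std::array<int, VF>> Parts(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Parts[Part][Lane] = Start + int(Part * VF + Lane) * Step;
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (int V : Parts[Part])
      std::printf("%d ", V); // Part 0: 0 3 6 9   Part 1: 12 15 18 21
    std::printf("\n");
  }
}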
2305
2306bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2307 return Cost->isScalarAfterVectorization(I, VF) ||
2308 Cost->isProfitableToScalarize(I, VF);
2309}
2310
2311bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2312 if (shouldScalarizeInstruction(IV))
2313 return true;
2314 auto isScalarInst = [&](User *U) -> bool {
2315 auto *I = cast<Instruction>(U);
2316 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2317 };
2318 return llvm::any_of(IV->users(), isScalarInst);
2319}
2320
2321void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2322 const InductionDescriptor &ID, const Instruction *EntryVal,
2323 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2324 unsigned Part, unsigned Lane) {
2325 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2326 "Expected either an induction phi-node or a truncate of it!");
2327
2328 // This induction variable is not the phi from the original loop but the
2329 // newly-created IV, based on the proof that the casted Phi is equal to the
2330 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2331 // reuses the same InductionDescriptor as the original IV, but we don't have
2332 // to do any recording in this case - that is done when the original IV is
2333 // processed.
2334 if (isa<TruncInst>(EntryVal))
2335 return;
2336
2337 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2338 if (Casts.empty())
2339 return;
2340 // Only the first Cast instruction in the Casts vector is of interest.
2341 // The rest of the Casts (if any) have no uses outside the
2342 // induction update chain itself.
2343 if (Lane < UINT_MAX)
2344 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2345 else
2346 State.set(CastDef, VectorLoopVal, Part);
2347}
2348
2349void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2350 TruncInst *Trunc, VPValue *Def,
2351 VPValue *CastDef,
2352 VPTransformState &State) {
2353 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2354 "Primary induction variable must have an integer type");
2355
2356 auto II = Legal->getInductionVars().find(IV);
2357 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2358
2359 auto ID = II->second;
2360 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2361
2362 // The value from the original loop to which we are mapping the new induction
2363 // variable.
2364 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2365
2366 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2367
2368 // Generate code for the induction step. Note that induction steps are
2369 // required to be loop-invariant
2370 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2371 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2372 "Induction step should be loop invariant");
2373 if (PSE.getSE()->isSCEVable(IV->getType())) {
2374 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2375 return Exp.expandCodeFor(Step, Step->getType(),
2376 LoopVectorPreHeader->getTerminator());
2377 }
2378 return cast<SCEVUnknown>(Step)->getValue();
2379 };
2380
2381 // The scalar value to broadcast. This is derived from the canonical
2382 // induction variable. If a truncation type is given, truncate the canonical
2383 // induction variable and step. Otherwise, derive these values from the
2384 // induction descriptor.
2385 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2386 Value *ScalarIV = Induction;
2387 if (IV != OldInduction) {
2388 ScalarIV = IV->getType()->isIntegerTy()
2389 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2390 : Builder.CreateCast(Instruction::SIToFP, Induction,
2391 IV->getType());
2392 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2393 ScalarIV->setName("offset.idx");
2394 }
2395 if (Trunc) {
2396 auto *TruncType = cast<IntegerType>(Trunc->getType());
2397 assert(Step->getType()->isIntegerTy() &&
2398 "Truncation requires an integer step");
2399 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2400 Step = Builder.CreateTrunc(Step, TruncType);
2401 }
2402 return ScalarIV;
2403 };
2404
2405 // Create the vector values from the scalar IV, in the absence of creating a
2406 // vector IV.
2407 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2408 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2409 for (unsigned Part = 0; Part < UF; ++Part) {
2410 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2411 Value *EntryPart =
2412 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2413 ID.getInductionOpcode());
2414 State.set(Def, EntryPart, Part);
2415 if (Trunc)
2416 addMetadata(EntryPart, Trunc);
2417 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2418 State, Part);
2419 }
2420 };
2421
2422 // Fast-math-flags propagate from the original induction instruction.
2423 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2424 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2425 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2426
2427 // Now do the actual transformations, and start with creating the step value.
2428 Value *Step = CreateStepValue(ID.getStep());
2429 if (VF.isZero() || VF.isScalar()) {
2430 Value *ScalarIV = CreateScalarIV(Step);
2431 CreateSplatIV(ScalarIV, Step);
2432 return;
2433 }
2434
2435 // Determine if we want a scalar version of the induction variable. This is
2436 // true if the induction variable itself is not widened, or if it has at
2437 // least one user in the loop that is not widened.
2438 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2439 if (!NeedsScalarIV) {
2440 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2441 State);
2442 return;
2443 }
2444
2445 // Try to create a new independent vector induction variable. If we can't
2446 // create the phi node, we will splat the scalar induction variable in each
2447 // loop iteration.
2448 if (!shouldScalarizeInstruction(EntryVal)) {
2449 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2450 State);
2451 Value *ScalarIV = CreateScalarIV(Step);
2452 // Create scalar steps that can be used by instructions we will later
2453 // scalarize. Note that the addition of the scalar steps will not increase
2454 // the number of instructions in the loop in the common case prior to
2455 // InstCombine. We will be trading one vector extract for each scalar step.
2456 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2457 return;
2458 }
2459
2460 // All IV users are scalar instructions, so only emit a scalar IV, not a
2461 // vectorised IV, except when we tail-fold, in which case the splat IV feeds
2462 // the predicate used by the masked loads/stores.
2463 Value *ScalarIV = CreateScalarIV(Step);
2464 if (!Cost->isScalarEpilogueAllowed())
2465 CreateSplatIV(ScalarIV, Step);
2466 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2467}
2468
2469Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2470 Instruction::BinaryOps BinOp) {
2471 // Create and check the types.
2472 auto *ValVTy = cast<VectorType>(Val->getType());
2473 ElementCount VLen = ValVTy->getElementCount();
2474
2475 Type *STy = Val->getType()->getScalarType();
2476 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2477 "Induction Step must be an integer or FP");
2478 assert(Step->getType() == STy && "Step has wrong type");
2479
2480 SmallVector<Constant *, 8> Indices;
2481
2482 // Create a vector of consecutive numbers from zero to VF.
2483 VectorType *InitVecValVTy = ValVTy;
2484 Type *InitVecValSTy = STy;
2485 if (STy->isFloatingPointTy()) {
2486 InitVecValSTy =
2487 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2488 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2489 }
2490 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2491
2492 // Add on StartIdx
2493 Value *StartIdxSplat = Builder.CreateVectorSplat(
2494 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2495 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2496
2497 if (STy->isIntegerTy()) {
2498 Step = Builder.CreateVectorSplat(VLen, Step);
2499 assert(Step->getType() == Val->getType() && "Invalid step vec");
2500 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2501 // which can be found from the original scalar operations.
2502 Step = Builder.CreateMul(InitVec, Step);
2503 return Builder.CreateAdd(Val, Step, "induction");
2504 }
2505
2506 // Floating point induction.
2507 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2508 "Binary Opcode should be specified for FP induction");
2509 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2510 Step = Builder.CreateVectorSplat(VLen, Step);
2511 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2512 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2513}
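
In other words, lane l of the returned vector is Val[l] + (StartIdx + l) * Step. A small scalar sketch of the integer path, assuming a fixed vector length of 4 (the FP path multiplies with FMul and applies the given BinOp instead):

// Scalar sketch of getStepVector's integer path; all values hypothetical.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VLen = 4;
  std::array<int, VLen> Val = {10, 10, 10, 10}; // e.g. a splat of the start
  const int StartIdx = 0, Step = 2;
  std::array<int, VLen> Induction{};
  for (unsigned L = 0; L < VLen; ++L)
    Induction[L] = Val[L] + (StartIdx + int(L)) * Step;
  for (int V : Induction)
    std::printf("%d ", V); // prints: 10 12 14 16
}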
2514
2515void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2516 Instruction *EntryVal,
2517 const InductionDescriptor &ID,
2518 VPValue *Def, VPValue *CastDef,
2519 VPTransformState &State) {
2520 // We shouldn't have to build scalar steps if we aren't vectorizing.
2521 assert(VF.isVector() && "VF should be greater than one");
2522 // Get the value type and ensure it and the step have the same integer type.
2523 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2524 assert(ScalarIVTy == Step->getType() &&
2525 "Val and Step should have the same type");
2526
2527 // We build scalar steps for both integer and floating-point induction
2528 // variables. Here, we determine the kind of arithmetic we will perform.
2529 Instruction::BinaryOps AddOp;
2530 Instruction::BinaryOps MulOp;
2531 if (ScalarIVTy->isIntegerTy()) {
2532 AddOp = Instruction::Add;
2533 MulOp = Instruction::Mul;
2534 } else {
2535 AddOp = ID.getInductionOpcode();
2536 MulOp = Instruction::FMul;
2537 }
2538
2539 // Determine the number of scalars we need to generate for each unroll
2540 // iteration. If EntryVal is uniform, we only need to generate the first
2541 // lane. Otherwise, we generate all VF values.
2542 bool IsUniform =
2543 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2544 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2545 // Compute the scalar steps and save the results in State.
2546 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2547 ScalarIVTy->getScalarSizeInBits());
2548 Type *VecIVTy = nullptr;
2549 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2550 if (!IsUniform && VF.isScalable()) {
2551 VecIVTy = VectorType::get(ScalarIVTy, VF);
2552 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2553 SplatStep = Builder.CreateVectorSplat(VF, Step);
2554 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2555 }
2556
2557 for (unsigned Part = 0; Part < UF; ++Part) {
2558 Value *StartIdx0 =
2559 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2560
2561 if (!IsUniform && VF.isScalable()) {
2562 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2563 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2564 if (ScalarIVTy->isFloatingPointTy())
2565 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2566 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2567 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2568 State.set(Def, Add, Part);
2569 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2570 Part);
2571 // It's useful to record the lane values too for the known minimum number
2572 // of elements so we do those below. This improves the code quality when
2573 // trying to extract the first element, for example.
2574 }
2575
2576 if (ScalarIVTy->isFloatingPointTy())
2577 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2578
2579 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2580 Value *StartIdx = Builder.CreateBinOp(
2581 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2582 // The step returned by `createStepForVF` is a runtime-evaluated value
2583 // when VF is scalable. Otherwise, it should be folded into a Constant.
2584 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2585 "Expected StartIdx to be folded to a constant when VF is not "
2586 "scalable");
2587 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2588 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2589 State.set(Def, Add, VPIteration(Part, Lane));
2590 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2591 Part, Lane);
2592 }
2593 }
2594}
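
A rough scalar model of the per-lane values recorded above, assuming a fixed-width VF: the scalar step for (Part, Lane) is ScalarIV + (Part * VF + Lane) * Step, and only lane 0 of each part is materialized when the value is uniform after vectorization.

// Scalar sketch of buildScalarSteps; ScalarIV, Step, VF, UF are hypothetical.
#include <cstdio>

int main() {
  const int ScalarIV = 100, Step = 5;
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("Part %u Lane %u -> %d\n", Part, Lane,
                  ScalarIV + int(Part * VF + Lane) * Step);
}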
2595
2596void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2597 const VPIteration &Instance,
2598 VPTransformState &State) {
2599 Value *ScalarInst = State.get(Def, Instance);
2600 Value *VectorValue = State.get(Def, Instance.Part);
2601 VectorValue = Builder.CreateInsertElement(
2602 VectorValue, ScalarInst,
2603 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2604 State.set(Def, VectorValue, Instance.Part);
2605}
2606
2607Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2608 assert(Vec->getType()->isVectorTy() && "Invalid type");
2609 return Builder.CreateVectorReverse(Vec, "reverse");
2610}
2611
2612// Return whether we allow using masked interleave-groups (for dealing with
2613// strided loads/stores that reside in predicated blocks, or for dealing
2614// with gaps).
2615static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2616 // If an override option has been passed in for interleaved accesses, use it.
2617 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2618 return EnableMaskedInterleavedMemAccesses;
2619
2620 return TTI.enableMaskedInterleavedAccessVectorization();
2621}
2622
2623// Try to vectorize the interleave group that \p Instr belongs to.
2624//
2625// E.g. Translate following interleaved load group (factor = 3):
2626// for (i = 0; i < N; i+=3) {
2627// R = Pic[i]; // Member of index 0
2628// G = Pic[i+1]; // Member of index 1
2629// B = Pic[i+2]; // Member of index 2
2630// ... // do something to R, G, B
2631// }
2632// To:
2633// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2634// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2635// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2636// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2637//
2638// Or translate following interleaved store group (factor = 3):
2639// for (i = 0; i < N; i+=3) {
2640// ... do something to R, G, B
2641// Pic[i] = R; // Member of index 0
2642// Pic[i+1] = G; // Member of index 1
2643// Pic[i+2] = B; // Member of index 2
2644// }
2645// To:
2646// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2647// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2648// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2649// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2650// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2651void InnerLoopVectorizer::vectorizeInterleaveGroup(
2652 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2653 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2654 VPValue *BlockInMask) {
2655 Instruction *Instr = Group->getInsertPos();
2656 const DataLayout &DL = Instr->getModule()->getDataLayout();
2657
2658 // Prepare for the vector type of the interleaved load/store.
2659 Type *ScalarTy = getMemInstValueType(Instr);
2660 unsigned InterleaveFactor = Group->getFactor();
2661 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2662 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2663
2664 // Prepare for the new pointers.
2665 SmallVector<Value *, 2> AddrParts;
2666 unsigned Index = Group->getIndex(Instr);
2667
2668 // TODO: extend the masked interleaved-group support to reversed access.
2669 assert((!BlockInMask || !Group->isReverse()) &&
2670 "Reversed masked interleave-group not supported.");
2671
2672 // If the group is reverse, adjust the index to refer to the last vector lane
2673 // instead of the first. We adjust the index from the first vector lane,
2674 // rather than directly getting the pointer for lane VF - 1, because the
2675 // pointer operand of the interleaved access is supposed to be uniform. For
2676 // uniform instructions, we're only required to generate a value for the
2677 // first vector lane in each unroll iteration.
2678 assert(!VF.isScalable() &&
2679 "scalable vector reverse operation is not implemented");
2680 if (Group->isReverse())
2681 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2682
2683 for (unsigned Part = 0; Part < UF; Part++) {
2684 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2685 setDebugLocFromInst(Builder, AddrPart);
2686
2687 // Note that the current instruction could be a member of any index. We need
2688 // to adjust the address to the member of index 0.
2689 //
2690 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2691 // b = A[i]; // Member of index 0
2692 // The current pointer points to A[i+1]; adjust it to A[i].
2693 //
2694 // E.g. A[i+1] = a; // Member of index 1
2695 // A[i] = b; // Member of index 0
2696 // A[i+2] = c; // Member of index 2 (Current instruction)
2697 // The current pointer points to A[i+2]; adjust it to A[i].
2698
2699 bool InBounds = false;
2700 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2701 InBounds = gep->isInBounds();
2702 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2703 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2704
2705 // Cast to the vector pointer type.
2706 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2707 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2708 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2709 }
2710
2711 setDebugLocFromInst(Builder, Instr);
2712 Value *PoisonVec = PoisonValue::get(VecTy);
2713
2714 Value *MaskForGaps = nullptr;
2715 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2716 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2717 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2718 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2719 }
2720
2721 // Vectorize the interleaved load group.
2722 if (isa<LoadInst>(Instr)) {
2723 // For each unroll part, create a wide load for the group.
2724 SmallVector<Value *, 2> NewLoads;
2725 for (unsigned Part = 0; Part < UF; Part++) {
2726 Instruction *NewLoad;
2727 if (BlockInMask || MaskForGaps) {
2728 assert(useMaskedInterleavedAccesses(*TTI) &&
2729 "masked interleaved groups are not allowed.");
2730 Value *GroupMask = MaskForGaps;
2731 if (BlockInMask) {
2732 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2733 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2734 Value *ShuffledMask = Builder.CreateShuffleVector(
2735 BlockInMaskPart,
2736 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2737 "interleaved.mask");
2738 GroupMask = MaskForGaps
2739 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2740 MaskForGaps)
2741 : ShuffledMask;
2742 }
2743 NewLoad =
2744 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2745 GroupMask, PoisonVec, "wide.masked.vec");
2746 }
2747 else
2748 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2749 Group->getAlign(), "wide.vec");
2750 Group->addMetadata(NewLoad);
2751 NewLoads.push_back(NewLoad);
2752 }
2753
2754 // For each member in the group, shuffle out the appropriate data from the
2755 // wide loads.
2756 unsigned J = 0;
2757 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2758 Instruction *Member = Group->getMember(I);
2759
2760 // Skip the gaps in the group.
2761 if (!Member)
2762 continue;
2763
2764 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2765 auto StrideMask =
2766 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2767 for (unsigned Part = 0; Part < UF; Part++) {
2768 Value *StridedVec = Builder.CreateShuffleVector(
2769 NewLoads[Part], StrideMask, "strided.vec");
2770
2771 // If this member has a different type, cast the result to that type.
2772 if (Member->getType() != ScalarTy) {
2773 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2774 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2775 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2776 }
2777
2778 if (Group->isReverse())
2779 StridedVec = reverseVector(StridedVec);
2780
2781 State.set(VPDefs[J], StridedVec, Part);
2782 }
2783 ++J;
2784 }
2785 return;
2786 }
2787
2788 // The subvector type for the current instruction.
2789 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2790 auto *SubVT = VectorType::get(ScalarTy, VF);
2791
2792 // Vectorize the interleaved store group.
2793 for (unsigned Part = 0; Part < UF; Part++) {
2794 // Collect the stored vector from each member.
2795 SmallVector<Value *, 4> StoredVecs;
2796 for (unsigned i = 0; i < InterleaveFactor; i++) {
2797 // An interleaved store group doesn't allow a gap, so each index has a member.
2798 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2799
2800 Value *StoredVec = State.get(StoredValues[i], Part);
2801
2802 if (Group->isReverse())
2803 StoredVec = reverseVector(StoredVec);
2804
2805 // If this member has a different type, cast it to a unified type.
2806
2807 if (StoredVec->getType() != SubVT)
2808 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2809
2810 StoredVecs.push_back(StoredVec);
2811 }
2812
2813 // Concatenate all vectors into a wide vector.
2814 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2815
2816 // Interleave the elements in the wide vector.
2817 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2818 Value *IVec = Builder.CreateShuffleVector(
2819 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2820 "interleaved.vec");
2821
2822 Instruction *NewStoreInstr;
2823 if (BlockInMask) {
2824 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2825 Value *ShuffledMask = Builder.CreateShuffleVector(
2826 BlockInMaskPart,
2827 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2828 "interleaved.mask");
2829 NewStoreInstr = Builder.CreateMaskedStore(
2830 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2831 }
2832 else
2833 NewStoreInstr =
2834 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2835
2836 Group->addMetadata(NewStoreInstr);
2837 }
2838}
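
The load half of this transformation boils down to one wide load per unroll part followed by a strided shuffle per member. An illustrative scalar model of that strided extraction, assuming VF = 4 and an interleave factor of 3 as in the R,G,B example above (masks, gaps, and reversed groups are not modelled):

// Illustrative de-interleave of one wide load; indices match the stride mask.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, Factor = 3;
  std::array<int, VF * Factor> WideVec{};
  for (unsigned i = 0; i < WideVec.size(); ++i)
    WideVec[i] = int(i); // stands in for four consecutive R,G,B tuples
  for (unsigned Member = 0; Member < Factor; ++Member) {
    std::printf("member %u:", Member);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %d", WideVec[Member + Lane * Factor]); // strided shuffle
    std::printf("\n"); // member 0: 0 3 6 9, member 1: 1 4 7 10, member 2: 2 5 8 11
  }
}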
2839
2840void InnerLoopVectorizer::vectorizeMemoryInstruction(
2841 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2842 VPValue *StoredValue, VPValue *BlockInMask) {
2843 // Attempt to issue a wide load.
2844 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2845 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2846
2847 assert((LI || SI) && "Invalid Load/Store instruction");
2848 assert((!SI || StoredValue) && "No stored value provided for widened store");
2849 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2850
2851 LoopVectorizationCostModel::InstWidening Decision =
2852 Cost->getWideningDecision(Instr, VF);
2853 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2854 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2855 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2856 "CM decision is not to widen the memory instruction");
2857
2858 Type *ScalarDataTy = getMemInstValueType(Instr);
2859
2860 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2861 const Align Alignment = getLoadStoreAlignment(Instr);
2862
2863 // Determine if the pointer operand of the access is either consecutive or
2864 // reverse consecutive.
2865 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2866 bool ConsecutiveStride =
2867 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2868 bool CreateGatherScatter =
2869 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2870
2871 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2872 // gather/scatter. Otherwise Decision should have been to Scalarize.
2873 assert((ConsecutiveStride || CreateGatherScatter) &&
2874 "The instruction should be scalarized");
2875 (void)ConsecutiveStride;
2876
2877 VectorParts BlockInMaskParts(UF);
2878 bool isMaskRequired = BlockInMask;
2879 if (isMaskRequired)
2880 for (unsigned Part = 0; Part < UF; ++Part)
2881 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2882
2883 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2884 // Calculate the pointer for the specific unroll-part.
2885 GetElementPtrInst *PartPtr = nullptr;
2886
2887 bool InBounds = false;
2888 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2889 InBounds = gep->isInBounds();
2890 if (Reverse) {
2891 // If the address is consecutive but reversed, then the
2892 // wide store needs to start at the last vector element.
2893 // RunTimeVF = VScale * VF.getKnownMinValue()
2894 // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
2895 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2896 // NumElt = -Part * RunTimeVF
2897 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2898 // LastLane = 1 - RunTimeVF
2899 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2900 PartPtr =
2901 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2902 PartPtr->setIsInBounds(InBounds);
2903 PartPtr = cast<GetElementPtrInst>(
2904 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2905 PartPtr->setIsInBounds(InBounds);
2906 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2907 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2908 } else {
2909 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2910 PartPtr = cast<GetElementPtrInst>(
2911 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2912 PartPtr->setIsInBounds(InBounds);
2913 }
2914
2915 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2916 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2917 };
2918
2919 // Handle Stores:
2920 if (SI) {
2921 setDebugLocFromInst(Builder, SI);
2922
2923 for (unsigned Part = 0; Part < UF; ++Part) {
2924 Instruction *NewSI = nullptr;
2925 Value *StoredVal = State.get(StoredValue, Part);
2926 if (CreateGatherScatter) {
2927 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2928 Value *VectorGep = State.get(Addr, Part);
2929 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2930 MaskPart);
2931 } else {
2932 if (Reverse) {
2933 // If we store to reverse consecutive memory locations, then we need
2934 // to reverse the order of elements in the stored value.
2935 StoredVal = reverseVector(StoredVal);
2936 // We don't want to update the value in the map as it might be used in
2937 // another expression. So don't call resetVectorValue(StoredVal).
2938 }
2939 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2940 if (isMaskRequired)
2941 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2942 BlockInMaskParts[Part]);
2943 else
2944 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2945 }
2946 addMetadata(NewSI, SI);
2947 }
2948 return;
2949 }
2950
2951 // Handle loads.
2952 assert(LI && "Must have a load instruction");
2953 setDebugLocFromInst(Builder, LI);
2954 for (unsigned Part = 0; Part < UF; ++Part) {
2955 Value *NewLI;
2956 if (CreateGatherScatter) {
2957 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2958 Value *VectorGep = State.get(Addr, Part);
2959 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2960 nullptr, "wide.masked.gather");
2961 addMetadata(NewLI, LI);
2962 } else {
2963 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2964 if (isMaskRequired)
2965 NewLI = Builder.CreateMaskedLoad(
2966 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2967 "wide.masked.load");
2968 else
2969 NewLI =
2970 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2971
2972 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2973 addMetadata(NewLI, LI);
2974 if (Reverse)
2975 NewLI = reverseVector(NewLI);
2976 }
2977
2978 State.set(Def, NewLI, Part);
2979 }
2980}
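
For the reverse-consecutive case handled by CreateVecPtr above, the wide access for unroll part P starts at Ptr + (-P * RunTimeVF) + (1 - RunTimeVF), so that it covers the VF elements ending at the current scalar index, and the loaded or stored vector is reversed lane-wise afterwards. A small numeric sketch, assuming a fixed-width VF of 4 (VScale == 1) and UF == 2:

// Numeric sketch of the reverse-consecutive offsets; VF and UF are hypothetical.
#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int Offset = -Part * VF + (1 - VF); // NumElt + LastLane from CreateVecPtr
    std::printf("part %d covers elements [%d, %d] relative to Ptr\n",
                Part, Offset, Offset + VF - 1); // part 0: [-3, 0], part 1: [-7, -4]
  }
}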
2981
2982void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2983 VPUser &User,
2984 const VPIteration &Instance,
2985 bool IfPredicateInstr,
2986 VPTransformState &State) {
2987 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2988
2989 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2990 // the first lane and part.
2991 if (isa<NoAliasScopeDeclInst>(Instr))
2992 if (!Instance.isFirstIteration())
2993 return;
2994
2995 setDebugLocFromInst(Builder, Instr);
2996
2997 // Does this instruction return a value?
2998 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2999
3000 Instruction *Cloned = Instr->clone();
3001 if (!IsVoidRetTy)
3002 Cloned->setName(Instr->getName() + ".cloned");
3003
3004 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3005 Builder.GetInsertPoint());
3006 // Replace the operands of the cloned instructions with their scalar
3007 // equivalents in the new loop.
3008 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3009 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3010 auto InputInstance = Instance;
3011 if (!Operand || !OrigLoop->contains(Operand) ||
3012 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3013 InputInstance.Lane = VPLane::getFirstLane();
3014 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3015 Cloned->setOperand(op, NewOp);
3016 }
3017 addNewMetadata(Cloned, Instr);
3018
3019 // Place the cloned scalar in the new loop.
3020 Builder.Insert(Cloned);
3021
3022 State.set(Def, Cloned, Instance);
3023
3024 // If we just cloned a new assumption, add it to the assumption cache.
3025 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3026 AC->registerAssumption(II);
3027
3028 // End if-block.
3029 if (IfPredicateInstr)
3030 PredicatedInstructions.push_back(Cloned);
3031}
3032
3033PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3034 Value *End, Value *Step,
3035 Instruction *DL) {
3036 BasicBlock *Header = L->getHeader();
3037 BasicBlock *Latch = L->getLoopLatch();
3038 // As we're just creating this loop, it's possible no latch exists
3039 // yet. If so, use the header as this will be a single block loop.
3040 if (!Latch)
3041 Latch = Header;
3042
3043 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3044 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3045 setDebugLocFromInst(Builder, OldInst);
3046 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3047
3048 Builder.SetInsertPoint(Latch->getTerminator());
3049 setDebugLocFromInst(Builder, OldInst);
3050
3051 // Create i+1 and fill the PHINode.
3052 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3053 Induction->addIncoming(Start, L->getLoopPreheader());
3054 Induction->addIncoming(Next, Latch);
3055 // Create the compare.
3056 Value *ICmp = Builder.CreateICmpEQ(Next, End);
3057 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3058
3059 // Now we have two terminators. Remove the old one from the block.
3060 Latch->getTerminator()->eraseFromParent();
3061
3062 return Induction;
3063}
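
The skeleton built here is a simple counting loop: a phi starting at Start, incremented by Step each iteration, exiting when the incremented value equals End. A rough C++ model of the control flow, assuming Start, Step, and End are plain integers with End reachable from Start in whole steps:

// Rough model of the generated loop skeleton; Start, Step, End are hypothetical.
#include <cstdio>

int main() {
  const int Start = 0, Step = 4, End = 16;
  int Index = Start; // index = phi [Start, preheader], [index.next, latch]
  do {
    std::printf("vector iteration starting at index %d\n", Index);
    Index += Step;        // index.next = index + Step
  } while (Index != End); // br (index.next == End), exit, header
}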
3064
3065Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3066 if (TripCount)
3067 return TripCount;
3068
3069 assert(L && "Create Trip Count for null loop.");
3070 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3071 // Find the loop boundaries.
3072 ScalarEvolution *SE = PSE.getSE();
3073 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3074 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3075 "Invalid loop count");
3076
3077 Type *IdxTy = Legal->getWidestInductionType();
3078 assert(IdxTy && "No type for induction");
3079
3080 // The exit count might have the type of i64 while the phi is i32. This can
3081 // happen if we have an induction variable that is sign-extended before the
3082 // compare. The only way we get a backedge-taken count is if the induction
3083 // variable was signed and as such will not overflow. In such a case,
3084 // truncation is legal.
3085 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3086 IdxTy->getPrimitiveSizeInBits())
3087 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3088 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3089
3090 // Get the total trip count from the count by adding 1.
3091 const SCEV *ExitCount = SE->getAddExpr(
3092 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3093
3094 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3095
3096 // Expand the trip count and place the new instructions in the preheader.
3097 // Notice that the pre-header does not change, only the loop body.
3098 SCEVExpander Exp(*SE, DL, "induction");
3099
3100 // Count holds the overall loop count (N).
3101 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3102 L->getLoopPreheader()->getTerminator());
3103
3104 if (TripCount->getType()->isPointerTy())
3105 TripCount =
3106 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3107 L->getLoopPreheader()->getTerminator());
3108
3109 return TripCount;
3110}
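
A minimal sketch of the trip-count derivation above using plain integers; the helper name tripCountFromBTC and the example values are illustrative assumptions, not part of LoopVectorize.cpp.

#include <cstdint>
// The trip count is the backedge-taken count plus one, evaluated in the widest
// induction type (modeled here as uint64_t).
static uint64_t tripCountFromBTC(uint64_t BackedgeTakenCount) {
  // e.g. for `for (i = 0; i < 100; ++i)` the BTC is 99 and the trip count is 100.
  return BackedgeTakenCount + 1;
}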
3111
3112Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3113 if (VectorTripCount)
3114 return VectorTripCount;
3115
3116 Value *TC = getOrCreateTripCount(L);
3117 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3118
3119 Type *Ty = TC->getType();
3120 // This is where we can make the step a runtime constant.
3121 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3122
3123 // If the tail is to be folded by masking, round the number of iterations N
3124 // up to a multiple of Step instead of rounding down. This is done by first
3125 // adding Step-1 and then rounding down. Note that it's ok if this addition
3126 // overflows: the vector induction variable will eventually wrap to zero given
3127 // that it starts at zero and its Step is a power of two; the loop will then
3128 // exit, with the last early-exit vector comparison also producing all-true.
3129 if (Cost->foldTailByMasking()) {
3130 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3131 "VF*UF must be a power of 2 when folding tail by masking");
3132 assert(!VF.isScalable() &&
3133 "Tail folding not yet supported for scalable vectors");
3134 TC = Builder.CreateAdd(
3135 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3136 }
3137
3138 // Now we need to generate the expression for the part of the loop that the
3139 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3140 // iterations are not required for correctness, or N - Step, otherwise. Step
3141 // is equal to the vectorization factor (number of SIMD elements) times the
3142 // unroll factor (number of SIMD instructions).
3143 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3144
3145 // There are two cases where we need to ensure (at least) the last iteration
3146 // runs in the scalar remainder loop. Thus, if the step evenly divides
3147 // the trip count, we set the remainder to be equal to the step. If the step
3148 // does not evenly divide the trip count, no adjustment is necessary since
3149 // there will already be scalar iterations. Note that the minimum iterations
3150 // check ensures that N >= Step. The cases are:
3151 // 1) If there is a non-reversed interleaved group that may speculatively
3152 // access memory out-of-bounds.
3153 // 2) If any instruction may follow a conditionally taken exit. That is, if
3154 // the loop contains multiple exiting blocks, or a single exiting block
3155 // which is not the latch.
3156 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3157 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3158 R = Builder.CreateSelect(IsZero, Step, R);
3159 }
3160
3161 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3162
3163 return VectorTripCount;
3164}
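
A standalone sketch of the vector trip-count arithmetic above with scalar integers; the helper name vectorTripCount and its parameters are editorial assumptions, not LLVM API.

#include <cstdint>
// Optionally round N up to a multiple of Step (tail folding), then subtract the
// remainder; when the division is exact and a scalar epilogue is required,
// force a full final Step into the remainder loop.
static uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTail,
                                bool RequiresScalarEpilogue) {
  if (FoldTail)
    TC += Step - 1;                  // "n.rnd.up"
  uint64_t R = TC % Step;            // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                        // keep at least one scalar iteration
  return TC - R;                     // "n.vec"
}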
3165
3166Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3167 const DataLayout &DL) {
3168 // Verify that V is a vector type with same number of elements as DstVTy.
3169 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3170 unsigned VF = DstFVTy->getNumElements();
3171 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3172 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3173 Type *SrcElemTy = SrcVecTy->getElementType();
3174 Type *DstElemTy = DstFVTy->getElementType();
3175 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3176 "Vector elements must have same size");
3177
3178 // Do a direct cast if element types are castable.
3179 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3180 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3181 }
3182 // V cannot be directly cast to the desired vector type.
3183 // May happen when V is a floating point vector but DstVTy is a vector of
3184 // pointers or vice-versa. Handle this using a two-step bitcast using an
3185 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3186 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3187 "Only one type should be a pointer type");
3188 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3189 "Only one type should be a floating point type");
3190 Type *IntTy =
3191 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3192 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3193 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3194 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3195}
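
A scalar analogue of the two-step Ptr <-> Int <-> Float cast described above, written as standalone C++; the helper name doubleBitsToPointer is illustrative, and it assumes a target where pointers and double are both 64 bits wide.

#include <cstdint>
#include <cstring>
// A double cannot be reinterpreted as a pointer directly, so go through a
// same-width integer, mirroring the intermediate integer vector used above.
static void *doubleBitsToPointer(double D) {
  static_assert(sizeof(double) == sizeof(void *), "same-size reinterpretation");
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));      // Float -> Int (bitcast-like)
  void *P;
  std::memcpy(&P, &Bits, sizeof(P));         // Int -> Ptr (inttoptr-like)
  return P;
}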
3196
3197void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3198 BasicBlock *Bypass) {
3199 Value *Count = getOrCreateTripCount(L);
3200 // Reuse existing vector loop preheader for TC checks.
3201 // Note that new preheader block is generated for vector loop.
3202 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3203 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3204
3205 // Generate code to check if the loop's trip count is less than VF * UF, or
3206 // equal to it in case a scalar epilogue is required; this implies that the
3207 // vector trip count is zero. This check also covers the case where adding one
3208 // to the backedge-taken count overflowed leading to an incorrect trip count
3209 // of zero. In this case we will also jump to the scalar loop.
3210 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3211 : ICmpInst::ICMP_ULT;
3212
3213 // If tail is to be folded, vector loop takes care of all iterations.
3214 Value *CheckMinIters = Builder.getFalse();
3215 if (!Cost->foldTailByMasking()) {
3216 Value *Step =
3217 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3218 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3219 }
3220 // Create new preheader for vector loop.
3221 LoopVectorPreHeader =
3222 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3223 "vector.ph");
3224
3225 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3226 DT->getNode(Bypass)->getIDom()) &&
3227 "TC check is expected to dominate Bypass");
3228
3229 // Update dominator for Bypass & LoopExit.
3230 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3231 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3232
3233 ReplaceInstWithInst(
3234 TCCheckBlock->getTerminator(),
3235 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3236 LoopBypassBlocks.push_back(TCCheckBlock);
3237}
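
A small sketch of the minimum-iteration decision above using plain integers; the helper name bypassVectorLoop and its parameters are editorial, not LLVM API.

#include <cstdint>
// Branch to the scalar loop when fewer than Step = VF * UF iterations remain
// (or no more than Step, if a scalar epilogue is required); with tail folding
// the vector loop handles all iterations.
static bool bypassVectorLoop(uint64_t TripCount, uint64_t Step,
                             bool RequiresScalarEpilogue, bool FoldTail) {
  if (FoldTail)
    return false;                                    // CheckMinIters is `false`
  return RequiresScalarEpilogue ? TripCount <= Step  // ICMP_ULE
                                : TripCount < Step;  // ICMP_ULT
}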
3238
3239BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3240
3241 BasicBlock *const SCEVCheckBlock =
3242 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3243 if (!SCEVCheckBlock)
3244 return nullptr;
3245
3246 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3247 (OptForSizeBasedOnProfile &&
3248 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3249 "Cannot SCEV check stride or overflow when optimizing for size");
3250
3251
3252 // Update dominator only if this is first RT check.
3253 if (LoopBypassBlocks.empty()) {
3254 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3255 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3256 }
3257
3258 LoopBypassBlocks.push_back(SCEVCheckBlock);
3259 AddedSafetyChecks = true;
3260 return SCEVCheckBlock;
3261}
3262
3263BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3264 BasicBlock *Bypass) {
3265 // VPlan-native path does not do any analysis for runtime checks currently.
3266 if (EnableVPlanNativePath)
3267 return nullptr;
3268
3269 BasicBlock *const MemCheckBlock =
3270 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3271
3272 // Check if we generated code that checks at runtime whether arrays overlap.
3273 // We put the checks into a separate block to make the more common case of few
3274 // elements faster.
3275 if (!MemCheckBlock)
3276 return nullptr;
3277
3278 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3279 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3280 "Cannot emit memory checks when optimizing for size, unless forced "
3281 "to vectorize.");
3282 ORE->emit([&]() {
3283 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3284 L->getStartLoc(), L->getHeader())
3285 << "Code-size may be reduced by not forcing "
3286 "vectorization, or by source-code modifications "
3287 "eliminating the need for runtime checks "
3288 "(e.g., adding 'restrict').";
3289 });
3290 }
3291
3292 LoopBypassBlocks.push_back(MemCheckBlock);
3293
3294 AddedSafetyChecks = true;
3295
3296 // We currently don't use LoopVersioning for the actual loop cloning but we
3297 // still use it to add the noalias metadata.
3298 LVer = std::make_unique<LoopVersioning>(
3299 *Legal->getLAI(),
3300 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3301 DT, PSE.getSE());
3302 LVer->prepareNoAliasMetadata();
3303 return MemCheckBlock;
3304}
3305
3306Value *InnerLoopVectorizer::emitTransformedIndex(
3307 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3308 const InductionDescriptor &ID) const {
3309
3310 SCEVExpander Exp(*SE, DL, "induction");
3311 auto Step = ID.getStep();
3312 auto StartValue = ID.getStartValue();
3313 assert(Index->getType() == Step->getType() &&
3314 "Index type does not match StepValue type");
3315
3316 // Note: the IR at this point is broken. We cannot use SE to create any new
3317 // SCEV and then expand it, hoping that SCEV's simplification will give us
3318 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3319 // lead to various SCEV crashes. So all we can do is to use builder and rely
3320 // on InstCombine for future simplifications. Here we handle some trivial
3321 // cases only.
3322 auto CreateAdd = [&B](Value *X, Value *Y) {
3323 assert(X->getType() == Y->getType() && "Types don't match!");
3324 if (auto *CX = dyn_cast<ConstantInt>(X))
3325 if (CX->isZero())
3326 return Y;
3327 if (auto *CY = dyn_cast<ConstantInt>(Y))
3328 if (CY->isZero())
3329 return X;
3330 return B.CreateAdd(X, Y);
3331 };
3332
3333 auto CreateMul = [&B](Value *X, Value *Y) {
3334 assert(X->getType() == Y->getType() && "Types don't match!");
3335 if (auto *CX = dyn_cast<ConstantInt>(X))
3336 if (CX->isOne())
3337 return Y;
3338 if (auto *CY = dyn_cast<ConstantInt>(Y))
3339 if (CY->isOne())
3340 return X;
3341 return B.CreateMul(X, Y);
3342 };
3343
3344 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3345 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3346 // the DomTree is not kept up-to-date for additional blocks generated in the
3347 // vector loop. By using the header as insertion point, we guarantee that the
3348 // expanded instructions dominate all their uses.
3349 auto GetInsertPoint = [this, &B]() {
3350 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3351 if (InsertBB != LoopVectorBody &&
3352 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3353 return LoopVectorBody->getTerminator();
3354 return &*B.GetInsertPoint();
3355 };
3356
3357 switch (ID.getKind()) {
3358 case InductionDescriptor::IK_IntInduction: {
3359 assert(Index->getType() == StartValue->getType() &&
3360 "Index type does not match StartValue type");
3361 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3362 return B.CreateSub(StartValue, Index);
3363 auto *Offset = CreateMul(
3364 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3365 return CreateAdd(StartValue, Offset);
3366 }
3367 case InductionDescriptor::IK_PtrInduction: {
3368 assert(isa<SCEVConstant>(Step) &&
3369 "Expected constant step for pointer induction");
3370 return B.CreateGEP(
3371 StartValue->getType()->getPointerElementType(), StartValue,
3372 CreateMul(Index,
3373 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3374 }
3375 case InductionDescriptor::IK_FpInduction: {
3376 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3377 auto InductionBinOp = ID.getInductionBinOp();
3378 assert(InductionBinOp &&
3379 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3380 InductionBinOp->getOpcode() == Instruction::FSub) &&
3381 "Original bin op should be defined for FP induction");
3382
3383 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3384 Value *MulExp = B.CreateFMul(StepValue, Index);
3385 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3386 "induction");
3387 }
3388 case InductionDescriptor::IK_NoInduction:
3389 return nullptr;
3390 }
3391 llvm_unreachable("invalid enum");
3392}
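
A scalar sketch of the integer-induction transform above, Start + Step * Index, with the same trivial foldings as the CreateAdd/CreateMul lambdas and the Step == -1 shortcut; the helper name transformedIntIndex is editorial, not LLVM API.

#include <cstdint>
static int64_t transformedIntIndex(int64_t Start, int64_t Step, int64_t Index) {
  if (Step == -1)
    return Start - Index;                               // CreateSub shortcut
  int64_t Offset = (Step == 1) ? Index : Step * Index;  // CreateMul folding
  return (Offset == 0) ? Start : Start + Offset;        // CreateAdd folding
}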
3393
3394Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3395 LoopScalarBody = OrigLoop->getHeader();
3396 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3397 LoopExitBlock = OrigLoop->getUniqueExitBlock();
3398 assert(LoopExitBlock && "Must have an exit block");
3399 assert(LoopVectorPreHeader && "Invalid loop structure");
3400
3401 LoopMiddleBlock =
3402 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3403 LI, nullptr, Twine(Prefix) + "middle.block");
3404 LoopScalarPreHeader =
3405 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3406 nullptr, Twine(Prefix) + "scalar.ph");
3407
3408 // Set up branch from middle block to the exit and scalar preheader blocks.
3409 // completeLoopSkeleton will update the condition to use an iteration check,
3410 // if required to decide whether to execute the remainder.
3411 BranchInst *BrInst =
3412 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3413 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3414 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3415 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3416
3417 // We intentionally don't let SplitBlock to update LoopInfo since
3418 // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3419 // LoopVectorBody is explicitly added to the correct place few lines later.
3420 LoopVectorBody =
3421 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3422 nullptr, nullptr, Twine(Prefix) + "vector.body");
3423
3424 // Update dominator for loop exit.
3425 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3426
3427 // Create and register the new vector loop.
3428 Loop *Lp = LI->AllocateLoop();
3429 Loop *ParentLoop = OrigLoop->getParentLoop();
3430
3431 // Insert the new loop into the loop nest and register the new basic blocks
3432 // before calling any utilities such as SCEV that require valid LoopInfo.
3433 if (ParentLoop) {
3434 ParentLoop->addChildLoop(Lp);
3435 } else {
3436 LI->addTopLevelLoop(Lp);
3437 }
3438 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3439 return Lp;
3440}
3441
3442void InnerLoopVectorizer::createInductionResumeValues(
3443 Loop *L, Value *VectorTripCount,
3444 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3445 assert(VectorTripCount && L && "Expected valid arguments");
3446 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3447 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3448 "Inconsistent information about additional bypass.");
3449 // We are going to resume the execution of the scalar loop.
3450 // Go over all of the induction variables that we found and fix the
3451 // PHIs that are left in the scalar version of the loop.
3452 // The starting values of PHI nodes depend on the counter of the last
3453 // iteration in the vectorized loop.
3454 // If we come from a bypass edge then we need to start from the original
3455 // start value.
3456 for (auto &InductionEntry : Legal->getInductionVars()) {
3457 PHINode *OrigPhi = InductionEntry.first;
3458 InductionDescriptor II = InductionEntry.second;
3459
3460 // Create phi nodes to merge from the backedge-taken check block.
3461 PHINode *BCResumeVal =
3462 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3463 LoopScalarPreHeader->getTerminator());
3464 // Copy original phi DL over to the new one.
3465 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3466 Value *&EndValue = IVEndValues[OrigPhi];
3467 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3468 if (OrigPhi == OldInduction) {
3469 // We know what the end value is.
3470 EndValue = VectorTripCount;
3471 } else {
3472 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3473
3474 // Fast-math-flags propagate from the original induction instruction.
3475 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3476 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3477
3478 Type *StepType = II.getStep()->getType();
3479 Instruction::CastOps CastOp =
3480 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3481 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3482 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3483 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3484 EndValue->setName("ind.end");
3485
3486 // Compute the end value for the additional bypass (if applicable).
3487 if (AdditionalBypass.first) {
3488 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3489 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3490 StepType, true);
3491 CRD =
3492 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3493 EndValueFromAdditionalBypass =
3494 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3495 EndValueFromAdditionalBypass->setName("ind.end");
3496 }
3497 }
3498 // The new PHI merges the original incoming value, in case of a bypass,
3499 // or the value at the end of the vectorized loop.
3500 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3501
3502 // Fix the scalar body counter (PHI node).
3503 // The old induction's phi node in the scalar body needs the truncated
3504 // value.
3505 for (BasicBlock *BB : LoopBypassBlocks)
3506 BCResumeVal->addIncoming(II.getStartValue(), BB);
3507
3508 if (AdditionalBypass.first)
3509 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3510 EndValueFromAdditionalBypass);
3511
3512 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3513 }
3514}
3515
3516BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3517 MDNode *OrigLoopID) {
3518 assert(L && "Expected valid loop.");
3519
3520 // The trip counts should be cached by now.
3521 Value *Count = getOrCreateTripCount(L);
3522 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3523
3524 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3525
3526 // Add a check in the middle block to see if we have completed
3527 // all of the iterations in the first vector loop.
3528 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3529 // If tail is to be folded, we know we don't need to run the remainder.
3530 if (!Cost->foldTailByMasking()) {
3531 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3532 Count, VectorTripCount, "cmp.n",
3533 LoopMiddleBlock->getTerminator());
3534
3535 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3536 // of the corresponding compare because they may have ended up with
3537 // different line numbers and we want to avoid awkward line stepping while
3538 // debugging. E.g. if the compare got a line number inside the loop.
3539 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3540 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3541 }
3542
3543 // Get ready to start creating new instructions into the vectorized body.
3544 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3545 "Inconsistent vector loop preheader");
3546 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3547
3548 Optional<MDNode *> VectorizedLoopID =
3549 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3550 LLVMLoopVectorizeFollowupVectorized});
3551 if (VectorizedLoopID.hasValue()) {
3552 L->setLoopID(VectorizedLoopID.getValue());
3553
3554 // Do not setAlreadyVectorized if loop attributes have been defined
3555 // explicitly.
3556 return LoopVectorPreHeader;
3557 }
3558
3559 // Keep all loop hints from the original loop on the vector loop (we'll
3560 // replace the vectorizer-specific hints below).
3561 if (MDNode *LID = OrigLoop->getLoopID())
3562 L->setLoopID(LID);
3563
3564 LoopVectorizeHints Hints(L, true, *ORE);
3565 Hints.setAlreadyVectorized();
3566
3567#ifdef EXPENSIVE_CHECKS
3568 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3569 LI->verify(*DT);
3570#endif
3571
3572 return LoopVectorPreHeader;
3573}
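
A small sketch of the middle-block decision above with scalar integers; the helper name runScalarRemainder is editorial, not LLVM API.

#include <cstdint>
// The scalar remainder loop runs only when the vector trip count (N - N % VF*UF)
// did not cover the full trip count; with tail folding it is never needed.
static bool runScalarRemainder(uint64_t TripCount, uint64_t VectorTripCount,
                               bool FoldTail) {
  if (FoldTail)
    return false;
  return TripCount != VectorTripCount;   // negation of the "cmp.n" equality
}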
3574
3575BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3576 /*
3577 In this function we generate a new loop. The new loop will contain
3578 the vectorized instructions while the old loop will continue to run the
3579 scalar remainder.
3580
3581 [ ] <-- loop iteration number check.
3582 / |
3583 / v
3584 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3585 | / |
3586 | / v
3587 || [ ] <-- vector pre header.
3588 |/ |
3589 | v
3590 | [ ] \
3591 | [ ]_| <-- vector loop.
3592 | |
3593 | v
3594 | -[ ] <--- middle-block.
3595 | / |
3596 | / v
3597 -|- >[ ] <--- new preheader.
3598 | |
3599 | v
3600 | [ ] \
3601 | [ ]_| <-- old scalar loop to handle remainder.
3602 \ |
3603 \ v
3604 >[ ] <-- exit block.
3605 ...
3606 */
3607
3608 // Get the metadata of the original loop before it gets modified.
3609 MDNode *OrigLoopID = OrigLoop->getLoopID();
3610
3611 // Create an empty vector loop, and prepare basic blocks for the runtime
3612 // checks.
3613 Loop *Lp = createVectorLoopSkeleton("");
3614
3615 // Now, compare the new count to zero. If it is zero skip the vector loop and
3616 // jump to the scalar loop. This check also covers the case where the
3617 // backedge-taken count is uint##_max: adding one to it will overflow leading
3618 // to an incorrect trip count of zero. In this (rare) case we will also jump
3619 // to the scalar loop.
3620 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3621
3622 // Generate the code to check any assumptions that we've made for SCEV
3623 // expressions.
3624 emitSCEVChecks(Lp, LoopScalarPreHeader);
3625
3626 // Generate the code that checks at runtime whether arrays overlap. We put the
3627 // checks into a separate block to make the more common case of few elements
3628 // faster.
3629 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3630
3631 // Some loops have a single integer induction variable, while other loops
3632 // don't. One example is c++ iterators that often have multiple pointer
3633 // induction variables. In the code below we also support a case where we
3634 // don't have a single induction variable.
3635 //
3636 // We try to obtain an induction variable from the original loop as hard
3637 // as possible. However if we don't find one that:
3638 // - is an integer
3639 // - counts from zero, stepping by one
3640 // - is the size of the widest induction variable type
3641 // then we create a new one.
3642 OldInduction = Legal->getPrimaryInduction();
3643 Type *IdxTy = Legal->getWidestInductionType();
3644 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3645 // The loop step is equal to the vectorization factor (num of SIMD elements)
3646 // times the unroll factor (num of SIMD instructions).
3647 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3648 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3649 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3650 Induction =
3651 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3652 getDebugLocFromInstOrOperands(OldInduction));
3653
3654 // Emit phis for the new starting index of the scalar loop.
3655 createInductionResumeValues(Lp, CountRoundDown);
3656
3657 return completeLoopSkeleton(Lp, OrigLoopID);
3658}
3659
3660// Fix up external users of the induction variable. At this point, we are
3661// in LCSSA form, with all external PHIs that use the IV having one input value,
3662// coming from the remainder loop. We need those PHIs to also have a correct
3663// value for the IV when arriving directly from the middle block.
3664void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3665 const InductionDescriptor &II,
3666 Value *CountRoundDown, Value *EndValue,
3667 BasicBlock *MiddleBlock) {
3668 // There are two kinds of external IV usages - those that use the value
3669 // computed in the last iteration (the PHI) and those that use the penultimate
3670 // value (the value that feeds into the phi from the loop latch).
3671 // We allow both, but they, obviously, have different values.
3672
3673 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3674
3675 DenseMap<Value *, Value *> MissingVals;
3676
3677 // An external user of the last iteration's value should see the value that
3678 // the remainder loop uses to initialize its own IV.
3679 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3680 for (User *U : PostInc->users()) {
3681 Instruction *UI = cast<Instruction>(U);
3682 if (!OrigLoop->contains(UI)) {
3683 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3684 MissingVals[UI] = EndValue;
3685 }
3686 }
3687
3688 // An external user of the penultimate value needs to see EndValue - Step.
3689 // The simplest way to get this is to recompute it from the constituent SCEVs,
3690 // that is Start + (Step * (CRD - 1)).
3691 for (User *U : OrigPhi->users()) {
3692 auto *UI = cast<Instruction>(U);
3693 if (!OrigLoop->contains(UI)) {
3694 const DataLayout &DL =
3695 OrigLoop->getHeader()->getModule()->getDataLayout();
3696 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3697
3698 IRBuilder<> B(MiddleBlock->getTerminator());
3699
3700 // Fast-math-flags propagate from the original induction instruction.
3701 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3702 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3703
3704 Value *CountMinusOne = B.CreateSub(
3705 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3706 Value *CMO =
3707 !II.getStep()->getType()->isIntegerTy()
3708 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3709 II.getStep()->getType())
3710 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3711 CMO->setName("cast.cmo");
3712 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3713 Escape->setName("ind.escape");
3714 MissingVals[UI] = Escape;
3715 }
3716 }
3717
3718 for (auto &I : MissingVals) {
3719 PHINode *PHI = cast<PHINode>(I.first);
3720 // One corner case we have to handle is two IVs "chasing" each-other,
3721 // that is %IV2 = phi [...], [ %IV1, %latch ]
3722 // In this case, if IV1 has an external use, we need to avoid adding both
3723 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3724 // don't already have an incoming value for the middle block.
3725 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3726 PHI->addIncoming(I.second, MiddleBlock);
3727 }
3728}
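
A one-line scalar sketch of the escape value computed above for an external user of the penultimate IV value, i.e. Start + Step * (VectorTripCount - 1); the helper name penultimateIVValue is editorial, not LLVM API.

#include <cstdint>
static int64_t penultimateIVValue(int64_t Start, int64_t Step,
                                  int64_t VectorTripCount) {
  // One step before the end value that the remainder loop would start from.
  return Start + Step * (VectorTripCount - 1);
}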
3729
3730namespace {
3731
3732struct CSEDenseMapInfo {
3733 static bool canHandle(const Instruction *I) {
3734 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3735 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3736 }
3737
3738 static inline Instruction *getEmptyKey() {
3739 return DenseMapInfo<Instruction *>::getEmptyKey();
3740 }
3741
3742 static inline Instruction *getTombstoneKey() {
3743 return DenseMapInfo<Instruction *>::getTombstoneKey();
3744 }
3745
3746 static unsigned getHashValue(const Instruction *I) {
3747 assert(canHandle(I) && "Unknown instruction!");
3748 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3749 I->value_op_end()));
3750 }
3751
3752 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3753 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3754 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3755 return LHS == RHS;
3756 return LHS->isIdenticalTo(RHS);
3757 }
3758};
3759
3760} // end anonymous namespace
3761
3762/// Perform CSE of induction variable instructions.
3763static void cse(BasicBlock *BB) {
3764 // Perform simple cse.
3765 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3766 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3767 Instruction *In = &*I++;
3768
3769 if (!CSEDenseMapInfo::canHandle(In))
3770 continue;
3771
3772 // Check if we can replace this instruction with any of the
3773 // visited instructions.
3774 if (Instruction *V = CSEMap.lookup(In)) {
3775 In->replaceAllUsesWith(V);
3776 In->eraseFromParent();
3777 continue;
3778 }
3779
3780 CSEMap[In] = In;
3781 }
3782}
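
A standalone sketch of the same single-pass CSE idea on strings instead of instructions; the helper name cseByIdentity and the use of std::string as a stand-in "instruction" are editorial assumptions (the real code additionally rewrites uses before erasing).

#include <string>
#include <unordered_set>
#include <vector>
static void cseByIdentity(std::vector<std::string> &Block) {
  std::unordered_set<std::string> Seen;   // plays the role of CSEMap
  std::vector<std::string> Kept;
  for (const std::string &Inst : Block) {
    if (!Seen.insert(Inst).second)
      continue;                           // identical instruction already visited
    Kept.push_back(Inst);
  }
  Block = std::move(Kept);
}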
3783
3784InstructionCost
3785LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3786 bool &NeedToScalarize) const {
3787 Function *F = CI->getCalledFunction();
3788 Type *ScalarRetTy = CI->getType();
3789 SmallVector<Type *, 4> Tys, ScalarTys;
3790 for (auto &ArgOp : CI->arg_operands())
3791 ScalarTys.push_back(ArgOp->getType());
3792
3793 // Estimate cost of scalarized vector call. The source operands are assumed
3794 // to be vectors, so we need to extract individual elements from there,
3795 // execute VF scalar calls, and then gather the result into the vector return
3796 // value.
3797 InstructionCost ScalarCallCost =
3798 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3799 if (VF.isScalar())
3800 return ScalarCallCost;
3801
3802 // Compute corresponding vector type for return value and arguments.
3803 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3804 for (Type *ScalarTy : ScalarTys)
3805 Tys.push_back(ToVectorTy(ScalarTy, VF));
3806
3807 // Compute costs of unpacking argument values for the scalar calls and
3808 // packing the return values to a vector.
3809 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3810
3811 InstructionCost Cost =
3812 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3813
3814 // If we can't emit a vector call for this function, then the currently found
3815 // cost is the cost we need to return.
3816 NeedToScalarize = true;
3817 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3818 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3819
3820 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3821 return Cost;
3822
3823 // If the corresponding vector cost is cheaper, return its cost.
3824 InstructionCost VectorCallCost =
3825 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3826 if (VectorCallCost < Cost) {
3827 NeedToScalarize = false;
3828 Cost = VectorCallCost;
3829 }
3830 return Cost;
3831}
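
A scalar sketch of the cost decision above; the helper name chooseCallCost and its parameters are editorial, not LLVM's cost-model API.

// Scalarized cost is VF scalar calls plus the extract/insert overhead; a
// target-provided vector variant wins only if it is strictly cheaper.
static unsigned chooseCallCost(unsigned ScalarCallCost, unsigned VF,
                               unsigned ScalarizationOverhead,
                               bool HasVectorVariant, unsigned VectorCallCost,
                               bool &NeedToScalarize) {
  unsigned Cost = ScalarCallCost * VF + ScalarizationOverhead;
  NeedToScalarize = true;
  if (HasVectorVariant && VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}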
3832
3833static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3834 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3835 return Elt;
3836 return VectorType::get(Elt, VF);
3837}
3838
3839InstructionCost
3840LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3841 ElementCount VF) const {
3842 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3843 assert(ID && "Expected intrinsic call!");
3844 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3845 FastMathFlags FMF;
3846 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3847 FMF = FPMO->getFastMathFlags();
3848
3849 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3850 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3851 SmallVector<Type *> ParamTys;
3852 std::transform(FTy->param_begin(), FTy->param_end(),
3853 std::back_inserter(ParamTys),
3854 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3855
3856 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3857 dyn_cast<IntrinsicInst>(CI));
3858 return TTI.getIntrinsicInstrCost(CostAttrs,
3859 TargetTransformInfo::TCK_RecipThroughput);
3860}
3861
3862static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3863 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3864 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3865 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3866}
3867
3868static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3869 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3870 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3871 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3872}
3873
3874void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3875 // For every instruction `I` in MinBWs, truncate the operands, create a
3876 // truncated version of `I` and reextend its result. InstCombine runs
3877 // later and will remove any ext/trunc pairs.
3878 SmallPtrSet<Value *, 4> Erased;
3879 for (const auto &KV : Cost->getMinimalBitwidths()) {
3880 // If the value wasn't vectorized, we must maintain the original scalar
3881 // type. The absence of the value from State indicates that it
3882 // wasn't vectorized.
3883 VPValue *Def = State.Plan->getVPValue(KV.first);
3884 if (!State.hasAnyVectorValue(Def))
3885 continue;
3886 for (unsigned Part = 0; Part < UF; ++Part) {
3887 Value *I = State.get(Def, Part);
3888 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3889 continue;
3890 Type *OriginalTy = I->getType();
3891 Type *ScalarTruncatedTy =
3892 IntegerType::get(OriginalTy->getContext(), KV.second);
3893 auto *TruncatedTy = FixedVectorType::get(
3894 ScalarTruncatedTy,
3895 cast<FixedVectorType>(OriginalTy)->getNumElements());
3896 if (TruncatedTy == OriginalTy)
3897 continue;
3898
3899 IRBuilder<> B(cast<Instruction>(I));
3900 auto ShrinkOperand = [&](Value *V) -> Value * {
3901 if (auto *ZI = dyn_cast<ZExtInst>(V))
3902 if (ZI->getSrcTy() == TruncatedTy)
3903 return ZI->getOperand(0);
3904 return B.CreateZExtOrTrunc(V, TruncatedTy);
3905 };
3906
3907 // The actual instruction modification depends on the instruction type,
3908 // unfortunately.
3909 Value *NewI = nullptr;
3910 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3911 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3912 ShrinkOperand(BO->getOperand(1)));
3913
3914 // Any wrapping introduced by shrinking this operation shouldn't be
3915 // considered undefined behavior. So, we can't unconditionally copy
3916 // arithmetic wrapping flags to NewI.
3917 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3918 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3919 NewI =
3920 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3921 ShrinkOperand(CI->getOperand(1)));
3922 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3923 NewI = B.CreateSelect(SI->getCondition(),
3924 ShrinkOperand(SI->getTrueValue()),
3925 ShrinkOperand(SI->getFalseValue()));
3926 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3927 switch (CI->getOpcode()) {
3928 default:
3929 llvm_unreachable("Unhandled cast!");
3930 case Instruction::Trunc:
3931 NewI = ShrinkOperand(CI->getOperand(0));
3932 break;
3933 case Instruction::SExt:
3934 NewI = B.CreateSExtOrTrunc(
3935 CI->getOperand(0),
3936 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3937 break;
3938 case Instruction::ZExt:
3939 NewI = B.CreateZExtOrTrunc(
3940 CI->getOperand(0),
3941 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3942 break;
3943 }
3944 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3945 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3946 ->getNumElements();
3947 auto *O0 = B.CreateZExtOrTrunc(
3948 SI->getOperand(0),
3949 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3950 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3951 ->getNumElements();
3952 auto *O1 = B.CreateZExtOrTrunc(
3953 SI->getOperand(1),
3954 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3955
3956 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3957 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3958 // Don't do anything with the operands, just extend the result.
3959 continue;
3960 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3961 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3962 ->getNumElements();
3963 auto *O0 = B.CreateZExtOrTrunc(
3964 IE->getOperand(0),
3965 FixedVectorType::get(ScalarTruncatedTy, Elements));
3966 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3967 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3968 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3969 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3970 ->getNumElements();
3971 auto *O0 = B.CreateZExtOrTrunc(
3972 EE->getOperand(0),
3973 FixedVectorType::get(ScalarTruncatedTy, Elements));
3974 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3975 } else {
3976 // If we don't know what to do, be conservative and don't do anything.
3977 continue;
3978 }
3979
3980 // Lastly, extend the result.
3981 NewI->takeName(cast<Instruction>(I));
3982 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3983 I->replaceAllUsesWith(Res);
3984 cast<Instruction>(I)->eraseFromParent();
3985 Erased.insert(I);
3986 State.reset(Def, Res, Part);
3987 }
3988 }
3989
3990 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3991 for (const auto &KV : Cost->getMinimalBitwidths()) {
3992 // If the value wasn't vectorized, we must maintain the original scalar
3993 // type. The absence of the value from State indicates that it
3994 // wasn't vectorized.
3995 VPValue *Def = State.Plan->getVPValue(KV.first);
3996 if (!State.hasAnyVectorValue(Def))
3997 continue;
3998 for (unsigned Part = 0; Part < UF; ++Part) {
3999 Value *I = State.get(Def, Part);
4000 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4001 if (Inst && Inst->use_empty()) {
4002 Value *NewI = Inst->getOperand(0);
4003 Inst->eraseFromParent();
4004 State.reset(Def, NewI, Part);
4005 }
4006 }
4007 }
4008}
4009
4010void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4011 // Insert truncates and extends for any truncated instructions as hints to
4012 // InstCombine.
4013 if (VF.isVector())
4014 truncateToMinimalBitwidths(State);
4015
4016 // Fix widened non-induction PHIs by setting up the PHI operands.
4017 if (OrigPHIsToFix.size()) {
4018    assert(EnableVPlanNativePath &&
4019           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4020 fixNonInductionPHIs(State);
4021 }
4022
4023 // At this point every instruction in the original loop is widened to a
4024 // vector form. Now we need to fix the recurrences in the loop. These PHI
4025 // nodes are currently empty because we did not want to introduce cycles.
4026 // This is the second stage of vectorizing recurrences.
4027 fixCrossIterationPHIs(State);
4028
4029 // Forget the original basic block.
4030 PSE.getSE()->forgetLoop(OrigLoop);
4031
4032 // Fix-up external users of the induction variables.
4033 for (auto &Entry : Legal->getInductionVars())
4034 fixupIVUsers(Entry.first, Entry.second,
4035 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4036 IVEndValues[Entry.first], LoopMiddleBlock);
4037
4038 fixLCSSAPHIs(State);
4039 for (Instruction *PI : PredicatedInstructions)
4040 sinkScalarOperands(&*PI);
4041
4042 // Remove redundant induction instructions.
4043 cse(LoopVectorBody);
4044
4045 // Set/update profile weights for the vector and remainder loops as original
4046 // loop iterations are now distributed among them. Note that original loop
4047 // represented by LoopScalarBody becomes remainder loop after vectorization.
4048 //
4049  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4050  // end up getting a slightly less accurate result, but that should be OK since
4051  // the profile is not inherently precise anyway. Note also that a possible
4052  // bypass of the vector code caused by legality checks is ignored,
4053  // optimistically assigning all the weight to the vector loop.
4054 //
4055 // For scalable vectorization we can't know at compile time how many iterations
4056 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4057 // vscale of '1'.
4058 setProfileInfoAfterUnrolling(
4059 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4060 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4061}
4062
4063void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4064 // In order to support recurrences we need to be able to vectorize Phi nodes.
4065 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4066 // stage #2: We now need to fix the recurrences by adding incoming edges to
4067 // the currently empty PHI nodes. At this point every instruction in the
4068 // original loop is widened to a vector form so we can use them to construct
4069 // the incoming edges.
4070 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4071 // Handle first-order recurrences and reductions that need to be fixed.
4072 if (Legal->isFirstOrderRecurrence(&Phi))
4073 fixFirstOrderRecurrence(&Phi, State);
4074 else if (Legal->isReductionVariable(&Phi))
4075 fixReduction(&Phi, State);
4076 }
4077}
4078
4079void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4080 VPTransformState &State) {
4081 // This is the second phase of vectorizing first-order recurrences. An
4082 // overview of the transformation is described below. Suppose we have the
4083 // following loop.
4084 //
4085 // for (int i = 0; i < n; ++i)
4086 // b[i] = a[i] - a[i - 1];
4087 //
4088 // There is a first-order recurrence on "a". For this loop, the shorthand
4089 // scalar IR looks like:
4090 //
4091 // scalar.ph:
4092 // s_init = a[-1]
4093 // br scalar.body
4094 //
4095 // scalar.body:
4096 // i = phi [0, scalar.ph], [i+1, scalar.body]
4097 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4098 // s2 = a[i]
4099 // b[i] = s2 - s1
4100 // br cond, scalar.body, ...
4101 //
4102  // In this example, s1 is a recurrence because its value depends on the
4103 // previous iteration. In the first phase of vectorization, we created a
4104 // temporary value for s1. We now complete the vectorization and produce the
4105 // shorthand vector IR shown below (for VF = 4, UF = 1).
4106 //
4107 // vector.ph:
4108 // v_init = vector(..., ..., ..., a[-1])
4109 // br vector.body
4110 //
4111 // vector.body
4112 // i = phi [0, vector.ph], [i+4, vector.body]
4113 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4114 // v2 = a[i, i+1, i+2, i+3];
4115 // v3 = vector(v1(3), v2(0, 1, 2))
4116 // b[i, i+1, i+2, i+3] = v2 - v3
4117 // br cond, vector.body, middle.block
4118 //
4119 // middle.block:
4120 // x = v2(3)
4121 // br scalar.ph
4122 //
4123 // scalar.ph:
4124 // s_init = phi [x, middle.block], [a[-1], otherwise]
4125 // br scalar.body
4126 //
4127  // After the vector loop completes execution, we extract the next value of
4128 // the recurrence (x) to use as the initial value in the scalar loop.
4129
4130 // Get the original loop preheader and single loop latch.
4131 auto *Preheader = OrigLoop->getLoopPreheader();
4132 auto *Latch = OrigLoop->getLoopLatch();
4133
4134 // Get the initial and previous values of the scalar recurrence.
4135 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4136 auto *Previous = Phi->getIncomingValueForBlock(Latch);
4137
4138 // Create a vector from the initial value.
4139 auto *VectorInit = ScalarInit;
4140 if (VF.isVector()) {
4141 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4142    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4143 VectorInit = Builder.CreateInsertElement(
4144 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4145 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4146 }
4147
4148 VPValue *PhiDef = State.Plan->getVPValue(Phi);
4149 VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4150 // We constructed a temporary phi node in the first phase of vectorization.
4151 // This phi node will eventually be deleted.
4152 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4153
4154 // Create a phi node for the new recurrence. The current value will either be
4155 // the initial value inserted into a vector or loop-varying vector value.
4156 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4157 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4158
4159 // Get the vectorized previous value of the last part UF - 1. It appears last
4160 // among all unrolled iterations, due to the order of their construction.
4161 Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4162
4163 // Find and set the insertion point after the previous value if it is an
4164 // instruction.
4165 BasicBlock::iterator InsertPt;
4166 // Note that the previous value may have been constant-folded so it is not
4167 // guaranteed to be an instruction in the vector loop.
4168 // FIXME: Loop invariant values do not form recurrences. We should deal with
4169 // them earlier.
4170 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4171 InsertPt = LoopVectorBody->getFirstInsertionPt();
4172 else {
4173 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4174 if (isa<PHINode>(PreviousLastPart))
4175 // If the previous value is a phi node, we should insert after all the phi
4176 // nodes in the block containing the PHI to avoid breaking basic block
4177 // verification. Note that the basic block may be different to
4178 // LoopVectorBody, in case we predicate the loop.
4179 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4180 else
4181 InsertPt = ++PreviousInst->getIterator();
4182 }
4183 Builder.SetInsertPoint(&*InsertPt);
4184
4185 // We will construct a vector for the recurrence by combining the values for
4186 // the current and previous iterations. This is the required shuffle mask.
4187  assert(!VF.isScalable());
4188 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4189 ShuffleMask[0] = VF.getKnownMinValue() - 1;
4190 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4191 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
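  // Editor's note (worked example, not in the original source): for VF = 4 the
  // mask built above is <3, 4, 5, 6>, i.e. the last lane of `Incoming` followed
  // by the first three lanes of the previous value, matching the
  // "v3 = vector(v1(3), v2(0, 1, 2))" shorthand in the comment above.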
4192
4193 // The vector from which to take the initial value for the current iteration
4194 // (actual or unrolled). Initially, this is the vector phi node.
4195 Value *Incoming = VecPhi;
4196
4197 // Shuffle the current and previous vector and update the vector parts.
4198 for (unsigned Part = 0; Part < UF; ++Part) {
4199 Value *PreviousPart = State.get(PreviousDef, Part);
4200 Value *PhiPart = State.get(PhiDef, Part);
4201 auto *Shuffle =
4202 VF.isVector()
4203 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4204 : Incoming;
4205 PhiPart->replaceAllUsesWith(Shuffle);
4206 cast<Instruction>(PhiPart)->eraseFromParent();
4207 State.reset(PhiDef, Shuffle, Part);
4208 Incoming = PreviousPart;
4209 }
4210
4211 // Fix the latch value of the new recurrence in the vector loop.
4212 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4213
4214 // Extract the last vector element in the middle block. This will be the
4215 // initial value for the recurrence when jumping to the scalar loop.
4216 auto *ExtractForScalar = Incoming;
4217 if (VF.isVector()) {
4218 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4219 ExtractForScalar = Builder.CreateExtractElement(
4220 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4221 "vector.recur.extract");
4222 }
4223 // Extract the second last element in the middle block if the
4224 // Phi is used outside the loop. We need to extract the phi itself
4225 // and not the last element (the phi update in the current iteration). This
4226 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4227 // when the scalar loop is not run at all.
4228 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4229 if (VF.isVector())
4230 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4231 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4232 "vector.recur.extract.for.phi");
4233  // When the loop is unrolled without vectorizing, initialize
4234  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
4235  // of `Incoming`. This is analogous to the vectorized case above: extracting the
4236  // second last element when VF > 1.
4237 else if (UF > 1)
4238 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4239
4240 // Fix the initial value of the original recurrence in the scalar loop.
4241 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4242 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4243 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4244 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4245 Start->addIncoming(Incoming, BB);
4246 }
4247
4248 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4249 Phi->setName("scalar.recur");
4250
4251 // Finally, fix users of the recurrence outside the loop. The users will need
4252 // either the last value of the scalar recurrence or the last value of the
4253 // vector recurrence we extracted in the middle block. Since the loop is in
4254 // LCSSA form, we just need to find all the phi nodes for the original scalar
4255 // recurrence in the exit block, and then add an edge for the middle block.
4256 // Note that LCSSA does not imply single entry when the original scalar loop
4257 // had multiple exiting edges (as we always run the last iteration in the
4258 // scalar epilogue); in that case, the exiting path through middle will be
4259 // dynamically dead and the value picked for the phi doesn't matter.
4260 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4261 if (any_of(LCSSAPhi.incoming_values(),
4262 [Phi](Value *V) { return V == Phi; }))
4263 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4264}
4265
4266static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4267 return EnableStrictReductions && RdxDesc.isOrdered();
4268}
4269
4270void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4271  // Get its reduction variable descriptor.
4272  assert(Legal->isReductionVariable(Phi) &&
4273         "Unable to find the reduction variable");
4274 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4275
4276 RecurKind RK = RdxDesc.getRecurrenceKind();
4277 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4278 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4279 setDebugLocFromInst(Builder, ReductionStartValue);
4280 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4281
4282 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4283 // This is the vector-clone of the value that leaves the loop.
4284 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4285
4286 // Wrap flags are in general invalid after vectorization, clear them.
4287 clearReductionWrapFlags(RdxDesc, State);
4288
4289 // Fix the vector-loop phi.
4290
4291 // Reductions do not have to start at zero. They can start with
4292 // any loop invariant values.
4293 BasicBlock *Latch = OrigLoop->getLoopLatch();
4294 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4295
4296 for (unsigned Part = 0; Part < UF; ++Part) {
4297 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4298 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4299 if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) &&
4300 State.VF.isVector())
4301 Val = State.get(State.Plan->getVPValue(LoopVal), UF - 1);
4302 cast<PHINode>(VecRdxPhi)
4303 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4304 }
4305
4306 // Before each round, move the insertion point right between
4307 // the PHIs and the values we are going to write.
4308 // This allows us to write both PHINodes and the extractelement
4309 // instructions.
4310 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4311
4312 setDebugLocFromInst(Builder, LoopExitInst);
4313
4314 Type *PhiTy = Phi->getType();
4315 // If tail is folded by masking, the vector value to leave the loop should be
4316 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4317 // instead of the former. For an inloop reduction the reduction will already
4318 // be predicated, and does not need to be handled here.
4319 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4320 for (unsigned Part = 0; Part < UF; ++Part) {
4321 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4322 Value *Sel = nullptr;
4323 for (User *U : VecLoopExitInst->users()) {
4324 if (isa<SelectInst>(U)) {
4325          assert(!Sel && "Reduction exit feeding two selects");
4326 Sel = U;
4327 } else
4328          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4329 }
4330      assert(Sel && "Reduction exit feeds no select");
4331 State.reset(LoopExitInstDef, Sel, Part);
4332
4333 // If the target can create a predicated operator for the reduction at no
4334 // extra cost in the loop (for example a predicated vadd), it can be
4335 // cheaper for the select to remain in the loop than be sunk out of it,
4336 // and so use the select value for the phi instead of the old
4337 // LoopExitValue.
4338 if (PreferPredicatedReductionSelect ||
4339 TTI->preferPredicatedReductionSelect(
4340 RdxDesc.getOpcode(), PhiTy,
4341 TargetTransformInfo::ReductionFlags())) {
4342 auto *VecRdxPhi =
4343 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4344 VecRdxPhi->setIncomingValueForBlock(
4345 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4346 }
4347 }
4348 }
4349
4350 // If the vector reduction can be performed in a smaller type, we truncate
4351 // then extend the loop exit value to enable InstCombine to evaluate the
4352 // entire expression in the smaller type.
4353 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4354    assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4355    assert(!VF.isScalable() && "scalable vectors not yet supported.");
4356 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4357 Builder.SetInsertPoint(
4358 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4359 VectorParts RdxParts(UF);
4360 for (unsigned Part = 0; Part < UF; ++Part) {
4361 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4362 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4363 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4364 : Builder.CreateZExt(Trunc, VecTy);
4365 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4366 UI != RdxParts[Part]->user_end();)
4367 if (*UI != Trunc) {
4368 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4369 RdxParts[Part] = Extnd;
4370 } else {
4371 ++UI;
4372 }
4373 }
4374 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4375 for (unsigned Part = 0; Part < UF; ++Part) {
4376 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4377 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4378 }
4379 }
4380
4381 // Reduce all of the unrolled parts into a single vector.
4382 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4383 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4384
4385 // The middle block terminator has already been assigned a DebugLoc here (the
4386 // OrigLoop's single latch terminator). We want the whole middle block to
4387 // appear to execute on this line because: (a) it is all compiler generated,
4388 // (b) these instructions are always executed after evaluating the latch
4389 // conditional branch, and (c) other passes may add new predecessors which
4390 // terminate on this line. This is the easiest way to ensure we don't
4391 // accidentally cause an extra step back into the loop while debugging.
4392 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4393 if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
4394 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4395 else {
4396 // Floating-point operations should have some FMF to enable the reduction.
4397 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4398 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4399 for (unsigned Part = 1; Part < UF; ++Part) {
4400 Value *RdxPart = State.get(LoopExitInstDef, Part);
4401 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4402 ReducedPartRdx = Builder.CreateBinOp(
4403 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4404 } else {
4405 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4406 }
4407 }
4408 }
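  // Editor's note (illustration, not in the original source): for UF = 4 and an
  // integer add reduction, the loop above folds the unrolled parts into a single
  // vector value one part at a time (part0 with part1, then part2, then part3);
  // the lanes of that value are then reduced by createTargetReduction() below in
  // the non-in-loop vector case.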
4409
4410 // Create the reduction after the loop. Note that inloop reductions create the
4411 // target reduction in the loop using a Reduction recipe.
4412 if (VF.isVector() && !IsInLoopReductionPhi) {
4413 ReducedPartRdx =
4414 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4415 // If the reduction can be performed in a smaller type, we need to extend
4416 // the reduction to the wider type before we branch to the original loop.
4417 if (PhiTy != RdxDesc.getRecurrenceType())
4418 ReducedPartRdx = RdxDesc.isSigned()
4419 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4420 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4421 }
4422
4423 // Create a phi node that merges control-flow from the backedge-taken check
4424 // block and the middle block.
4425 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4426 LoopScalarPreHeader->getTerminator());
4427 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4428 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4429 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4430
4431 // Now, we need to fix the users of the reduction variable
4432 // inside and outside of the scalar remainder loop.
4433
4434 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4435 // in the exit blocks. See comment on analogous loop in
4436  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4437 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4438 if (any_of(LCSSAPhi.incoming_values(),
4439 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4440 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4441
4442 // Fix the scalar loop reduction variable with the incoming reduction sum
4443 // from the vector body and from the backedge value.
4444 int IncomingEdgeBlockIdx =
4445 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4446  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4447 // Pick the other block.
4448 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4449 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4450 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4451}
4452
4453void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4454 VPTransformState &State) {
4455 RecurKind RK = RdxDesc.getRecurrenceKind();
4456 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4457 return;
4458
4459 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4460  assert(LoopExitInstr && "null loop exit instruction");
4461 SmallVector<Instruction *, 8> Worklist;
4462 SmallPtrSet<Instruction *, 8> Visited;
4463 Worklist.push_back(LoopExitInstr);
4464 Visited.insert(LoopExitInstr);
4465
4466 while (!Worklist.empty()) {
4467 Instruction *Cur = Worklist.pop_back_val();
4468 if (isa<OverflowingBinaryOperator>(Cur))
4469 for (unsigned Part = 0; Part < UF; ++Part) {
4470 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4471 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4472 }
4473
4474 for (User *U : Cur->users()) {
4475 Instruction *UI = cast<Instruction>(U);
4476 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4477 Visited.insert(UI).second)
4478 Worklist.push_back(UI);
4479 }
4480 }
4481}
4482
4483void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4484 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4485 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4486 // Some phis were already hand updated by the reduction and recurrence
4487 // code above, leave them alone.
4488 continue;
4489
4490 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4491 // Non-instruction incoming values will have only one value.
4492
4493 VPLane Lane = VPLane::getFirstLane();
4494 if (isa<Instruction>(IncomingValue) &&
4495 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4496 VF))
4497 Lane = VPLane::getLastLaneForVF(VF);
4498
4499 // Can be a loop invariant incoming value or the last scalar value to be
4500 // extracted from the vectorized loop.
4501 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4502 Value *lastIncomingValue =
4503 OrigLoop->isLoopInvariant(IncomingValue)
4504 ? IncomingValue
4505 : State.get(State.Plan->getVPValue(IncomingValue),
4506 VPIteration(UF - 1, Lane));
4507 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4508 }
4509}
4510
4511void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4512 // The basic block and loop containing the predicated instruction.
4513 auto *PredBB = PredInst->getParent();
4514 auto *VectorLoop = LI->getLoopFor(PredBB);
4515
4516 // Initialize a worklist with the operands of the predicated instruction.
4517 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4518
4519 // Holds instructions that we need to analyze again. An instruction may be
4520 // reanalyzed if we don't yet know if we can sink it or not.
4521 SmallVector<Instruction *, 8> InstsToReanalyze;
4522
4523 // Returns true if a given use occurs in the predicated block. Phi nodes use
4524 // their operands in their corresponding predecessor blocks.
4525 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4526 auto *I = cast<Instruction>(U.getUser());
4527 BasicBlock *BB = I->getParent();
4528 if (auto *Phi = dyn_cast<PHINode>(I))
4529 BB = Phi->getIncomingBlock(
4530 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4531 return BB == PredBB;
4532 };
4533
4534 // Iteratively sink the scalarized operands of the predicated instruction
4535  // into the block we created for it. When an instruction is sunk, its
4536 // operands are then added to the worklist. The algorithm ends after one pass
4537 // through the worklist doesn't sink a single instruction.
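  // Editor's note (illustrative walk-through, not in the original source): if the
  // predicated instruction uses %b, and %b = add %a, 1 still sits outside the
  // predicated block, %b can only be sunk once all of its uses are in that block;
  // after %b is sunk, %a becomes a candidate on the next pass, which is why the
  // loop below iterates until a full pass makes no change.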
4538 bool Changed;
4539 do {
4540 // Add the instructions that need to be reanalyzed to the worklist, and
4541 // reset the changed indicator.
4542 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4543 InstsToReanalyze.clear();
4544 Changed = false;
4545
4546 while (!Worklist.empty()) {
4547 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4548
4549 // We can't sink an instruction if it is a phi node, is already in the
4550 // predicated block, is not in the loop, or may have side effects.
4551 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4552 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4553 continue;
4554
4555 // It's legal to sink the instruction if all its uses occur in the
4556 // predicated block. Otherwise, there's nothing to do yet, and we may
4557 // need to reanalyze the instruction.
4558 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4559 InstsToReanalyze.push_back(I);
4560 continue;
4561 }
4562
4563 // Move the instruction to the beginning of the predicated block, and add
4564      // its operands to the worklist.
4565 I->moveBefore(&*PredBB->getFirstInsertionPt());
4566 Worklist.insert(I->op_begin(), I->op_end());
4567
4568 // The sinking may have enabled other instructions to be sunk, so we will
4569 // need to iterate.
4570 Changed = true;
4571 }
4572 } while (Changed);
4573}
4574
4575void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4576 for (PHINode *OrigPhi : OrigPHIsToFix) {
4577 VPWidenPHIRecipe *VPPhi =
4578 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4579 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4580 // Make sure the builder has a valid insert point.
4581 Builder.SetInsertPoint(NewPhi);
4582 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4583 VPValue *Inc = VPPhi->getIncomingValue(i);
4584 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4585 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4586 }
4587 }
4588}
4589
4590void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4591 VPUser &Operands, unsigned UF,
4592 ElementCount VF, bool IsPtrLoopInvariant,
4593 SmallBitVector &IsIndexLoopInvariant,
4594 VPTransformState &State) {
4595 // Construct a vector GEP by widening the operands of the scalar GEP as
4596 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4597 // results in a vector of pointers when at least one operand of the GEP
4598 // is vector-typed. Thus, to keep the representation compact, we only use
4599 // vector-typed operands for loop-varying values.
4600
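  // Editor's note (example, not in the original source): for a GEP computing
  // a[i] with a loop-invariant base pointer %a and a loop-varying index %i,
  // only %i is widened, so the new GEP has a scalar pointer operand and a vector
  // index operand and produces a vector of pointers, per LLVM's rule that a GEP
  // with any vector-typed operand yields a vector result.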
4601 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4602 // If we are vectorizing, but the GEP has only loop-invariant operands,
4603 // the GEP we build (by only using vector-typed operands for
4604 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4605 // produce a vector of pointers, we need to either arbitrarily pick an
4606 // operand to broadcast, or broadcast a clone of the original GEP.
4607 // Here, we broadcast a clone of the original.
4608 //
4609 // TODO: If at some point we decide to scalarize instructions having
4610 // loop-invariant operands, this special case will no longer be
4611 // required. We would add the scalarization decision to
4612 // collectLoopScalars() and teach getVectorValue() to broadcast
4613 // the lane-zero scalar value.
4614 auto *Clone = Builder.Insert(GEP->clone());
4615 for (unsigned Part = 0; Part < UF; ++Part) {
4616 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4617 State.set(VPDef, EntryPart, Part);
4618 addMetadata(EntryPart, GEP);
4619 }
4620 } else {
4621 // If the GEP has at least one loop-varying operand, we are sure to
4622 // produce a vector of pointers. But if we are only unrolling, we want
4623 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4624 // produce with the code below will be scalar (if VF == 1) or vector
4625 // (otherwise). Note that for the unroll-only case, we still maintain
4626 // values in the vector mapping with initVector, as we do for other
4627 // instructions.
4628 for (unsigned Part = 0; Part < UF; ++Part) {
4629 // The pointer operand of the new GEP. If it's loop-invariant, we
4630 // won't broadcast it.
4631 auto *Ptr = IsPtrLoopInvariant
4632 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4633 : State.get(Operands.getOperand(0), Part);
4634
4635 // Collect all the indices for the new GEP. If any index is
4636 // loop-invariant, we won't broadcast it.
4637 SmallVector<Value *, 4> Indices;
4638 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4639 VPValue *Operand = Operands.getOperand(I);
4640 if (IsIndexLoopInvariant[I - 1])
4641 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4642 else
4643 Indices.push_back(State.get(Operand, Part));
4644 }
4645
4646 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4647 // but it should be a vector, otherwise.
4648 auto *NewGEP =
4649 GEP->isInBounds()
4650 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4651 Indices)
4652 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4653      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4654             "NewGEP is not a pointer vector");
4655 State.set(VPDef, NewGEP, Part);
4656 addMetadata(NewGEP, GEP);
4657 }
4658 }
4659}
4660
4661void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4662 RecurrenceDescriptor *RdxDesc,
4663 VPWidenPHIRecipe *PhiR,
4664 VPTransformState &State) {
4665 PHINode *P = cast<PHINode>(PN);
4666 if (EnableVPlanNativePath) {
4667 // Currently we enter here in the VPlan-native path for non-induction
4668 // PHIs where all control flow is uniform. We simply widen these PHIs.
4669 // Create a vector phi with no operands - the vector phi operands will be
4670 // set at the end of vector code generation.
4671 Type *VecTy = (State.VF.isScalar())
4672 ? PN->getType()
4673 : VectorType::get(PN->getType(), State.VF);
4674 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4675 State.set(PhiR, VecPhi, 0);
4676 OrigPHIsToFix.push_back(P);
4677
4678 return;
4679 }
4680
4681  assert(PN->getParent() == OrigLoop->getHeader() &&
4682         "Non-header phis should have been handled elsewhere");
4683
4684 VPValue *StartVPV = PhiR->getStartValue();
4685 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4686 // In order to support recurrences we need to be able to vectorize Phi nodes.
4687 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4688 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4689 // this value when we vectorize all of the instructions that use the PHI.
4690 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4691 Value *Iden = nullptr;
4692 bool ScalarPHI =
4693 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4694 Type *VecTy =
4695 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4696
4697 if (RdxDesc) {
4698      assert(Legal->isReductionVariable(P) && StartV &&
4699             "RdxDesc should only be set for reduction variables; in that case "
4700             "a StartV is also required");
4701 RecurKind RK = RdxDesc->getRecurrenceKind();
4702 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4703        // MinMax reductions have the start value as their identity.
4704 if (ScalarPHI) {
4705 Iden = StartV;
4706 } else {
4707 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4708 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4709 StartV = Iden =
4710 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4711 }
4712 } else {
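          // Editor's note (examples, not in the original source): the recurrence
          // identity obtained below is e.g. 0 for an integer add reduction and 1
          // for an integer multiply reduction.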
4713 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4714 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4715 Iden = IdenC;
4716
4717 if (!ScalarPHI) {
4718 Iden = ConstantVector::getSplat(State.VF, IdenC);
4719 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4720 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4721 Constant *Zero = Builder.getInt32(0);
4722 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4723 }
4724 }
4725 }
4726
4727 for (unsigned Part = 0; Part < State.UF; ++Part) {
4728 // This is phase one of vectorizing PHIs.
4729 Value *EntryPart = PHINode::Create(
4730 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4731 State.set(PhiR, EntryPart, Part);
4732 if (StartV) {
4733 // Make sure to add the reduction start value only to the
4734 // first unroll part.
4735 Value *StartVal = (Part == 0) ? StartV : Iden;
4736 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4737 }
4738 }
4739 return;
4740 }
4741
4742  assert(!Legal->isReductionVariable(P) &&
4743         "reductions should be handled above");
4744
4745 setDebugLocFromInst(Builder, P);
4746
4747 // This PHINode must be an induction variable.
4748 // Make sure that we know about it.
4749  assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4750
4751 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4752 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4753
4754 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4755 // which can be found from the original scalar operations.
4756 switch (II.getKind()) {
4757 case InductionDescriptor::IK_NoInduction:
4758    llvm_unreachable("Unknown induction");
4759 case InductionDescriptor::IK_IntInduction:
4760 case InductionDescriptor::IK_FpInduction:
4761    llvm_unreachable("Integer/fp induction is handled elsewhere.");
4762 case InductionDescriptor::IK_PtrInduction: {
4763 // Handle the pointer induction variable case.
4764    assert(P->getType()->isPointerTy() && "Unexpected type.");
4765    assert(!VF.isScalable() && "Currently unsupported for scalable vectors");
4766
4767 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4768 // This is the normalized GEP that starts counting at zero.
4769 Value *PtrInd =
4770 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4771 // Determine the number of scalars we need to generate for each unroll
4772 // iteration. If the instruction is uniform, we only need to generate the
4773 // first lane. Otherwise, we generate all VF values.
4774 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4775 ? 1
4776 : State.VF.getKnownMinValue();
4777 for (unsigned Part = 0; Part < UF; ++Part) {
4778 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4779 Constant *Idx = ConstantInt::get(
4780 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4781 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4782 Value *SclrGep =
4783 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4784 SclrGep->setName("next.gep");
4785 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4786 }
4787 }
4788 return;
4789 }
4790    assert(isa<SCEVConstant>(II.getStep()) &&
4791           "Induction step not a SCEV constant!");
4792 Type *PhiType = II.getStep()->getType();
4793
4794 // Build a pointer phi
4795 Value *ScalarStartValue = II.getStartValue();
4796 Type *ScStValueType = ScalarStartValue->getType();
4797 PHINode *NewPointerPhi =
4798 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4799 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4800
4801 // A pointer induction, performed by using a gep
4802 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4803 Instruction *InductionLoc = LoopLatch->getTerminator();
4804 const SCEV *ScalarStep = II.getStep();
4805 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4806 Value *ScalarStepValue =
4807 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4808 Value *InductionGEP = GetElementPtrInst::Create(
4809 ScStValueType->getPointerElementType(), NewPointerPhi,
4810 Builder.CreateMul(
4811 ScalarStepValue,
4812 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4813 "ptr.ind", InductionLoc);
4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4815
4816 // Create UF many actual address geps that use the pointer
4817 // phi as base and a vectorized version of the step value
4818 // (<step*0, ..., step*N>) as offset.
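    // Editor's note (worked example, not in the original source): for VF = 4,
    // UF = 2 and scalar step S, part 0 uses the offsets <0, S, 2*S, 3*S> and
    // part 1 uses <4*S, 5*S, 6*S, 7*S>, while the pointer phi itself advances by
    // 8 * S each vector iteration via the "ptr.ind" GEP created above.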
4819 for (unsigned Part = 0; Part < State.UF; ++Part) {
4820 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4821 Value *StartOffset =
4822 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue());
4823      // Create a vector of consecutive numbers from zero to VF - 1.
4824 StartOffset =
4825 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4826
4827 Value *GEP = Builder.CreateGEP(
4828 ScStValueType->getPointerElementType(), NewPointerPhi,
4829 Builder.CreateMul(StartOffset,
4830 Builder.CreateVectorSplat(
4831 State.VF.getKnownMinValue(), ScalarStepValue),
4832 "vector.gep"));
4833 State.set(PhiR, GEP, Part);
4834 }
4835 }
4836 }
4837}
4838
4839/// A helper function for checking whether an integer division-related
4840/// instruction may divide by zero (in which case it must be predicated if
4841/// executed conditionally in the scalar code).
4842/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4843/// Non-zero divisors that are non compile-time constants will not be
4844/// converted into multiplication, so we will still end up scalarizing
4845/// the division, but can do so w/o predication.
4846static bool mayDivideByZero(Instruction &I) {
4847  assert((I.getOpcode() == Instruction::UDiv ||
4848          I.getOpcode() == Instruction::SDiv ||
4849          I.getOpcode() == Instruction::URem ||
4850          I.getOpcode() == Instruction::SRem) &&
4851         "Unexpected instruction");
4852 Value *Divisor = I.getOperand(1);
4853 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4854 return !CInt || CInt->isZero();
4855}
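// Editor's note (usage example, not in the original source): for `udiv %x, %n`
// with a non-constant divisor %n this returns true, so the scalarized division
// must be predicated when it executes conditionally; for `udiv %x, 4` it returns
// false and the division can be scalarized without predication.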
4856
4857void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4858 VPUser &User,
4859 VPTransformState &State) {
4860 switch (I.getOpcode()) {
4861 case Instruction::Call:
4862 case Instruction::Br:
4863 case Instruction::PHI:
4864 case Instruction::GetElementPtr:
4865 case Instruction::Select:
4866    llvm_unreachable("This instruction is handled by a different recipe.");
4867 case Instruction::UDiv:
4868 case Instruction::SDiv:
4869 case Instruction::SRem:
4870 case Instruction::URem:
4871 case Instruction::Add:
4872 case Instruction::FAdd:
4873 case Instruction::Sub:
4874 case Instruction::FSub:
4875 case Instruction::FNeg:
4876 case Instruction::Mul:
4877 case Instruction::FMul:
4878 case Instruction::FDiv:
4879 case Instruction::FRem:
4880 case Instruction::Shl:
4881 case Instruction::LShr:
4882 case Instruction::AShr:
4883 case Instruction::And:
4884 case Instruction::Or:
4885 case Instruction::Xor: {
4886 // Just widen unops and binops.
4887 setDebugLocFromInst(Builder, &I);
4888
4889 for (unsigned Part = 0; Part < UF; ++Part) {
4890 SmallVector<Value *, 2> Ops;
4891 for (VPValue *VPOp : User.operands())
4892 Ops.push_back(State.get(VPOp, Part));
4893
4894 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4895
4896 if (auto *VecOp = dyn_cast<Instruction>(V))
4897 VecOp->copyIRFlags(&I);
4898
4899 // Use this vector value for all users of the original instruction.
4900 State.set(Def, V, Part);
4901 addMetadata(V, &I);
4902 }
4903
4904 break;
4905 }
4906 case Instruction::ICmp:
4907 case Instruction::FCmp: {
4908 // Widen compares. Generate vector compares.
4909 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4910 auto *Cmp = cast<CmpInst>(&I);
4911 setDebugLocFromInst(Builder, Cmp);
4912 for (unsigned Part = 0; Part < UF; ++Part) {
4913 Value *A = State.get(User.getOperand(0), Part);
4914 Value *B = State.get(User.getOperand(1), Part);
4915 Value *C = nullptr;
4916 if (FCmp) {
4917 // Propagate fast math flags.
4918 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4919 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4920 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4921 } else {
4922 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4923 }
4924 State.set(Def, C, Part);
4925 addMetadata(C, &I);
4926 }
4927
4928 break;
4929 }
4930
4931 case Instruction::ZExt:
4932 case Instruction::SExt:
4933 case Instruction::FPToUI:
4934 case Instruction::FPToSI:
4935 case Instruction::FPExt:
4936 case Instruction::PtrToInt:
4937 case Instruction::IntToPtr:
4938 case Instruction::SIToFP:
4939 case Instruction::UIToFP:
4940 case Instruction::Trunc:
4941 case Instruction::FPTrunc:
4942 case Instruction::BitCast: {
4943 auto *CI = cast<CastInst>(&I);
4944 setDebugLocFromInst(Builder, CI);
4945
4946 /// Vectorize casts.
4947 Type *DestTy =
4948 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4949
4950 for (unsigned Part = 0; Part < UF; ++Part) {
4951 Value *A = State.get(User.getOperand(0), Part);
4952 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4953 State.set(Def, Cast, Part);
4954 addMetadata(Cast, &I);
4955 }
4956 break;
4957 }
4958 default:
4959 // This instruction is not vectorized by simple widening.
4960    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4961    llvm_unreachable("Unhandled instruction!");
4962 } // end of switch.
4963}
4964
4965void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4966 VPUser &ArgOperands,
4967 VPTransformState &State) {
4968  assert(!isa<DbgInfoIntrinsic>(I) &&
4969         "DbgInfoIntrinsic should have been dropped during VPlan construction");
4970 setDebugLocFromInst(Builder, &I);
4971
4972 Module *M = I.getParent()->getParent()->getParent();
4973 auto *CI = cast<CallInst>(&I);
4974
4975 SmallVector<Type *, 4> Tys;
4976 for (Value *ArgOperand : CI->arg_operands())
4977 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4978
4979 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4980
4981 // The flag shows whether we use an intrinsic or a regular call for the
4982 // vectorized version of the instruction.
4983 // Is it beneficial to perform the intrinsic call compared to a lib call?
4984 bool NeedToScalarize = false;
4985 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4986 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4987 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4988 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4989 "Instruction should be scalarized elsewhere.");
4990 assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4991 "Either the intrinsic cost or vector call cost must be valid");
4992
4993 for (unsigned Part = 0; Part < UF; ++Part) {
4994 SmallVector<Value *, 4> Args;
4995 for (auto &I : enumerate(ArgOperands.operands())) {
4996 // Some intrinsics have a scalar argument - don't replace it with a
4997 // vector.
4998 Value *Arg;
4999 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5000 Arg = State.get(I.value(), Part);
5001 else
5002 Arg = State.get(I.value(), VPIteration(0, 0));
5003 Args.push_back(Arg);
5004 }
5005
5006 Function *VectorF;
5007 if (UseVectorIntrinsic) {
5008 // Use vector version of the intrinsic.
5009 Type *TysForDecl[] = {CI->getType()};
5010 if (VF.isVector())
5011 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5012 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5013 assert(VectorF && "Can't retrieve vector intrinsic.");
5014 } else {
5015 // Use vector version of the function call.
5016 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5017#ifndef NDEBUG
5018 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5019 "Can't create vector function.");
5020#endif
5021 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5022 }
5023 SmallVector<OperandBundleDef, 1> OpBundles;
5024 CI->getOperandBundlesAsDefs(OpBundles);
5025 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5026
5027 if (isa<FPMathOperator>(V))
5028 V->copyFastMathFlags(CI);
5029
5030 State.set(Def, V, Part);
5031 addMetadata(V, &I);
5032 }
5033}
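
The UseVectorIntrinsic decision above boils down to a cost comparison between a vector intrinsic (when the call maps to one) and a vectorized library function. A hedged sketch of that choice, with hypothetical names standing in for the cost-model interface:

// Illustrative only; not the cost model's real interface.
enum class CallLowering { VectorIntrinsic, VectorLibCall };

static CallLowering chooseCallLowering(bool HasIntrinsicID,
                                       unsigned IntrinsicCost,
                                       unsigned LibCallCost) {
  // Prefer the intrinsic whenever one exists and is no more expensive,
  // mirroring 'bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost'.
  if (HasIntrinsicID && IntrinsicCost <= LibCallCost)
    return CallLowering::VectorIntrinsic;
  return CallLowering::VectorLibCall;
}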
5034
5035void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5036 VPUser &Operands,
5037 bool InvariantCond,
5038 VPTransformState &State) {
5039 setDebugLocFromInst(Builder, &I);
5040
5041 // The condition can be loop invariant but still defined inside the
5042 // loop. This means that we can't just use the original 'cond' value.
5043 // We have to take the 'vectorized' value and pick the first lane.
5044 // Instcombine will make this a no-op.
5045 auto *InvarCond = InvariantCond
5046 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5047 : nullptr;
5048
5049 for (unsigned Part = 0; Part < UF; ++Part) {
5050 Value *Cond =
5051 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5052 Value *Op0 = State.get(Operands.getOperand(1), Part);
5053 Value *Op1 = State.get(Operands.getOperand(2), Part);
5054 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5055 State.set(VPDef, Sel, Part);
5056 addMetadata(Sel, &I);
5057 }
5058}
5059
5060void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5061 // We should not collect Scalars more than once per VF. Right now, this
5062 // function is called from collectUniformsAndScalars(), which already does
5063 // this check. Collecting Scalars for VF=1 does not make any sense.
5064 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5065 "This function should not be visited twice for the same VF");
5066
5067 SmallSetVector<Instruction *, 8> Worklist;
5068
5069 // These sets are used to seed the analysis with pointers used by memory
5070 // accesses that will remain scalar.
5071 SmallSetVector<Instruction *, 8> ScalarPtrs;
5072 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5073 auto *Latch = TheLoop->getLoopLatch();
5074
5075 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5076 // The pointer operands of loads and stores will be scalar as long as the
5077 // memory access is not a gather or scatter operation. The value operand of a
5078 // store will remain scalar if the store is scalarized.
5079 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5080 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5081 assert(WideningDecision != CM_Unknown &&
5082 "Widening decision should be ready at this moment");
5083 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5084 if (Ptr == Store->getValueOperand())
5085 return WideningDecision == CM_Scalarize;
5086 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5087 "Ptr is neither a value or pointer operand");
5088 return WideningDecision != CM_GatherScatter;
5089 };
5090
5091 // A helper that returns true if the given value is a bitcast or
5092 // getelementptr instruction contained in the loop.
5093 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5094 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5095 isa<GetElementPtrInst>(V)) &&
5096 !TheLoop->isLoopInvariant(V);
5097 };
5098
5099 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5100 if (!isa<PHINode>(Ptr) ||
5101 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5102 return false;
5103 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5104 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5105 return false;
5106 return isScalarUse(MemAccess, Ptr);
5107 };
5108
5109 // A helper that evaluates a memory access's use of a pointer. If the
5110 // pointer is actually the pointer induction of a loop, it is inserted
5111 // into Worklist. If the use will be a scalar use, and the
5112 // pointer is only used by memory accesses, we place the pointer in
5113 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5114 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5115 if (isScalarPtrInduction(MemAccess, Ptr)) {
5116 Worklist.insert(cast<Instruction>(Ptr));
5117 Instruction *Update = cast<Instruction>(
5118 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5119 Worklist.insert(Update);
5120 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5121 << "\n");
5122 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5123 << "\n");
5124 return;
5125 }
5126 // We only care about bitcast and getelementptr instructions contained in
5127 // the loop.
5128 if (!isLoopVaryingBitCastOrGEP(Ptr))
5129 return;
5130
5131 // If the pointer has already been identified as scalar (e.g., if it was
5132 // also identified as uniform), there's nothing to do.
5133 auto *I = cast<Instruction>(Ptr);
5134 if (Worklist.count(I))
5135 return;
5136
5137 // If the use of the pointer will be a scalar use, and all users of the
5138 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5139 // place the pointer in PossibleNonScalarPtrs.
5140 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5141 return isa<LoadInst>(U) || isa<StoreInst>(U);
5142 }))
5143 ScalarPtrs.insert(I);
5144 else
5145 PossibleNonScalarPtrs.insert(I);
5146 };
5147
5148 // We seed the scalars analysis with two classes of instructions: (1)
5149 // instructions marked uniform-after-vectorization and (2) bitcast,
5150 // getelementptr and (pointer) phi instructions used by memory accesses
5151 // requiring a scalar use.
5152 //
5153 // (1) Add to the worklist all instructions that have been identified as
5154 // uniform-after-vectorization.
5155 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5156
5157 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5158 // memory accesses requiring a scalar use. The pointer operands of loads and
5159 // stores will be scalar as long as the memory access is not a gather or
5160 // scatter operation. The value operand of a store will remain scalar if the
5161 // store is scalarized.
5162 for (auto *BB : TheLoop->blocks())
5163 for (auto &I : *BB) {
5164 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5165 evaluatePtrUse(Load, Load->getPointerOperand());
5166 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5167 evaluatePtrUse(Store, Store->getPointerOperand());
5168 evaluatePtrUse(Store, Store->getValueOperand());
5169 }
5170 }
5171 for (auto *I : ScalarPtrs)
5172 if (!PossibleNonScalarPtrs.count(I)) {
5173 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5174 Worklist.insert(I);
5175 }
5176
5177 // Insert the forced scalars.
5178 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5179 // induction variable when the PHI user is scalarized.
5180 auto ForcedScalar = ForcedScalars.find(VF);
5181 if (ForcedScalar != ForcedScalars.end())
5182 for (auto *I : ForcedScalar->second)
5183 Worklist.insert(I);
5184
5185 // Expand the worklist by looking through any bitcasts and getelementptr
5186 // instructions we've already identified as scalar. This is similar to the
5187 // expansion step in collectLoopUniforms(); however, here we're only
5188 // expanding to include additional bitcasts and getelementptr instructions.
5189 unsigned Idx = 0;
5190 while (Idx != Worklist.size()) {
5191 Instruction *Dst = Worklist[Idx++];
5192 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5193 continue;
5194 auto *Src = cast<Instruction>(Dst->getOperand(0));
5195 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5196 auto *J = cast<Instruction>(U);
5197 return !TheLoop->contains(J) || Worklist.count(J) ||
5198 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5199 isScalarUse(J, Src));
5200 })) {
5201 Worklist.insert(Src);
5202 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5203 }
5204 }
5205
5206 // An induction variable will remain scalar if all users of the induction
5207 // variable and induction variable update remain scalar.
5208 for (auto &Induction : Legal->getInductionVars()) {
5209 auto *Ind = Induction.first;
5210 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5211
5212 // If tail-folding is applied, the primary induction variable will be used
5213 // to feed a vector compare.
5214 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5215 continue;
5216
5217 // Determine if all users of the induction variable are scalar after
5218 // vectorization.
5219 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5220 auto *I = cast<Instruction>(U);
5221 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5222 });
5223 if (!ScalarInd)
5224 continue;
5225
5226 // Determine if all users of the induction variable update instruction are
5227 // scalar after vectorization.
5228 auto ScalarIndUpdate =
5229 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5230 auto *I = cast<Instruction>(U);
5231 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5232 });
5233 if (!ScalarIndUpdate)
5234 continue;
5235
5236 // The induction variable and its update instruction will remain scalar.
5237 Worklist.insert(Ind);
5238 Worklist.insert(IndUpdate);
5239 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5240 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5241 << "\n");
5242 }
5243
5244 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5245}
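
The expansion step above is a worklist that grows while it is scanned: a loop-varying bitcast or getelementptr operand joins the scalar set once every in-loop user is already known-scalar. A toy version of that grow-while-scanning pattern (the data structures are illustrative, and memory-access uses are folded into the "already scalar" set for brevity):

#include <cstddef>
#include <map>
#include <set>
#include <vector>

// Illustrative only: a worklist indexed while it grows, like the
// 'while (Idx != Worklist.size())' loop above. 'Operand' maps an instruction
// to the one operand that may also become scalar; 'Users' maps an
// instruction to its in-loop users.
static std::set<int>
expandScalarWorklist(std::vector<int> Worklist,
                     const std::map<int, int> &Operand,
                     const std::map<int, std::vector<int>> &Users) {
  std::set<int> Scalar(Worklist.begin(), Worklist.end());
  for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    auto Op = Operand.find(Worklist[Idx]);
    if (Op == Operand.end() || Scalar.count(Op->second))
      continue;
    auto UI = Users.find(Op->second);
    bool AllUsersScalar = UI != Users.end();
    if (AllUsersScalar)
      for (int U : UI->second)
        if (!Scalar.count(U)) {
          AllUsersScalar = false;
          break;
        }
    if (AllUsersScalar) {
      Scalar.insert(Op->second);
      Worklist.push_back(Op->second);
    }
  }
  return Scalar;
}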
5246
5247bool LoopVectorizationCostModel::isScalarWithPredication(
5248 Instruction *I, ElementCount VF) const {
5249 if (!blockNeedsPredication(I->getParent()))
5250 return false;
5251 switch(I->getOpcode()) {
5252 default:
5253 break;
5254 case Instruction::Load:
5255 case Instruction::Store: {
5256 if (!Legal->isMaskRequired(I))
5257 return false;
5258 auto *Ptr = getLoadStorePointerOperand(I);
5259 auto *Ty = getMemInstValueType(I);
5260 // We have already decided how to vectorize this instruction, get that
5261 // result.
5262 if (VF.isVector()) {
5263 InstWidening WideningDecision = getWideningDecision(I, VF);
5264 assert(WideningDecision != CM_Unknown &&
5265 "Widening decision should be ready at this moment");
5266 return WideningDecision == CM_Scalarize;
5267 }
5268 const Align Alignment = getLoadStoreAlignment(I);
5269 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5270 isLegalMaskedGather(Ty, Alignment))
5271 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5272 isLegalMaskedScatter(Ty, Alignment));
5273 }
5274 case Instruction::UDiv:
5275 case Instruction::SDiv:
5276 case Instruction::SRem:
5277 case Instruction::URem:
5278 return mayDivideByZero(*I);
5279 }
5280 return false;
5281}
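
The UDiv/SDiv/SRem/URem cases exist because an unconditionally widened division could trap on lanes that the predicate masks off. A small illustrative loop, not taken from this file, where the division must stay scalar with predication:

// Illustrative only: if the division below were widened across all lanes,
// lanes with b[i] == 0 would divide by zero even though the guarding branch
// never executes them, so it is kept scalar and predicated instead.
static void guardedDivide(int *a, const int *b, const int *c, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = c[i] / b[i];
}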
5282
5283bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5284 Instruction *I, ElementCount VF) {
5285 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5286 assert(getWideningDecision(I, VF) == CM_Unknown &&
5287 "Decision should not be set yet.");
5288 auto *Group = getInterleavedAccessGroup(I);
5289 assert(Group && "Must have a group.");
5290
5291 // If the instruction's allocated size doesn't equal its type size, it
5292 // requires padding and will be scalarized.
5293 auto &DL = I->getModule()->getDataLayout();
5294 auto *ScalarTy = getMemInstValueType(I);
5295 if (hasIrregularType(ScalarTy, DL))
5296 return false;
5297
5298 // Check if masking is required.
5299 // A Group may need masking for one of two reasons: it resides in a block that
5300 // needs predication, or it was decided to use masking to deal with gaps.
5301 bool PredicatedAccessRequiresMasking =
5302 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5303 bool AccessWithGapsRequiresMasking =
5304 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5305 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5306 return true;
5307
5308 // If masked interleaving is required, we expect that the user/target had
5309 // enabled it, because otherwise it either wouldn't have been created or
5310 // it should have been invalidated by the CostModel.
5311 assert(useMaskedInterleavedAccesses(TTI) &&
5312 "Masked interleave-groups for predicated accesses are not enabled.");
5313
5314 auto *Ty = getMemInstValueType(I);
5315 const Align Alignment = getLoadStoreAlignment(I);
5316 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5317 : TTI.isLegalMaskedStore(Ty, Alignment);
5318}
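
Stripped of the cost-model plumbing, the masking logic above reduces to a few booleans. A sketch under that simplification, with illustrative parameter names (the irregular-type early-out is omitted):

// Illustrative only: an interleave group can be widened if it needs no mask
// at all, or if the target supports the required masked load/store.
static bool canWidenInterleavedGroup(bool BlockNeedsPredication,
                                     bool MaskRequired, bool GroupHasGaps,
                                     bool ScalarEpilogueAllowed,
                                     bool TargetSupportsMaskedAccess) {
  bool PredicatedNeedsMask = BlockNeedsPredication && MaskRequired;
  bool GapsNeedMask = GroupHasGaps && !ScalarEpilogueAllowed;
  if (!PredicatedNeedsMask && !GapsNeedMask)
    return true;
  return TargetSupportsMaskedAccess;
}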
5319
5320bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5321 Instruction *I, ElementCount VF) {
5322 // Get and ensure we have a valid memory instruction.
5323 LoadInst *LI = dyn_cast<LoadInst>(I);
5324 StoreInst *SI = dyn_cast<StoreInst>(I);
5325 assert((LI || SI) && "Invalid memory instruction");
5326
5327 auto *Ptr = getLoadStorePointerOperand(I);
5328
5329 // In order to be widened, the pointer should be consecutive, first of all.
5330 if (!Legal->isConsecutivePtr(Ptr))
5331 return false;
5332
5333 // If the instruction is a store located in a predicated block, it will be
5334 // scalarized.
5335 if (isScalarWithPredication(I))
5336 return false;
5337
5338 // If the instruction's allocated size doesn't equal its type size, it
5339 // requires padding and will be scalarized.
5340 auto &DL = I->getModule()->getDataLayout();
5341 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5342 if (hasIrregularType(ScalarTy, DL))
5343 return false;
5344
5345 return true;
5346}
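
The three early-outs above form a simple conjunction: a consecutive pointer, no scalarization under predication, and a regular, padding-free element type. Restated as a sketch with illustrative predicate names:

// Illustrative only: a load or store can be widened into a consecutive
// vector access iff all three conditions hold.
static bool canWidenMemoryInstruction(bool PtrIsConsecutive,
                                      bool ScalarizedUnderPredication,
                                      bool TypeHasIrregularSize) {
  return PtrIsConsecutive && !ScalarizedUnderPredication &&
         !TypeHasIrregularSize;
}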
5347
5348void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5349 // We should not collect Uniforms more than once per VF. Right now,
5350 // this function is called from collectUniformsAndScalars(), which
5351 // already does this check. Collecting Uniforms for VF=1 does not make any
5352 // sense.
5353
5354 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5355 "This function should not be visited twice for the same VF");
5356
5357 // Visit the list of Uniforms. If we do not find any uniform value, we will
5358 // not analyze it again. Uniforms.count(VF) will return 1.
5359 Uniforms[VF].clear();
5360
5361 // We now know that the loop is vectorizable!
5362 // Collect instructions inside the loop that will remain uniform after
5363 // vectorization.
5364
5365 // Global values, params and instructions outside of current loop are out of
5366 // scope.
5367 auto isOutOfScope = [&](Value *V) -> bool {
5368 Instruction *I = dyn_cast<Instruction>(V);
5369 return (!I || !TheLoop->contains(I));
5370 };
5371
5372 SetVector<Instruction *> Worklist;
5373 BasicBlock *Latch = TheLoop->getLoopLatch();
5374
5375 // Instructions that are scalar with predication must not be considered
5376 // uniform after vectorization, because that would create an erroneous
5377 // replicating region where only a single instance out of VF should be formed.
5378 // TODO: optimize such seldom cases if found important, see PR40816.
5379 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5380 if (isOutOfScope(I)) {
5381 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5382 << *I << "\n");
5383 return;
5384 }
5385 if (isScalarWithPredication(I, VF)) {
5386 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5387 << *I << "\n");
5388 return;
5389 }
5390 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5391 Worklist.insert(I);
5392 };
5393
5394 // Start with the conditional branch. If the branch condition is an
5395 // instruction contained in the loop that is only used by the branch, it is
5396 // uniform.
5397 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5398 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5399 addToWorklistIfAllowed(Cmp);
5400
5401 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5402 InstWidening WideningDecision = getWideningDecision(I, VF);
5403 assert(WideningDecision != CM_Unknown &&
5404 "Widening decision should be ready at this moment");
5405
5406 // A uniform memory op is itself uniform. We exclude uniform stores
5407 // here as they demand the last lane, not the first one.
5408 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5409 assert(WideningDecision == CM_Scalarize);
5410 return true;
5411 }
5412
5413 return (WideningDecision == CM_Widen ||
5414 WideningDecision == CM_Widen_Reverse ||
5415 WideningDecision == CM_Interleave);
5416 };
5417
5418
5419 // Returns true if Ptr is the pointer operand of a memory access instruction
5420 // I, and I is known to not require scalarization.
5421 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5422 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5423 };
5424
5425 // Holds a list of values which are known to have at least one uniform use.
5426 // Note that there may be other uses which aren't uniform. A "uniform use"
5427 // here is something which only demands lane 0 of the unrolled iterations;
5428 // it does not imply that all lanes produce the same value (e.g. this is not
5429 // the usual meaning of uniform)
5430 SetVector<Value *> HasUniformUse;
5431
5432 // Scan the loop for instructions which are either a) known to have only
5433 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5434 for (auto *BB : TheLoop->blocks())
5435 for (auto &I : *BB) {
5436 // If there's no pointer operand, there's nothing to do.
5437 auto *Ptr = getLoadStorePointerOperand(&I);
5438 if (!Ptr)
5439 continue;
5440
5441 // A uniform memory op is itself uniform. We exclude uniform stores
5442 // here as they demand the last lane, not the first one.
5443 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5444 addToWorklistIfAllowed(&I);
5445
5446 if (isUniformDecision(&I, VF)) {
5447 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5448 HasUniformUse.insert(Ptr);
5449 }
5450 }
5451
5452 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5453 // demanding) users. Since loops are assumed to be in LCSSA form, this
5454 // disallows uses outside the loop as well.
5455 for (auto *V : HasUniformUse) {
5456 if (isOutOfScope(V))
5457 continue;
5458 auto *I = cast<Instruction>(V);
5459 auto UsersAreMemAccesses =
5460 llvm::all_of(I->users(), [&](User *U) -> bool {
5461 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5462 });
5463 if (UsersAreMemAccesses)
5464 addToWorklistIfAllowed(I);
5465 }
5466
5467 // Expand Worklist in topological order: whenever a new instruction
5468 // is added, its users should already be inside Worklist. This ensures
5469 // a uniform instruction will only be used by uniform instructions.
5470 unsigned idx = 0;
5471 while (idx != Worklist.size()) {
5472 Instruction *I = Worklist[idx++];
5473
5474 for (auto OV : I->operand_values()) {
5475 // isOutOfScope operands cannot be uniform instructions.
5476 if (isOutOfScope(OV))
5477 continue;
5478 // First order recurrence Phi's should typically be considered
5479 // non-uniform.
5480 auto *OP = dyn_cast<PHINode>(OV);
5481 if (OP && Legal->isFirstOrderRecurrence(OP))
5482 continue;
5483 // If all the users of the operand are uniform, then add the
5484 // operand into the uniform worklist.
5485 auto *OI = cast<Instruction>(OV);
5486 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5487 auto *J = cast<Instruction>(U);
5488 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5489 }))
5490 addToWorklistIfAllowed(OI);
5491 }
5492 }
5493
5494 // For an instruction to be added into Worklist above, all its users inside
5495 // the loop should also be in Worklist. However, this condition cannot be
5496 // true for phi nodes that form a cyclic dependence. We must process phi
5497 // nodes separately. An induction variable will remain uniform if all users
5498 // of the induction variable and induction variable update remain uniform.
5499 // The code below handles both pointer and non-pointer induction variables.
5500 for (auto &Induction : Legal->getInductionVars()) {
5501 auto *Ind = Induction.first;
5502 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5503
5504 // Determine if all users of the induction variable are uniform after
5505 // vectorization.
5506 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5507 auto *I = cast<Instruction>(U);
5508 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5509 isVectorizedMemAccessUse(I, Ind);
5510 });
5511 if (!UniformInd)
5512 continue;
5513
5514 // Determine if all users of the induction variable update instruction are
5515 // uniform after vectorization.
5516 auto UniformIndUpdate =
5517 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5518 auto *I = cast<Instruction>(U);
5519 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5520 isVectorizedMemAccessUse(I, IndUpdate);
5521 });
5522 if (!UniformIndUpdate)
5523 continue;
5524
5525 // The induction variable and its update instruction will remain uniform.
5526 addToWorklistIfAllowed(Ind);
5527 addToWorklistIfAllowed(IndUpdate);
5528 }
5529
5530 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5531}
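
The phi handling at the end follows one rule: an induction phi and its update stay uniform only if each one's in-loop users are the other, already-uniform instructions, or vectorized memory accesses that need only lane 0 of the address. A compact sketch of that mutual check over plain integers (container choices are illustrative, and the memory-access case is folded into the uniform set):

#include <set>
#include <vector>

// Illustrative only: 'Users' holds the in-loop users of the induction phi
// (or of its update); 'Partner' is the other half of the phi/update pair.
static bool allUsersUniform(const std::vector<int> &Users, int Partner,
                            const std::set<int> &UniformSet) {
  for (int U : Users)
    if (U != Partner && UniformSet.count(U) == 0)
      return false;
  return true;
}

// The phi and its update become uniform only when both directions pass.
static bool inductionPairStaysUniform(const std::vector<int> &IndUsers,
                                      const std::vector<int> &UpdateUsers,
                                      int Ind, int IndUpdate,
                                      const std::set<int> &UniformSet) {
  return allUsersUniform(IndUsers, IndUpdate, UniformSet) &&
         allUsersUniform(UpdateUsers, Ind, UniformSet);
}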
5532
5533bool LoopVectorizationCostModel::runtimeChecksRequired() {
5534 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5535
5536 if (Legal->getRuntimePointerChecking()->Need) {
5537 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5538 "runtime pointer checks needed. Enable vectorization of this "
5539 "loop with '#pragma clang loop vectorize(enable)' when "
5540 "compiling with -Os/-Oz",
5541 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5542 return true;
5543 }
5544
5545 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5546 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5547 "runtime SCEV checks needed. Enable vectorization of this "
5548 "loop with '#pragma clang loop vectorize(enable)' when "
5549 "compiling with -Os/-Oz",
5550 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5551 return true;
5552 }
5553
5554 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5555 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5556 reportVectorizationFailure("Runtime stride check for small trip count",
5557 "runtime stride == 1 checks needed. Enable vectorization of "
5558 "this loop without such check by compiling with -Os/-Oz",
5559 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5560 return true;
5561 }
5562
5563 return false;
5564}
5565
5566Optional<ElementCount>
5567LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5568 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5569 // TODO: It may be useful to do this, since it's still likely to be dynamically
5570 // uniform if the target can skip.
5571 reportVectorizationFailure(
5572 "Not inserting runtime ptr check for divergent target",
5573 "runtime pointer checks needed. Not enabled for divergent target",
5574 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5575 return None;
5576 }
5577
5578 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5579 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5580 if (TC == 1) {
5581 reportVectorizationFailure("Single iteration (non) loop",
5582 "loop trip count is one, irrelevant for vectorization",
5583 "SingleIterationLoop", ORE, TheLoop);
5584 return None;
5585 }
5586
5587 switch (ScalarEpilogueStatus) {
5588 case CM_ScalarEpilogueAllowed:
5589 return computeFeasibleMaxVF(TC, UserVF);
5590 case CM_ScalarEpilogueNotAllowedUsePredicate:
5591 LLVM_FALLTHROUGH;
5592 case CM_ScalarEpilogueNotNeededUsePredicate:
5593 LLVM_DEBUG(
5594 dbgs() << "LV: vector predicate hint/switch found.\n"
5595 << "LV: Not allowing scalar epilogue, creating predicated "
5596 << "vector loop.\n");
5597 break;
5598 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5599 // fallthrough as a special case of OptForSize
5600 case CM_ScalarEpilogueNotAllowedOptSize:
5601 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5602 LLVM_DEBUG(
5603 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5604 else
5605 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5606 << "count.\n");
5607
5608 // Bail if runtime checks are required, which are not good when optimising
5609 // for size.
5610 if (runtimeChecksRequired())
5611 return None;
5612
5613 break;
5614 }
5615
5616 // The only loops we can vectorize without a scalar epilogue, are loops with
5617 // a bottom-test and a single exiting block. We'd have to handle the fact
5618 // that not every instruction executes on the last iteration. This will
5619 // require a lane mask which varies through the vector loop body. (TODO)
5620 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5621 // If there was a tail-folding hint/switch, but we can't fold the tail by
5622 // masking, fallback to a vectorization with a scalar epilogue.
5623 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5624 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5625 "scalar epilogue instead.\n");
5626 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5627 return computeFeasibleMaxVF(TC, UserVF);
5628 }
5629 return None;
5630 }
5631
5632 // Now try the tail folding
5633
5634 // Invalidate interleave groups that require an epilogue if we can't mask
5635 // the interleave-group.
5636 if (!useMaskedInterleavedAccesses(TTI)) {
5637 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5638 "No decisions should have been taken at this point");
5639 // Note: There is no need to invalidate any cost modeling decisions here, as
5640 // none were taken so far.
5641 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5642 }
5643
5644 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5645 assert(!MaxVF.isScalable() &&
5646 "Scalable vectors do not yet support tail folding");
5647 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5648 "MaxVF must be a power of 2");
5649 unsigned MaxVFtimesIC =
5650 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5651 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5652 // choose.
5653 ScalarEvolution *SE = PSE.getSE();
5654 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5655 const SCEV *ExitCount = SE->getAddExpr(
5656 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5657 const SCEV *Rem = SE->getURemExpr(
5658 SE->applyLoopGuards(ExitCount, TheLoop),
5659 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5660 if (Rem->isZero()) {
5661 // Accept MaxVF if we do not have a tail.
5662 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5663 return MaxVF;
5664 }
5665
5666 // If we don't know the precise trip count, or if the trip count that we
5667 // found modulo the vectorization factor is not zero, try to fold the tail
5668 // by masking.
5669 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5670 if (Legal->prepareToFoldTailByMasking()) {
5671 FoldTailByMasking = true;
5672 return MaxVF;
5673 }
5674
5675 // If there was a tail-folding hint/switch, but we can't fold the tail by
5676 // masking, fallback to a vectorization with a scalar epilogue.
5677 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5678 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5679 "scalar epilogue instead.\n");
5680 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5681 return MaxVF;
5682 }
5683
5684 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5685 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5686 return None;
5687 }
5688
5689 if (TC == 0) {
5690 reportVectorizationFailure(
5691 "Unable to calculate the loop count due to complex control flow",
5692 "unable to calculate the loop count due to complex control flow",
5693 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5694 return None;
5695 }
5696
5697 reportVectorizationFailure(
5698 "Cannot optimize for size and vectorize at the same time.",
5699 "cannot optimize for size and vectorize at the same time. "
5700 "Enable vectorization of this loop with '#pragma clang loop "
5701 "vectorize(enable)' when compiling with -Os/-Oz",
5702 "NoTailLoopWithOptForSize", ORE, TheLoop);
5703 return None;
5704}
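
The tail-folding decision above hinges on whether the trip count is an exact multiple of MaxVF times the user's interleave count; the SCEV expression computes exactly that remainder. A toy numeric version of the same check, with plain integers instead of SCEV:

// Illustrative only: no scalar tail remains when TC % (MaxVF * IC) == 0,
// e.g. TC = 128, MaxVF = 8, UserIC = 2 -> 128 % 16 == 0 -> no tail.
static bool noTailRemains(unsigned TripCount, unsigned MaxVF,
                          unsigned UserIC) {
  unsigned MaxVFTimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  return MaxVFTimesIC != 0 && TripCount % MaxVFTimesIC == 0;
}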
5705
5706ElementCount
5707LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5708 ElementCount UserVF) {
5709 bool IgnoreScalableUserVF = UserVF.isScalable() &&
5710 !TTI.supportsScalableVectors() &&
5711 !ForceTargetSupportsScalableVectors;
5712 if (IgnoreScalableUserVF) {
5713 LLVM_DEBUG(
5714 dbgs() << "LV: Ignoring VF=" << UserVF
5715 << " because target does not support scalable vectors.\n");
5716 ORE->emit([&]() {
5717 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
5718 TheLoop->getStartLoc(),
5719 TheLoop->getHeader())
5720 << "Ignoring VF=" << ore::NV("UserVF", UserVF)
5721 << " because target does not support scalable vectors.";
5722 });
5723 }
5724
5725 // Beyond this point two scenarios are handled. If UserVF isn't specified
5726 // then a suitable VF is chosen. If UserVF is specified and there are
5727 // dependencies, check if it's legal. However, if a UserVF is specified and
5728 // there are no dependencies, then there's nothing to do.
5729 if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5730 if (!canVectorizeReductions(UserVF)) {
5731 reportVectorizationFailure(
5732 "LV: Scalable vectorization not supported for the reduction "
5733 "operations found in this loop. Using fixed-width "
5734 "vectorization instead.",
5735 "Scalable vectorization not supported for the reduction operations "
5736 "found in this loop. Using fixed-width vectorization instead.",
5737 "ScalableVFUnfeasible", ORE, TheLoop);
5738 return computeFeasibleMaxVF(
5739 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5740 }
5741
5742 if (Legal->isSafeForAnyVectorWidth())
5743 return UserVF;
5744 }
5745
5746 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5747 unsigned SmallestType, WidestType;
5748 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5749 unsigned WidestRegister =
5750 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
5751 .getFixedSize();
5752
5753 // Get the maximum safe dependence distance in bits computed by LAA.
5754 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5755 // the memory accesses that is most restrictive (involved in the smallest
5756 // dependence distance).
5757 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5758
5759 // If the user vectorization factor is legally unsafe, clamp it to a safe
5760 // value. Otherwise, return as is.
5761 if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5762 unsigned MaxSafeElements =
5763 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5764 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
5765
5766 if (UserVF.isScalable()) {
5767 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5768
5769 // Scale VF by vscale before checking if it's safe.
5770 MaxSafeVF = ElementCount::getScalable(
5771 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5772
5773 if (MaxSafeVF.isZero()) {
5774 // The dependence distance is too small to use scalable vectors,
5775 // fallback on fixed.
5776 LLVM_DEBUG(
5777 dbgs()
5778 << "LV: Max legal vector width too small, scalable vectorization "
5779 "unfeasible. Using fixed-width vectorization instead.\n");
5780 ORE->emit([&]() {
5781 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5782 TheLoop->getStartLoc(),
5783 TheLoop->getHeader())
5784 << "Max legal vector width too small, scalable vectorization "
5785 << "unfeasible. Using fixed-width vectorization instead.";
5786 });
5787 return computeFeasibleMaxVF(
5788 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5789 }
5790 }
5791
5792 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5793
5794 if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5795 return UserVF;
5796
5797 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5798 << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5799 << ".\n");
5800 ORE->emit([&]() {
5801 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5802 TheLoop->getStartLoc(),
5803 TheLoop->getHeader())
5804 << "User-specified vectorization factor "
5805 << ore::NV("UserVectorizationFactor", UserVF)
5806 << " is unsafe, clamping to maximum safe vectorization factor "
5807 << ore::NV("VectorizationFactor", MaxSafeVF);
5808 });
5809 return MaxSafeVF;
5810 }
5811
5812 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5813
5814 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5815 // Note that both WidestRegister and WidestType may not be powers of 2.
5816 auto MaxVectorSize =
5817 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
5818
5819 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5820 << " / " << WidestType << " bits.\n");
5821 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5822 << WidestRegister << " bits.\n");
5823
5824 assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
5825 "Did not expect to pack so many elements"
5826 " into one vector!");
5827 if (MaxVectorSize.getFixedValue() == 0) {
5828 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5829 return ElementCount::getFixed(1);
5830 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
5831 isPowerOf2_32(ConstTripCount)) {
5832 // We need to clamp the VF to be the ConstTripCount. There is no point in
5833 // choosing a higher viable VF as done in the loop below.
5834 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5835 << ConstTripCount << "\n");
5836 return ElementCount::getFixed(ConstTripCount);
5837 }
5838
5839 ElementCount MaxVF = MaxVectorSize;
5840 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5841 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5842 // Collect all viable vectorization factors larger than the default MaxVF
5843 // (i.e. MaxVectorSize).
5844 SmallVector<ElementCount, 8> VFs;
5845 auto MaxVectorSizeMaxBW =
5846 ElementCount::getFixed(WidestRegister / SmallestType);
5847 for (ElementCount VS = MaxVectorSize * 2;
5848 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
5849 VFs.push_back(VS);
5850
5851 // For each VF calculate its register usage.
5852 auto RUs = calculateRegisterUsage(VFs);
5853
5854 // Select the largest VF which doesn't require more registers than existing
5855 // ones.
5856 for (int i = RUs.size() - 1; i >= 0; --i) {
5857 bool Selected = true;
5858 for (auto &pair : RUs[i].MaxLocalUsers) {
5859 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5860 if (pair.second > TargetNumRegisters)
5861 Selected = false;
5862 }
5863 if (Selected) {
5864 MaxVF = VFs[i];
5865 break;
5866 }
5867 }
5868 if (ElementCount MinVF =
5869 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
5870 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5871 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5872 << ") with target's minimum: " << MinVF << '\n');
5873 MaxVF = MinVF;
5874 }
5875 }
5876 }
5877 return MaxVF;
5878}
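Editor's note: the following is an illustrative sketch, not part of the annotated LoopVectorize.cpp source. It shows how the default fixed-width MaxVF above falls out of the register width, the widest element type, and a small power-of-2 constant trip count; all numeric values are hypothetical.

// Illustrative sketch only; WidestRegister, WidestType and ConstTripCount are assumptions.
#include <cstdio>

// Mirrors llvm::PowerOf2Floor for the small values used here.
static unsigned powerOf2Floor(unsigned V) {
  unsigned R = 0;
  for (unsigned P = 1; P && P <= V; P <<= 1)
    R = P;
  return R;
}

int main() {
  unsigned WidestRegister = 256; // e.g. a 256-bit vector register (assumption)
  unsigned WidestType = 48;      // widest element type seen, in bits (assumption)
  unsigned ConstTripCount = 4;   // known loop trip count (assumption)

  // MaxVectorSize must be a power of 2 even if the inputs are not.
  unsigned MaxVectorSize = powerOf2Floor(WidestRegister / WidestType); // 4
  // A small power-of-2 trip count clamps the VF, as in the code above.
  unsigned MaxVF = (ConstTripCount && ConstTripCount < MaxVectorSize)
                       ? ConstTripCount
                       : MaxVectorSize;
  std::printf("MaxVectorSize=%u MaxVF=%u\n", MaxVectorSize, MaxVF); // prints 4 4
  return 0;
}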
5879
5880VectorizationFactor
5881LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5882 // FIXME: This can be fixed for scalable vectors later, because at this stage
5883 // the LoopVectorizer will only consider vectorizing a loop with scalable
5884 // vectors when the loop has a hint to enable vectorization for a given VF.
5885 assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5886
5887 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5888 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5889 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5890
5891 auto Width = ElementCount::getFixed(1);
5892 const float ScalarCost = *ExpectedCost.getValue();
5893 float Cost = ScalarCost;
5894
5895 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5896 if (ForceVectorization && MaxVF.isVector()) {
5897 // Ignore scalar width, because the user explicitly wants vectorization.
5898 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5899 // evaluation.
5900 Cost = std::numeric_limits<float>::max();
5901 }
5902
5903 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
5904 i *= 2) {
5905 // Notice that the vector loop needs to be executed less times, so
5906 // we need to divide the cost of the vector loops by the width of
5907 // the vector elements.
5908 VectorizationCostTy C = expectedCost(i);
5909 assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5910 float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
5911 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5912 << " costs: " << (int)VectorCost << ".\n");
5913 if (!C.second && !ForceVectorization) {
5914 LLVM_DEBUG(
5915 dbgs() << "LV: Not considering vector loop of width " << i
5916 << " because it will not generate any vector instructions.\n");
5917 continue;
5918 }
5919
5920 // If profitable add it to ProfitableVF list.
5921 if (VectorCost < ScalarCost) {
5922 ProfitableVFs.push_back(VectorizationFactor(
5923 {i, (unsigned)VectorCost}));
5924 }
5925
5926 if (VectorCost < Cost) {
5927 Cost = VectorCost;
5928 Width = i;
5929 }
5930 }
5931
5932 if (!EnableCondStoresVectorization && NumPredStores) {
5933 reportVectorizationFailure("There are conditional stores.",
5934 "store that is conditionally executed prevents vectorization",
5935 "ConditionalStore", ORE, TheLoop);
5936 Width = ElementCount::getFixed(1);
5937 Cost = ScalarCost;
5938 }
5939
5940 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
5941 << "LV: Vectorization seems to be not beneficial, "
5942 << "but was forced by a user.\n");
5943 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5944 VectorizationFactor Factor = {Width,
5945 (unsigned)(Width.getKnownMinValue() * Cost)};
5946 return Factor;
5947}
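Editor's note: an illustrative sketch, not part of the annotated source, of the per-width comparison selectVectorizationFactor performs above. The candidate widths and loop costs stand in for expectedCost(VF) and are invented for the example.

// Illustrative sketch only; the candidate costs and ScalarCost are assumptions.
#include <cstdio>

int main() {
  // Hypothetical cost of one vector-loop iteration at each width.
  struct { unsigned Width; float LoopCost; } Candidates[] = {
      {2, 14.0f}, {4, 20.0f}, {8, 44.0f}};
  float ScalarCost = 10.0f; // stand-in for expectedCost(VF=1) (assumption)

  unsigned BestWidth = 1;
  float BestCost = ScalarCost;
  for (auto &C : Candidates) {
    // The vector loop runs Width times fewer iterations, so normalise by Width.
    float PerLaneCost = C.LoopCost / (float)C.Width;
    if (PerLaneCost < BestCost) {
      BestCost = PerLaneCost;
      BestWidth = C.Width;
    }
  }
  std::printf("Selecting VF: %u (cost %.2f vs scalar %.2f)\n",
              BestWidth, BestCost, ScalarCost); // Selecting VF: 4 (cost 5.00 vs scalar 10.00)
  return 0;
}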
5948
5949bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5950 const Loop &L, ElementCount VF) const {
5951 // Cross iteration phis such as reductions need special handling and are
5952 // currently unsupported.
5953 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5954 return Legal->isFirstOrderRecurrence(&Phi) ||
5955 Legal->isReductionVariable(&Phi);
5956 }))
5957 return false;
5958
5959 // Phis with uses outside of the loop require special handling and are
5960 // currently unsupported.
5961 for (auto &Entry : Legal->getInductionVars()) {
5962 // Look for uses of the value of the induction at the last iteration.
5963 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5964 for (User *U : PostInc->users())
5965 if (!L.contains(cast<Instruction>(U)))
5966 return false;
5967 // Look for uses of penultimate value of the induction.
5968 for (User *U : Entry.first->users())
5969 if (!L.contains(cast<Instruction>(U)))
5970 return false;
5971 }
5972
5973 // Induction variables that are widened require special handling that is
5974 // currently not supported.
5975 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5976 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5977 this->isProfitableToScalarize(Entry.first, VF));
5978 }))
5979 return false;
5980
5981 return true;
5982}
5983
5984bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5985 const ElementCount VF) const {
5986 // FIXME: We need a much better cost-model to take different parameters such
5987 // as register pressure, code size increase and cost of extra branches into
5988 // account. For now we apply a very crude heuristic and only consider loops
5989 // with vectorization factors larger than a certain value.
5990 // We also consider epilogue vectorization unprofitable for targets that don't
5991 // consider interleaving beneficial (e.g. MVE).
5992 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5993 return false;
5994 if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5995 return true;
5996 return false;
5997}
5998
5999VectorizationFactor
6000LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6001 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6002 VectorizationFactor Result = VectorizationFactor::Disabled();
6003 if (!EnableEpilogueVectorization) {
6004 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6005 return Result;
6006 }
6007
6008 if (!isScalarEpilogueAllowed()) {
6009 LLVM_DEBUG(
6010 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6011 "allowed.\n";);
6012 return Result;
6013 }
6014
6015 // FIXME: This can be fixed for scalable vectors later, because at this stage
6016 // the LoopVectorizer will only consider vectorizing a loop with scalable
6017 // vectors when the loop has a hint to enable vectorization for a given VF.
6018 if (MainLoopVF.isScalable()) {
6019 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6020 "yet supported.\n");
6021 return Result;
6022 }
6023
6024 // Not really a cost consideration, but check for unsupported cases here to
6025 // simplify the logic.
6026 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6027 LLVM_DEBUG(
6028 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6029 "not a supported candidate.\n";);
6030 return Result;
6031 }
6032
6033 if (EpilogueVectorizationForceVF > 1) {
6034 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6035 if (LVP.hasPlanWithVFs(
6036 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6037 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6038 else {
6039 LLVM_DEBUG(
6040 dbgs()
6041 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6042 return Result;
6043 }
6044 }
6045
6046 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6047 TheLoop->getHeader()->getParent()->hasMinSize()) {
6048 LLVM_DEBUG(
6049 dbgs()
6050 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6051 return Result;
6052 }
6053
6054 if (!isEpilogueVectorizationProfitable(MainLoopVF))
6055 return Result;
6056
6057 for (auto &NextVF : ProfitableVFs)
6058 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6059 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
6060 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6061 Result = NextVF;
6062
6063 if (Result != VectorizationFactor::Disabled())
6064 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6065 << Result.Width.getFixedValue() << "\n";);
6066 return Result;
6067}
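Editor's note: an illustrative sketch, not part of the annotated source, of the final loop in selectEpilogueVectorizationFactor above: it picks the cheapest already-profitable VF that is strictly smaller than the main loop VF. The candidate list and costs are invented, and the VPlan availability check (hasPlanWithVFs) is omitted.

// Illustrative sketch only; MainLoopVF and the ProfitableVFs entries are assumptions.
#include <cstdio>
#include <vector>

struct VFCandidate { unsigned Width; unsigned Cost; };

int main() {
  unsigned MainLoopVF = 8; // assumption
  std::vector<VFCandidate> ProfitableVFs = {{2, 9}, {4, 7}, {8, 6}};

  VFCandidate Result = {1, 0}; // "disabled" placeholder, as in the code above
  for (const VFCandidate &NextVF : ProfitableVFs)
    if (NextVF.Width < MainLoopVF &&
        (Result.Width == 1 || NextVF.Cost < Result.Cost))
      Result = NextVF; // the real code also requires a plan for {MainLoopVF, NextVF.Width}
  std::printf("Epilogue VF = %u (cost %u)\n", Result.Width, Result.Cost); // Epilogue VF = 4 (cost 7)
  return 0;
}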
6068
6069std::pair<unsigned, unsigned>
6070LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6071 unsigned MinWidth = -1U;
6072 unsigned MaxWidth = 8;
6073 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6074
6075 // For each block.
6076 for (BasicBlock *BB : TheLoop->blocks()) {
6077 // For each instruction in the loop.
6078 for (Instruction &I : BB->instructionsWithoutDebug()) {
6079 Type *T = I.getType();
6080
6081 // Skip ignored values.
6082 if (ValuesToIgnore.count(&I))
6083 continue;
6084
6085 // Only examine Loads, Stores and PHINodes.
6086 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6087 continue;
6088
6089 // Examine PHI nodes that are reduction variables. Update the type to
6090 // account for the recurrence type.
6091 if (auto *PN = dyn_cast<PHINode>(&I)) {
6092 if (!Legal->isReductionVariable(PN))
6093 continue;
6094 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
6095 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6096 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6097 RdxDesc.getRecurrenceType(),
6098 TargetTransformInfo::ReductionFlags()))
6099 continue;
6100 T = RdxDesc.getRecurrenceType();
6101 }
6102
6103 // Examine the stored values.
6104 if (auto *ST = dyn_cast<StoreInst>(&I))
6105 T = ST->getValueOperand()->getType();
6106
6107 // Ignore loaded pointer types and stored pointer types that are not
6108 // vectorizable.
6109 //
6110 // FIXME: The check here attempts to predict whether a load or store will
6111 // be vectorized. We only know this for certain after a VF has
6112 // been selected. Here, we assume that if an access can be
6113 // vectorized, it will be. We should also look at extending this
6114 // optimization to non-pointer types.
6115 //
6116 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6117 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6118 continue;
6119
6120 MinWidth = std::min(MinWidth,
6121 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6122 MaxWidth = std::max(MaxWidth,
6123 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6124 }
6125 }
6126
6127 return {MinWidth, MaxWidth};
6128}
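Editor's note: an illustrative sketch, not part of the annotated source, of the min/max scan getSmallestAndWidestTypes performs above. The observed element widths stand in for the loads, stores and reduction phis the real code examines.

// Illustrative sketch only; the ObservedBits values are assumptions.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned MinWidth = -1U; // same sentinel as in the code above
  unsigned MaxWidth = 8;   // same floor as in the code above
  unsigned ObservedBits[] = {32, 8, 64, 16}; // hypothetical scalar type sizes in bits

  for (unsigned Bits : ObservedBits) {
    MinWidth = std::min(MinWidth, Bits);
    MaxWidth = std::max(MaxWidth, Bits);
  }
  std::printf("Smallest/Widest types: %u / %u bits\n", MinWidth, MaxWidth); // 8 / 64
  return 0;
}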
6129
6130unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6131 unsigned LoopCost) {
6132 // -- The interleave heuristics --
6133 // We interleave the loop in order to expose ILP and reduce the loop overhead.
6134 // There are many micro-architectural considerations that we can't predict
6135 // at this level. For example, frontend pressure (on decode or fetch) due to
6136 // code size, or the number and capabilities of the execution ports.
6137 //
6138 // We use the following heuristics to select the interleave count:
6139 // 1. If the code has reductions, then we interleave to break the cross
6140 // iteration dependency.
6141 // 2. If the loop is really small, then we interleave to reduce the loop
6142 // overhead.
6143 // 3. We don't interleave if we think that we will spill registers to memory
6144 // due to the increased register pressure.
6145
6146 if (!isScalarEpilogueAllowed())
6147 return 1;
6148
6149 // We used the distance for the interleave count.
6150 if (Legal->getMaxSafeDepDistBytes() != -1U)
6151 return 1;
6152
6153 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6154 const bool HasReductions = !Legal->getReductionVars().empty();
6155 // Do not interleave loops with a relatively small known or estimated trip
6156 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6157 // enabled and the code has scalar reductions (HasReductions && VF == 1),
6158 // because with the above conditions interleaving can expose ILP and break
6159 // cross iteration dependences for reductions.
6160 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6161 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6162 return 1;
6163
6164 RegisterUsage R = calculateRegisterUsage({VF})[0];
6165 // We divide by these constants so assume that we have at least one
6166 // instruction that uses at least one register.
6167 for (auto& pair : R.MaxLocalUsers) {
6168 pair.second = std::max(pair.second, 1U);
6169 }
6170
6171 // We calculate the interleave count using the following formula.
6172 // Subtract the number of loop invariants from the number of available
6173 // registers. These registers are used by all of the interleaved instances.
6174 // Next, divide the remaining registers by the number of registers that is
6175 // required by the loop, in order to estimate how many parallel instances
6176 // fit without causing spills. All of this is rounded down if necessary to be
6177 // a power of two. We want power of two interleave count to simplify any
6178 // addressing operations or alignment considerations.
6179 // We also want power of two interleave counts to ensure that the induction
6180 // variable of the vector loop wraps to zero, when tail is folded by masking;
6181 // this currently happens when OptForSize, in which case IC is set to 1 above.
6182 unsigned IC = UINT_MAX;
6183
6184 for (auto& pair : R.MaxLocalUsers) {
6185 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6186 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6187 << " registers of "
6188 << TTI.getRegisterClassName(pair.first) << " register class\n");
6189 if (VF.isScalar()) {
6190 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6191 TargetNumRegisters = ForceTargetNumScalarRegs;
6192 } else {
6193 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6194 TargetNumRegisters = ForceTargetNumVectorRegs;
6195 }
6196 unsigned MaxLocalUsers = pair.second;
6197 unsigned LoopInvariantRegs = 0;
6198 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6199 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6200
6201 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6202 // Don't count the induction variable as interleaved.
6203 if (EnableIndVarRegisterHeur) {
6204 TmpIC =
6205 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6206 std::max(1U, (MaxLocalUsers - 1)));
6207 }
6208
6209 IC = std::min(IC, TmpIC);
6210 }
6211
6212 // Clamp the interleave ranges to reasonable counts.
6213 unsigned MaxInterleaveCount =
6214 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6215
6216 // Check if the user has overridden the max.
6217 if (VF.isScalar()) {
6218 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6219 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6220 } else {
6221 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6222 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6223 }
6224
6225 // If trip count is known or estimated compile time constant, limit the
6226 // interleave count to be less than the trip count divided by VF, provided it
6227 // is at least 1.
6228 //
6229 // For scalable vectors we can't know if interleaving is beneficial. It may
6230 // not be beneficial for small loops if none of the lanes in the second vector
6231 // iterations is enabled. However, for larger loops, there is likely to be a
6232 // similar benefit as for fixed-width vectors. For now, we choose to leave
6233 // the InterleaveCount as if vscale is '1', although if some information about
6234 // the vector is known (e.g. min vector size), we can make a better decision.
6235 if (BestKnownTC) {
6236 MaxInterleaveCount =
6237 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6238 // Make sure MaxInterleaveCount is greater than 0.
6239 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6240 }
6241
6242 assert(MaxInterleaveCount > 0 &&
6243 "Maximum interleave count must be greater than 0");
6244
6245 // Clamp the calculated IC to be between 1 and the max interleave count
6246 // that the target and trip count allow.
6247 if (IC > MaxInterleaveCount)
6248 IC = MaxInterleaveCount;
6249 else
6250 // Make sure IC is greater than 0.
6251 IC = std::max(1u, IC);
6252
6253 assert(IC > 0 && "Interleave count must be greater than 0.");
6254
6255 // If we did not calculate the cost for VF (because the user selected the VF)
6256 // then we calculate the cost of VF here.
6257 if (LoopCost == 0) {
6258 assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6259 LoopCost = *expectedCost(VF).first.getValue();
6260 }
6261
6262 assert(LoopCost && "Non-zero loop cost expected");
6263
6264 // Interleave if we vectorized this loop and there is a reduction that could
6265 // benefit from interleaving.
6266 if (VF.isVector() && HasReductions) {
6267 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6268 return IC;
6269 }
6270
6271 // Note that if we've already vectorized the loop we will have done the
6272 // runtime check and so interleaving won't require further checks.
6273 bool InterleavingRequiresRuntimePointerCheck =
6274 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6275
6276 // We want to interleave small loops in order to reduce the loop overhead and
6277 // potentially expose ILP opportunities.
6278 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6279 << "LV: IC is " << IC << '\n'
6280 << "LV: VF is " << VF << '\n');
6281 const bool AggressivelyInterleaveReductions =
6282 TTI.enableAggressiveInterleaving(HasReductions);
6283 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6284 // We assume that the cost overhead is 1 and we use the cost model
6285 // to estimate the cost of the loop and interleave until the cost of the
6286 // loop overhead is about 5% of the cost of the loop.
6287 unsigned SmallIC =
6288 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6289
6290 // Interleave until store/load ports (estimated by max interleave count) are
6291 // saturated.
6292 unsigned NumStores = Legal->getNumStores();
6293 unsigned NumLoads = Legal->getNumLoads();
6294 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6295 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6296
6297 // If we have a scalar reduction (vector reductions are already dealt with
6298 // by this point), we can increase the critical path length if the loop
6299 // we're interleaving is inside another loop. Limit, by default to 2, so the
6300 // critical path only gets increased by one reduction operation.
6301 if (HasReductions && TheLoop->getLoopDepth() > 1) {
6302 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6303 SmallIC = std::min(SmallIC, F);
6304 StoresIC = std::min(StoresIC, F);
6305 LoadsIC = std::min(LoadsIC, F);
6306 }
6307
6308 if (EnableLoadStoreRuntimeInterleave &&
6309 std::max(StoresIC, LoadsIC) > SmallIC) {
6310 LLVM_DEBUG(
6311 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6312 return std::max(StoresIC, LoadsIC);
6313 }
6314
6315 // If there are scalar reductions and TTI has enabled aggressive
6316 // interleaving for reductions, we will interleave to expose ILP.
6317 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6318 AggressivelyInterleaveReductions) {
6319 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6320 // Interleave no less than SmallIC but not as aggressive as the normal IC
6321 // to satisfy the rare situation when resources are too limited.
6322 return std::max(IC / 2, SmallIC);
6323 } else {
6324 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6325 return SmallIC;
6326 }
6327 }
6328
6329 // Interleave if this is a large loop (small loops are already dealt with by
6330 // this point) that could benefit from interleaving.
6331 if (AggressivelyInterleaveReductions) {
6332 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6333 return IC;
6334 }
6335
6336 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6337 return 1;
6338}
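Editor's note: an illustrative sketch, not part of the annotated source, of the register-pressure formula selectInterleaveCount uses above for one register class; all numbers are hypothetical. The EnableIndVarRegisterHeur variant additionally discounts the induction variable, which this sketch omits.

// Illustrative sketch only; TargetNumRegisters, LoopInvariantRegs, MaxLocalUsers
// and MaxInterleaveCount are assumptions.
#include <algorithm>
#include <cstdio>

// Mirrors llvm::PowerOf2Floor for the small values used here.
static unsigned powerOf2Floor(unsigned V) {
  unsigned R = 0;
  for (unsigned P = 1; P && P <= V; P <<= 1)
    R = P;
  return R;
}

int main() {
  unsigned TargetNumRegisters = 32; // assumption
  unsigned LoopInvariantRegs = 3;   // assumption
  unsigned MaxLocalUsers = 5;       // assumption
  unsigned MaxInterleaveCount = 8;  // stand-in for TTI.getMaxInterleaveFactor (assumption)

  // Registers left after loop invariants, divided by per-instance demand,
  // rounded down to a power of two, then clamped to [1, MaxInterleaveCount].
  unsigned IC = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
  IC = std::max(1u, std::min(IC, MaxInterleaveCount));
  std::printf("Interleave count: %u\n", IC); // (32-3)/5 = 5 -> floor pow2 -> 4
  return 0;
}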
6339
6340SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6341LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6342 // This function calculates the register usage by measuring the highest number
6343 // of values that are alive at a single location. Obviously, this is a very
6344 // rough estimation. We scan the loop in topological order and
6345 // assign a number to each instruction. We use RPO to ensure that defs are
6346 // met before their users. We assume that each instruction that has in-loop
6347 // users starts an interval. We record every time that an in-loop value is
6348 // used, so we have a list of the first and last occurrences of each
6349 // instruction. Next, we transpose this data structure into a multi map that
6350 // holds the list of intervals that *end* at a specific location. This multi
6351 // map allows us to perform a linear search. We scan the instructions linearly
6352 // and record each time that a new interval starts, by placing it in a set.
6353 // If we find this value in the multi-map then we remove it from the set.
6354 // The max register usage is the maximum size of the set.
6355 // We also search for instructions that are defined outside the loop, but are
6356 // used inside the loop. We need this number separately from the max-interval
6357 // usage number because when we unroll, loop-invariant values do not take
6358 // more registers.
6359 LoopBlocksDFS DFS(TheLoop);
6360 DFS.perform(LI);
6361
6362 RegisterUsage RU;
6363
6364 // Each 'key' in the map opens a new interval. The values
6365 // of the map are the index of the 'last seen' usage of the
6366 // instruction that is the key.
6367 using IntervalMap = DenseMap<Instruction *, unsigned>;
6368
6369 // Maps instruction to its index.
6370 SmallVector<Instruction *, 64> IdxToInstr;
6371 // Marks the end of each interval.
6372 IntervalMap EndPoint;
6373 // Saves the list of instruction indices that are used in the loop.
6374 SmallPtrSet<Instruction *, 8> Ends;
6375 // Saves the list of values that are used in the loop but are
6376 // defined outside the loop, such as arguments and constants.
6377 SmallPtrSet<Value *, 8> LoopInvariants;
6378
6379 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6380 for (Instruction &I : BB->instructionsWithoutDebug()) {
6381 IdxToInstr.push_back(&I);
6382
6383 // Save the end location of each USE.
6384 for (Value *U : I.operands()) {
6385 auto *Instr = dyn_cast<Instruction>(U);
6386
6387 // Ignore non-instruction values such as arguments, constants, etc.
6388 if (!Instr)
6389 continue;
6390
6391 // If this instruction is outside the loop then record it and continue.
6392 if (!TheLoop->contains(Instr)) {
6393 LoopInvariants.insert(Instr);
6394 continue;
6395 }
6396
6397 // Overwrite previous end points.
6398 EndPoint[Instr] = IdxToInstr.size();
6399 Ends.insert(Instr);
6400 }
6401 }
6402 }
6403
6404 // Saves the list of intervals that end with the index in 'key'.
6405 using InstrList = SmallVector<Instruction *, 2>;
6406 DenseMap<unsigned, InstrList> TransposeEnds;
6407
6408 // Transpose the EndPoints to a list of values that end at each index.
6409 for (auto &Interval : EndPoint)
6410 TransposeEnds[Interval.second].push_back(Interval.first);
6411
6412 SmallPtrSet<Instruction *, 8> OpenIntervals;
6413 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6414 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6415
6416 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6417
6418 // A lambda that gets the register usage for the given type and VF.
6419 const auto &TTICapture = TTI;
6420 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6421 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6422 return 0U;
6423 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6424 };
6425
6426 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6427 Instruction *I = IdxToInstr[i];
6428
6429 // Remove all of the instructions that end at this location.
6430 InstrList &List = TransposeEnds[i];
6431 for (Instruction *ToRemove : List)
6432 OpenIntervals.erase(ToRemove);
6433
6434 // Ignore instructions that are never used within the loop.
6435 if (!Ends.count(I))
6436 continue;
6437
6438 // Skip ignored values.
6439 if (ValuesToIgnore.count(I))
6440 continue;
6441
6442 // For each VF find the maximum usage of registers.
6443 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6444 // Count the number of live intervals.
6445 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6446
6447 if (VFs[j].isScalar()) {
6448 for (auto Inst : OpenIntervals) {
6449 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6450 if (RegUsage.find(ClassID) == RegUsage.end())
6451 RegUsage[ClassID] = 1;
6452 else
6453 RegUsage[ClassID] += 1;
6454 }
6455 } else {
6456 collectUniformsAndScalars(VFs[j]);
6457 for (auto Inst : OpenIntervals) {
6458 // Skip ignored values for VF > 1.
6459 if (VecValuesToIgnore.count(Inst))
6460 continue;
6461 if (isScalarAfterVectorization(Inst, VFs[j])) {
6462 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6463 if (RegUsage.find(ClassID) == RegUsage.end())
6464 RegUsage[ClassID] = 1;
6465 else
6466 RegUsage[ClassID] += 1;
6467 } else {
6468 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6469 if (RegUsage.find(ClassID) == RegUsage.end())
6470 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6471 else
6472 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6473 }
6474 }
6475 }
6476
6477 for (auto& pair : RegUsage) {
6478 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6479 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6480 else
6481 MaxUsages[j][pair.first] = pair.second;
6482 }
6483 }
6484
6485 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6486 << OpenIntervals.size() << '\n');
6487
6488 // Add the current instruction to the list of open intervals.
6489 OpenIntervals.insert(I);
6490 }
6491
6492 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6493 SmallMapVector<unsigned, unsigned, 4> Invariant;
6494
6495 for (auto Inst : LoopInvariants) {
6496 unsigned Usage =
6497 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6498 unsigned ClassID =
6499 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6500 if (Invariant.find(ClassID) == Invariant.end())
6501 Invariant[ClassID] = Usage;
6502 else
6503 Invariant[ClassID] += Usage;
6504 }
6505
6506 LLVM_DEBUG({
6507 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6508 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6509 << " item\n";
6510 for (const auto &pair : MaxUsages[i]) {
6511 dbgs() << "LV(REG): RegisterClass: "
6512 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6513 << " registers\n";
6514 }
6515 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6516 << " item\n";
6517 for (const auto &pair : Invariant) {
6518 dbgs() << "LV(REG): RegisterClass: "
6519 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6520 << " registers\n";
6521 }
6522 });
6523
6524 RU.LoopInvariantRegs = Invariant;
6525 RU.MaxLocalUsers = MaxUsages[i];
6526 RUs[i] = RU;
6527 }
6528
6529 return RUs;
6530}
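Editor's note: an illustrative sketch, not part of the annotated source, of the live-interval counting idea described at the top of calculateRegisterUsage, on a toy straight-line sequence. Each value is live from its definition to its last in-loop use; the peak number of simultaneously open intervals approximates register pressure. The instruction indices are invented.

// Illustrative sketch only; the Intervals entries are assumptions.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Interval { unsigned Def, LastUse; };

int main() {
  // Hypothetical instruction indices in RPO order: definition -> last in-loop use.
  std::vector<Interval> Intervals = {{0, 3}, {1, 2}, {2, 5}, {4, 5}};

  unsigned MaxOpen = 0;
  for (unsigned Idx = 0; Idx <= 5; ++Idx) {
    unsigned Open = 0;
    for (const Interval &I : Intervals)
      if (I.Def <= Idx && Idx < I.LastUse) // an interval closes at its last use
        ++Open;
    MaxOpen = std::max(MaxOpen, Open);
  }
  std::printf("LV(REG): Found max usage: %u registers\n", MaxOpen); // 2
  return 0;
}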
6531
6532bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6533 // TODO: Cost model for emulated masked load/store is completely
6534 // broken. This hack guides the cost model to use an artificially
6535 // high enough value to practically disable vectorization with such
6536 // operations, except where previously deployed legality hack allowed
6537 // using very low cost values. This is to avoid regressions coming simply
6538 // from moving "masked load/store" check from legality to cost model.
6539 // Masked Load/Gather emulation was previously never allowed.
6540 // Limited number of Masked Store/Scatter emulation was allowed.
6541 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6542 return isa<LoadInst>(I) ||
6543 (isa<StoreInst>(I) &&
6544 NumPredStores > NumberOfStoresToPredicate);
6545}
6546
6547void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6548 // If we aren't vectorizing the loop, or if we've already collected the
6549 // instructions to scalarize, there's nothing to do. Collection may already
6550 // have occurred if we have a user-selected VF and are now computing the
6551 // expected cost for interleaving.
6552 if (VF.isScalar() || VF.isZero() ||
6553 InstsToScalarize.find(VF) != InstsToScalarize.end())
6554 return;
6555
6556 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6557 // not profitable to scalarize any instructions, the presence of VF in the
6558 // map will indicate that we've analyzed it already.
6559 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6560
6561 // Find all the instructions that are scalar with predication in the loop and
6562 // determine if it would be better to not if-convert the blocks they are in.
6563 // If so, we also record the instructions to scalarize.
6564 for (BasicBlock *BB : TheLoop->blocks()) {
6565 if (!blockNeedsPredication(BB))
6566 continue;
6567 for (Instruction &I : *BB)
6568 if (isScalarWithPredication(&I)) {
6569 ScalarCostsTy ScalarCosts;
6570 // Do not apply discount logic if hacked cost is needed
6571 // for emulated masked memrefs.
6572 if (!useEmulatedMaskMemRefHack(&I) &&
6573 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6574 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6575 // Remember that BB will remain after vectorization.
6576 PredicatedBBsAfterVectorization.insert(BB);
6577 }
6578 }
6579}
6580
6581int LoopVectorizationCostModel::computePredInstDiscount(
6582 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6583 assert(!isUniformAfterVectorization(PredInst, VF) &&
6584 "Instruction marked uniform-after-vectorization will be predicated");
6585
6586 // Initialize the discount to zero, meaning that the scalar version and the
6587 // vector version cost the same.
6588 InstructionCost Discount = 0;
6589
6590 // Holds instructions to analyze. The instructions we visit are mapped in
6591 // ScalarCosts. Those instructions are the ones that would be scalarized if
6592 // we find that the scalar version costs less.
6593 SmallVector<Instruction *, 8> Worklist;
6594
6595 // Returns true if the given instruction can be scalarized.
6596 auto canBeScalarized = [&](Instruction *I) -> bool {
6597 // We only attempt to scalarize instructions forming a single-use chain
6598 // from the original predicated block that would otherwise be vectorized.
6599 // Although not strictly necessary, we give up on instructions we know will
6600 // already be scalar to avoid traversing chains that are unlikely to be
6601 // beneficial.
6602 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6603 isScalarAfterVectorization(I, VF))
6604 return false;
6605
6606 // If the instruction is scalar with predication, it will be analyzed
6607 // separately. We ignore it within the context of PredInst.
6608 if (isScalarWithPredication(I))
6609 return false;
6610
6611 // If any of the instruction's operands are uniform after vectorization,
6612 // the instruction cannot be scalarized. This prevents, for example, a
6613 // masked load from being scalarized.
6614 //
6615 // We assume we will only emit a value for lane zero of an instruction
6616 // marked uniform after vectorization, rather than VF identical values.
6617 // Thus, if we scalarize an instruction that uses a uniform, we would
6618 // create uses of values corresponding to the lanes we aren't emitting code
6619 // for. This behavior can be changed by allowing getScalarValue to clone
6620 // the lane zero values for uniforms rather than asserting.
6621 for (Use &U : I->operands())
6622 if (auto *J = dyn_cast<Instruction>(U.get()))
6623 if (isUniformAfterVectorization(J, VF))
6624 return false;
6625
6626 // Otherwise, we can scalarize the instruction.
6627 return true;
6628 };
6629
6630 // Compute the expected cost discount from scalarizing the entire expression
6631 // feeding the predicated instruction. We currently only consider expressions
6632 // that are single-use instruction chains.
6633 Worklist.push_back(PredInst);
6634 while (!Worklist.empty()) {
6635 Instruction *I = Worklist.pop_back_val();
6636
6637 // If we've already analyzed the instruction, there's nothing to do.
6638 if (ScalarCosts.find(I) != ScalarCosts.end())
6639 continue;
6640
6641 // Compute the cost of the vector instruction. Note that this cost already
6642 // includes the scalarization overhead of the predicated instruction.
6643 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6644
6645 // Compute the cost of the scalarized instruction. This cost is the cost of
6646 // the instruction as if it wasn't if-converted and instead remained in the
6647 // predicated block. We will scale this cost by block probability after
6648 // computing the scalarization overhead.
6649    assert(!VF.isScalable() && "scalable vectors not yet supported.");
6650 InstructionCost ScalarCost =
6651 VF.getKnownMinValue() *
6652 getInstructionCost(I, ElementCount::getFixed(1)).first;
6653
6654 // Compute the scalarization overhead of needed insertelement instructions
6655 // and phi nodes.
6656 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6657 ScalarCost += TTI.getScalarizationOverhead(
6658 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6659 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6660      assert(!VF.isScalable() && "scalable vectors not yet supported.");
6661 ScalarCost +=
6662 VF.getKnownMinValue() *
6663 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6664 }
6665
6666 // Compute the scalarization overhead of needed extractelement
6667 // instructions. For each of the instruction's operands, if the operand can
6668 // be scalarized, add it to the worklist; otherwise, account for the
6669 // overhead.
6670 for (Use &U : I->operands())
6671 if (auto *J = dyn_cast<Instruction>(U.get())) {
6672        assert(VectorType::isValidElementType(J->getType()) &&
6673               "Instruction has non-scalar type");
6674 if (canBeScalarized(J))
6675 Worklist.push_back(J);
6676 else if (needsExtract(J, VF)) {
6677          assert(!VF.isScalable() && "scalable vectors not yet supported.");
6678 ScalarCost += TTI.getScalarizationOverhead(
6679 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6680 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6681 }
6682 }
6683
6684 // Scale the total scalar cost by block probability.
6685 ScalarCost /= getReciprocalPredBlockProb();
6686
6687 // Compute the discount. A non-negative discount means the vector version
6688 // of the instruction costs more, and scalarizing would be beneficial.
6689 Discount += VectorCost - ScalarCost;
6690 ScalarCosts[I] = ScalarCost;
6691 }
6692
6693 return *Discount.getValue();
6694}
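// Worked example (an illustrative sketch, not part of the original source):
// assuming getReciprocalPredBlockProb() returns 2, i.e. a predicated block is
// modelled as executing on about half of the iterations, a single-use chain
// with an accumulated vector cost of 10 and an accumulated scalar cost of 12
// is folded as
//   ScalarCost = 12 / 2 = 6
//   Discount  += VectorCost - ScalarCost = 10 - 6 = 4
// and the non-negative discount makes collectInstsToScalarize() record the
// chain in InstsToScalarize so its instructions are later scalarized.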
6695
6696LoopVectorizationCostModel::VectorizationCostTy
6697LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6698 VectorizationCostTy Cost;
6699
6700 // For each block.
6701 for (BasicBlock *BB : TheLoop->blocks()) {
6702 VectorizationCostTy BlockCost;
6703
6704 // For each instruction in the old loop.
6705 for (Instruction &I : BB->instructionsWithoutDebug()) {
6706 // Skip ignored values.
6707 if (ValuesToIgnore.count(&I) ||
6708 (VF.isVector() && VecValuesToIgnore.count(&I)))
6709 continue;
6710
6711 VectorizationCostTy C = getInstructionCost(&I, VF);
6712
6713 // Check if we should override the cost.
6714 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6715 C.first = InstructionCost(ForceTargetInstructionCost);
6716
6717 BlockCost.first += C.first;
6718 BlockCost.second |= C.second;
6719      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6720                        << " for VF " << VF << " For instruction: " << I
6721                        << '\n');
6722 }
6723
6724 // If we are vectorizing a predicated block, it will have been
6725 // if-converted. This means that the block's instructions (aside from
6726 // stores and instructions that may divide by zero) will now be
6727 // unconditionally executed. For the scalar case, we may not always execute
6728 // the predicated block, if it is an if-else block. Thus, scale the block's
6729 // cost by the probability of executing it. blockNeedsPredication from
6730 // Legal is used so as to not include all blocks in tail folded loops.
6731 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6732 BlockCost.first /= getReciprocalPredBlockProb();
6733
6734 Cost.first += BlockCost.first;
6735 Cost.second |= BlockCost.second;
6736 }
6737
6738 return Cost;
6739}
6740
6741/// Gets Address Access SCEV after verifying that the access pattern
6742/// is loop invariant except the induction variable dependence.
6743///
6744/// This SCEV can be sent to the Target in order to estimate the address
6745/// calculation cost.
6746static const SCEV *getAddressAccessSCEV(
6747 Value *Ptr,
6748 LoopVectorizationLegality *Legal,
6749 PredicatedScalarEvolution &PSE,
6750 const Loop *TheLoop) {
6751
6752 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6753 if (!Gep)
6754 return nullptr;
6755
6756 // We are looking for a gep with all loop invariant indices except for one
6757 // which should be an induction variable.
6758 auto SE = PSE.getSE();
6759 unsigned NumOperands = Gep->getNumOperands();
6760 for (unsigned i = 1; i < NumOperands; ++i) {
6761 Value *Opd = Gep->getOperand(i);
6762 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6763 !Legal->isInductionVariable(Opd))
6764 return nullptr;
6765 }
6766
6767  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6768 return PSE.getSCEV(Ptr);
6769}
6770
6771static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6772 return Legal->hasStride(I->getOperand(0)) ||
6773 Legal->hasStride(I->getOperand(1));
6774}
6775
6776InstructionCost
6777LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6778 ElementCount VF) {
6779  assert(VF.isVector() &&
6780         "Scalarization cost of instruction implies vectorization.");
6781  assert(!VF.isScalable() && "scalable vectors not yet supported.");
6782 Type *ValTy = getMemInstValueType(I);
6783 auto SE = PSE.getSE();
6784
6785 unsigned AS = getLoadStoreAddressSpace(I);
6786 Value *Ptr = getLoadStorePointerOperand(I);
6787 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6788
6789 // Figure out whether the access is strided and get the stride value
6790  // if it's known at compile time.
6791 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6792
6793 // Get the cost of the scalar memory instruction and address computation.
6794 InstructionCost Cost =
6795 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6796
6797 // Don't pass *I here, since it is scalar but will actually be part of a
6798 // vectorized loop where the user of it is a vectorized instruction.
6799 const Align Alignment = getLoadStoreAlignment(I);
6800 Cost += VF.getKnownMinValue() *
6801 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6802 AS, TTI::TCK_RecipThroughput);
6803
6804 // Get the overhead of the extractelement and insertelement instructions
6805 // we might create due to scalarization.
6806 Cost += getScalarizationOverhead(I, VF);
6807
6808 // If we have a predicated load/store, it will need extra i1 extracts and
6809 // conditional branches, but may not be executed for each vector lane. Scale
6810 // the cost by the probability of executing the predicated block.
6811 if (isPredicatedInst(I)) {
6812 Cost /= getReciprocalPredBlockProb();
6813
6814 // Add the cost of an i1 extract and a branch
6815 auto *Vec_i1Ty =
6816 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6817 Cost += TTI.getScalarizationOverhead(
6818 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6819 /*Insert=*/false, /*Extract=*/true);
6820 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6821
6822 if (useEmulatedMaskMemRefHack(I))
6823 // Artificially setting to a high enough value to practically disable
6824 // vectorization with such operations.
6825 Cost = 3000000;
6826 }
6827
6828 return Cost;
6829}
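// Rough shape of the scalarization estimate above, as a hedged sketch (the
// individual terms are target-dependent TTI queries): for VF = 4 and a
// predicated access,
//   Cost  = 4 * AddressComputationCost + 4 * ScalarMemoryOpCost
//         + InsertExtractOverhead
//   Cost /= getReciprocalPredBlockProb()      // scale by block probability
//   Cost += i1ExtractOverhead + BranchCost    // per-lane predication control
// and useEmulatedMaskMemRefHack(I) replaces the whole estimate with the
// artificial constant 3000000 to effectively forbid vectorization.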
6830
6831InstructionCost
6832LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6833 ElementCount VF) {
6834 Type *ValTy = getMemInstValueType(I);
6835 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6836 Value *Ptr = getLoadStorePointerOperand(I);
6837 unsigned AS = getLoadStoreAddressSpace(I);
6838 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6839 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6840
6841  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6842         "Stride should be 1 or -1 for consecutive memory access");
6843 const Align Alignment = getLoadStoreAlignment(I);
6844 InstructionCost Cost = 0;
6845 if (Legal->isMaskRequired(I))
6846 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6847 CostKind);
6848 else
6849 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6850 CostKind, I);
6851
6852 bool Reverse = ConsecutiveStride < 0;
6853 if (Reverse)
6854 Cost +=
6855 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6856 return Cost;
6857}
6858
6859InstructionCost
6860LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6861 ElementCount VF) {
6862  assert(Legal->isUniformMemOp(*I));
6863
6864 Type *ValTy = getMemInstValueType(I);
6865 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6866 const Align Alignment = getLoadStoreAlignment(I);
6867 unsigned AS = getLoadStoreAddressSpace(I);
6868 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6869 if (isa<LoadInst>(I)) {
6870 return TTI.getAddressComputationCost(ValTy) +
6871 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6872 CostKind) +
6873 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6874 }
6875 StoreInst *SI = cast<StoreInst>(I);
6876
6877 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6878 return TTI.getAddressComputationCost(ValTy) +
6879 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6880 CostKind) +
6881 (isLoopInvariantStoreValue
6882 ? 0
6883 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6884 VF.getKnownMinValue() - 1));
6885}
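// Hedged summary of the uniform memory op costing above (all terms are
// target-specific TTI values):
//   uniform load : AddressComputation + scalar Load + SK_Broadcast shuffle
//   uniform store: AddressComputation + scalar Store
//                  + (loop-invariant stored value ? 0 : extract of lane VF-1)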
6886
6887InstructionCost
6888LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6889 ElementCount VF) {
6890 Type *ValTy = getMemInstValueType(I);
6891 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6892 const Align Alignment = getLoadStoreAlignment(I);
6893 const Value *Ptr = getLoadStorePointerOperand(I);
6894
6895 return TTI.getAddressComputationCost(VectorTy) +
6896 TTI.getGatherScatterOpCost(
6897 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6898 TargetTransformInfo::TCK_RecipThroughput, I);
6899}
6900
6901InstructionCost
6902LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6903 ElementCount VF) {
6904 // TODO: Once we have support for interleaving with scalable vectors
6905 // we can calculate the cost properly here.
6906 if (VF.isScalable())
6907 return InstructionCost::getInvalid();
6908
6909 Type *ValTy = getMemInstValueType(I);
6910 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6911 unsigned AS = getLoadStoreAddressSpace(I);
6912
6913 auto Group = getInterleavedAccessGroup(I);
6914  assert(Group && "Fail to get an interleaved access group.");
6915
6916 unsigned InterleaveFactor = Group->getFactor();
6917 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6918
6919 // Holds the indices of existing members in an interleaved load group.
6920 // An interleaved store group doesn't need this as it doesn't allow gaps.
6921 SmallVector<unsigned, 4> Indices;
6922 if (isa<LoadInst>(I)) {
6923 for (unsigned i = 0; i < InterleaveFactor; i++)
6924 if (Group->getMember(i))
6925 Indices.push_back(i);
6926 }
6927
6928 // Calculate the cost of the whole interleaved group.
6929 bool UseMaskForGaps =
6930 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6931 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6932 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6933 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6934
6935 if (Group->isReverse()) {
6936 // TODO: Add support for reversed masked interleaved access.
6937    assert(!Legal->isMaskRequired(I) &&
6938           "Reverse masked interleaved access not supported.");
6939 Cost +=
6940 Group->getNumMembers() *
6941 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6942 }
6943 return Cost;
6944}
6945
6946InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6947 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6948 // Early exit for no inloop reductions
6949 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6950 return InstructionCost::getInvalid();
6951 auto *VectorTy = cast<VectorType>(Ty);
6952
6953 // We are looking for a pattern of, and finding the minimal acceptable cost:
6954 // reduce(mul(ext(A), ext(B))) or
6955 // reduce(mul(A, B)) or
6956 // reduce(ext(A)) or
6957 // reduce(A).
6958 // The basic idea is that we walk down the tree to do that, finding the root
6959 // reduction instruction in InLoopReductionImmediateChains. From there we find
6960 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6961 // of the components. If the reduction cost is lower then we return it for the
6962 // reduction instruction and 0 for the other instructions in the pattern. If
6963  // it is not, we return an invalid cost specifying the original cost method
6964 // should be used.
6965 Instruction *RetI = I;
6966 if ((RetI->getOpcode() == Instruction::SExt ||
6967 RetI->getOpcode() == Instruction::ZExt)) {
6968 if (!RetI->hasOneUser())
6969 return InstructionCost::getInvalid();
6970 RetI = RetI->user_back();
6971 }
6972 if (RetI->getOpcode() == Instruction::Mul &&
6973 RetI->user_back()->getOpcode() == Instruction::Add) {
6974 if (!RetI->hasOneUser())
6975 return InstructionCost::getInvalid();
6976 RetI = RetI->user_back();
6977 }
6978
6979 // Test if the found instruction is a reduction, and if not return an invalid
6980 // cost specifying the parent to use the original cost modelling.
6981 if (!InLoopReductionImmediateChains.count(RetI))
6982 return InstructionCost::getInvalid();
6983
6984 // Find the reduction this chain is a part of and calculate the basic cost of
6985 // the reduction on its own.
6986 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6987 Instruction *ReductionPhi = LastChain;
6988 while (!isa<PHINode>(ReductionPhi))
6989 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6990
6991 RecurrenceDescriptor RdxDesc =
6992 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
6993 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6994 RdxDesc.getOpcode(), VectorTy, false, CostKind);
6995
6996 // Get the operand that was not the reduction chain and match it to one of the
6997 // patterns, returning the better cost if it is found.
6998 Instruction *RedOp = RetI->getOperand(1) == LastChain
6999 ? dyn_cast<Instruction>(RetI->getOperand(0))
7000 : dyn_cast<Instruction>(RetI->getOperand(1));
7001
7002 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7003
7004 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
7005 !TheLoop->isLoopInvariant(RedOp)) {
7006 bool IsUnsigned = isa<ZExtInst>(RedOp);
7007 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7008 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7009 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7010 CostKind);
7011
7012 InstructionCost ExtCost =
7013 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7014 TTI::CastContextHint::None, CostKind, RedOp);
7015 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7016 return I == RetI ? *RedCost.getValue() : 0;
7017 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
7018 Instruction *Mul = RedOp;
7019 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
7020 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
7021 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
7022 Op0->getOpcode() == Op1->getOpcode() &&
7023 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7024 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7025 bool IsUnsigned = isa<ZExtInst>(Op0);
7026 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7027 // reduce(mul(ext, ext))
7028 InstructionCost ExtCost =
7029 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7030 TTI::CastContextHint::None, CostKind, Op0);
7031 InstructionCost MulCost =
7032 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7033
7034 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7035 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7036 CostKind);
7037
7038 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7039 return I == RetI ? *RedCost.getValue() : 0;
7040 } else {
7041 InstructionCost MulCost =
7042 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7043
7044 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7045 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7046 CostKind);
7047
7048 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7049 return I == RetI ? *RedCost.getValue() : 0;
7050 }
7051 }
7052
7053 return I == RetI ? BaseCost : InstructionCost::getInvalid();
7054}
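// Illustrative scalar-loop pattern matched above (made-up value names, shown
// only as a sketch):
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %m     = mul nsw i32 %a.ext, %b.ext
//   %sum   = add i32 %m, %phi        ; the in-loop reduction; RetI ends here
// When costed for a vector VF, the whole chain may be covered by a single
// TTI.getExtendedAddReductionCost(/*IsMLA=*/true, ...) query; if that beats
// the separate ext + mul + reduction costs, the add carries the reduction
// cost and the ext/mul instructions report a cost of 0.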
7055
7056InstructionCost
7057LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7058 ElementCount VF) {
7059 // Calculate scalar cost only. Vectorization cost should be ready at this
7060 // moment.
7061 if (VF.isScalar()) {
7062 Type *ValTy = getMemInstValueType(I);
7063 const Align Alignment = getLoadStoreAlignment(I);
7064 unsigned AS = getLoadStoreAddressSpace(I);
7065
7066 return TTI.getAddressComputationCost(ValTy) +
7067 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7068 TTI::TCK_RecipThroughput, I);
7069 }
7070 return getWideningCost(I, VF);
7071}
7072
7073LoopVectorizationCostModel::VectorizationCostTy
7074LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7075 ElementCount VF) {
7076 // If we know that this instruction will remain uniform, check the cost of
7077 // the scalar version.
7078 if (isUniformAfterVectorization(I, VF))
7079 VF = ElementCount::getFixed(1);
7080
7081 if (VF.isVector() && isProfitableToScalarize(I, VF))
7082 return VectorizationCostTy(InstsToScalarize[VF][I], false);
7083
7084 // Forced scalars do not have any scalarization overhead.
7085 auto ForcedScalar = ForcedScalars.find(VF);
7086 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7087 auto InstSet = ForcedScalar->second;
7088 if (InstSet.count(I))
7089 return VectorizationCostTy(
7090 (getInstructionCost(I, ElementCount::getFixed(1)).first *
7091 VF.getKnownMinValue()),
7092 false);
7093 }
7094
7095 Type *VectorTy;
7096 InstructionCost C = getInstructionCost(I, VF, VectorTy);
7097
7098 bool TypeNotScalarized =
7099 VF.isVector() && VectorTy->isVectorTy() &&
7100 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7101 return VectorizationCostTy(C, TypeNotScalarized);
7102}
7103
7104InstructionCost
7105LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7106 ElementCount VF) const {
7107
7108 if (VF.isScalable())
7109 return InstructionCost::getInvalid();
7110
7111 if (VF.isScalar())
7112 return 0;
7113
7114 InstructionCost Cost = 0;
7115 Type *RetTy = ToVectorTy(I->getType(), VF);
7116 if (!RetTy->isVoidTy() &&
7117 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7118 Cost += TTI.getScalarizationOverhead(
7119 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7120 true, false);
7121
7122 // Some targets keep addresses scalar.
7123 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7124 return Cost;
7125
7126 // Some targets support efficient element stores.
7127 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7128 return Cost;
7129
7130 // Collect operands to consider.
7131 CallInst *CI = dyn_cast<CallInst>(I);
7132 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7133
7134 // Skip operands that do not require extraction/scalarization and do not incur
7135 // any overhead.
7136 SmallVector<Type *> Tys;
7137 for (auto *V : filterExtractingOperands(Ops, VF))
7138 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7139 return Cost + TTI.getOperandsScalarizationOverhead(
7140 filterExtractingOperands(Ops, VF), Tys);
7141}
7142
7143void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7144 if (VF.isScalar())
7145 return;
7146 NumPredStores = 0;
7147 for (BasicBlock *BB : TheLoop->blocks()) {
7148 // For each instruction in the old loop.
7149 for (Instruction &I : *BB) {
7150 Value *Ptr = getLoadStorePointerOperand(&I);
7151 if (!Ptr)
7152 continue;
7153
7154 // TODO: We should generate better code and update the cost model for
7155 // predicated uniform stores. Today they are treated as any other
7156 // predicated store (see added test cases in
7157 // invariant-store-vectorization.ll).
7158 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7159 NumPredStores++;
7160
7161 if (Legal->isUniformMemOp(I)) {
7162 // TODO: Avoid replicating loads and stores instead of
7163 // relying on instcombine to remove them.
7164 // Load: Scalar load + broadcast
7165 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7166 InstructionCost Cost = getUniformMemOpCost(&I, VF);
7167 setWideningDecision(&I, VF, CM_Scalarize, Cost);
7168 continue;
7169 }
7170
7171 // We assume that widening is the best solution when possible.
7172 if (memoryInstructionCanBeWidened(&I, VF)) {
7173 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7174 int ConsecutiveStride =
7175 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7176        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7177               "Expected consecutive stride.");
7178 InstWidening Decision =
7179 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7180 setWideningDecision(&I, VF, Decision, Cost);
7181 continue;
7182 }
7183
7184 // Choose between Interleaving, Gather/Scatter or Scalarization.
7185 InstructionCost InterleaveCost = InstructionCost::getInvalid();
7186 unsigned NumAccesses = 1;
7187 if (isAccessInterleaved(&I)) {
7188 auto Group = getInterleavedAccessGroup(&I);
7189        assert(Group && "Fail to get an interleaved access group.");
7190
7191 // Make one decision for the whole group.
7192 if (getWideningDecision(&I, VF) != CM_Unknown)
7193 continue;
7194
7195 NumAccesses = Group->getNumMembers();
7196 if (interleavedAccessCanBeWidened(&I, VF))
7197 InterleaveCost = getInterleaveGroupCost(&I, VF);
7198 }
7199
7200 InstructionCost GatherScatterCost =
7201 isLegalGatherOrScatter(&I)
7202 ? getGatherScatterCost(&I, VF) * NumAccesses
7203 : InstructionCost::getInvalid();
7204
7205 InstructionCost ScalarizationCost =
7206 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses
7207 : InstructionCost::getInvalid();
7208
7209 // Choose better solution for the current VF,
7210 // write down this decision and use it during vectorization.
7211 InstructionCost Cost;
7212 InstWidening Decision;
7213 if (InterleaveCost <= GatherScatterCost &&
7214 InterleaveCost < ScalarizationCost) {
7215 Decision = CM_Interleave;
7216 Cost = InterleaveCost;
7217 } else if (GatherScatterCost < ScalarizationCost) {
7218 Decision = CM_GatherScatter;
7219 Cost = GatherScatterCost;
7220 } else {
7221        assert(!VF.isScalable() &&
7222               "We cannot yet scalarise for scalable vectors");
7223 Decision = CM_Scalarize;
7224 Cost = ScalarizationCost;
7225 }
7226      // If the instruction belongs to an interleave group, the whole group
7227 // receives the same decision. The whole group receives the cost, but
7228 // the cost will actually be assigned to one instruction.
7229 if (auto Group = getInterleavedAccessGroup(&I))
7230 setWideningDecision(Group, VF, Decision, Cost);
7231 else
7232 setWideningDecision(&I, VF, Decision, Cost);
7233 }
7234 }
7235
7236 // Make sure that any load of address and any other address computation
7237 // remains scalar unless there is gather/scatter support. This avoids
7238 // inevitable extracts into address registers, and also has the benefit of
7239 // activating LSR more, since that pass can't optimize vectorized
7240 // addresses.
7241 if (TTI.prefersVectorizedAddressing())
7242 return;
7243
7244 // Start with all scalar pointer uses.
7245 SmallPtrSet<Instruction *, 8> AddrDefs;
7246 for (BasicBlock *BB : TheLoop->blocks())
7247 for (Instruction &I : *BB) {
7248 Instruction *PtrDef =
7249 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7250 if (PtrDef && TheLoop->contains(PtrDef) &&
7251 getWideningDecision(&I, VF) != CM_GatherScatter)
7252 AddrDefs.insert(PtrDef);
7253 }
7254
7255 // Add all instructions used to generate the addresses.
7256 SmallVector<Instruction *, 4> Worklist;
7257 append_range(Worklist, AddrDefs);
7258 while (!Worklist.empty()) {
7259 Instruction *I = Worklist.pop_back_val();
7260 for (auto &Op : I->operands())
7261 if (auto *InstOp = dyn_cast<Instruction>(Op))
7262 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7263 AddrDefs.insert(InstOp).second)
7264 Worklist.push_back(InstOp);
7265 }
7266
7267 for (auto *I : AddrDefs) {
7268 if (isa<LoadInst>(I)) {
7269      // Setting the desired widening decision should ideally be handled
7270 // by cost functions, but since this involves the task of finding out
7271 // if the loaded register is involved in an address computation, it is
7272 // instead changed here when we know this is the case.
7273 InstWidening Decision = getWideningDecision(I, VF);
7274 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7275 // Scalarize a widened load of address.
7276 setWideningDecision(
7277 I, VF, CM_Scalarize,
7278 (VF.getKnownMinValue() *
7279 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7280 else if (auto Group = getInterleavedAccessGroup(I)) {
7281 // Scalarize an interleave group of address loads.
7282 for (unsigned I = 0; I < Group->getFactor(); ++I) {
7283 if (Instruction *Member = Group->getMember(I))
7284 setWideningDecision(
7285 Member, VF, CM_Scalarize,
7286 (VF.getKnownMinValue() *
7287 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7288 }
7289 }
7290 } else
7291      // Make sure I gets scalarized and receives a cost estimate without
7292 // scalarization overhead.
7293 ForcedScalars[VF].insert(I);
7294 }
7295}
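// Hedged example of the per-instruction choice made in the loop above: for a
// hypothetical masked load with
//   InterleaveCost    = invalid (not part of a widenable interleave group)
//   GatherScatterCost = 12
//   ScalarizationCost = 20
// the gather/scatter branch wins, so CM_GatherScatter with cost 12 is recorded
// via setWideningDecision(); ties between interleaving and gather/scatter are
// broken in favour of interleaving (note the <= versus <).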
7296
7297InstructionCost
7298LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7299 Type *&VectorTy) {
7300 Type *RetTy = I->getType();
7301 if (canTruncateToMinimalBitwidth(I, VF))
7302 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7303 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7304 auto SE = PSE.getSE();
7305 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7306
7307 // TODO: We need to estimate the cost of intrinsic calls.
7308 switch (I->getOpcode()) {
7309 case Instruction::GetElementPtr:
7310 // We mark this instruction as zero-cost because the cost of GEPs in
7311 // vectorized code depends on whether the corresponding memory instruction
7312 // is scalarized or not. Therefore, we handle GEPs with the memory
7313 // instruction cost.
7314 return 0;
7315 case Instruction::Br: {
7316 // In cases of scalarized and predicated instructions, there will be VF
7317 // predicated blocks in the vectorized loop. Each branch around these
7318    // blocks also requires an extract of its vector compare i1 element.
7319 bool ScalarPredicatedBB = false;
7320 BranchInst *BI = cast<BranchInst>(I);
7321 if (VF.isVector() && BI->isConditional() &&
7322 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7323 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7324 ScalarPredicatedBB = true;
7325
7326 if (ScalarPredicatedBB) {
7327 // Return cost for branches around scalarized and predicated blocks.
7328      assert(!VF.isScalable() && "scalable vectors not yet supported.");
7329 auto *Vec_i1Ty =
7330 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7331 return (TTI.getScalarizationOverhead(
7332 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7333 false, true) +
7334 (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7335 VF.getKnownMinValue()));
7336 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7337 // The back-edge branch will remain, as will all scalar branches.
7338 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7339 else
7340 // This branch will be eliminated by if-conversion.
7341 return 0;
7342 // Note: We currently assume zero cost for an unconditional branch inside
7343 // a predicated block since it will become a fall-through, although we
7344 // may decide in the future to call TTI for all branches.
7345 }
7346 case Instruction::PHI: {
7347 auto *Phi = cast<PHINode>(I);
7348
7349 // First-order recurrences are replaced by vector shuffles inside the loop.
7350 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7351 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7352 return TTI.getShuffleCost(
7353 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7354 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7355
7356 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7357 // converted into select instructions. We require N - 1 selects per phi
7358 // node, where N is the number of incoming values.
7359 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7360 return (Phi->getNumIncomingValues() - 1) *
7361 TTI.getCmpSelInstrCost(
7362 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7363 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7364 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7365
7366 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7367 }
7368 case Instruction::UDiv:
7369 case Instruction::SDiv:
7370 case Instruction::URem:
7371 case Instruction::SRem:
7372 // If we have a predicated instruction, it may not be executed for each
7373 // vector lane. Get the scalarization cost and scale this amount by the
7374 // probability of executing the predicated block. If the instruction is not
7375 // predicated, we fall through to the next case.
7376 if (VF.isVector() && isScalarWithPredication(I)) {
7377 InstructionCost Cost = 0;
7378
7379 // These instructions have a non-void type, so account for the phi nodes
7380 // that we will create. This cost is likely to be zero. The phi node
7381 // cost, if any, should be scaled by the block probability because it
7382 // models a copy at the end of each predicated block.
7383 Cost += VF.getKnownMinValue() *
7384 TTI.getCFInstrCost(Instruction::PHI, CostKind);
7385
7386 // The cost of the non-predicated instruction.
7387 Cost += VF.getKnownMinValue() *
7388 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7389
7390 // The cost of insertelement and extractelement instructions needed for
7391 // scalarization.
7392 Cost += getScalarizationOverhead(I, VF);
7393
7394 // Scale the cost by the probability of executing the predicated blocks.
7395 // This assumes the predicated block for each vector lane is equally
7396 // likely.
7397 return Cost / getReciprocalPredBlockProb();
7398 }
7399    LLVM_FALLTHROUGH;
7400 case Instruction::Add:
7401 case Instruction::FAdd:
7402 case Instruction::Sub:
7403 case Instruction::FSub:
7404 case Instruction::Mul:
7405 case Instruction::FMul:
7406 case Instruction::FDiv:
7407 case Instruction::FRem:
7408 case Instruction::Shl:
7409 case Instruction::LShr:
7410 case Instruction::AShr:
7411 case Instruction::And:
7412 case Instruction::Or:
7413 case Instruction::Xor: {
7414    // Since we will replace the stride by 1, the multiplication should go away.
7415 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7416 return 0;
7417
7418 // Detect reduction patterns
7419 InstructionCost RedCost;
7420 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7421 .isValid())
7422 return RedCost;
7423
7424 // Certain instructions can be cheaper to vectorize if they have a constant
7425 // second vector operand. One example of this are shifts on x86.
7426 Value *Op2 = I->getOperand(1);
7427 TargetTransformInfo::OperandValueProperties Op2VP;
7428 TargetTransformInfo::OperandValueKind Op2VK =
7429 TTI.getOperandInfo(Op2, Op2VP);
7430 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7431 Op2VK = TargetTransformInfo::OK_UniformValue;
7432
7433 SmallVector<const Value *, 4> Operands(I->operand_values());
7434 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7435 return N * TTI.getArithmeticInstrCost(
7436 I->getOpcode(), VectorTy, CostKind,
7437 TargetTransformInfo::OK_AnyValue,
7438 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7439 }
7440 case Instruction::FNeg: {
7441    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7442 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7443 return N * TTI.getArithmeticInstrCost(
7444 I->getOpcode(), VectorTy, CostKind,
7445 TargetTransformInfo::OK_AnyValue,
7446 TargetTransformInfo::OK_AnyValue,
7447 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7448 I->getOperand(0), I);
7449 }
7450 case Instruction::Select: {
7451 SelectInst *SI = cast<SelectInst>(I);
7452 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7453 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7454
7455 const Value *Op0, *Op1;
7456 using namespace llvm::PatternMatch;
7457 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7458 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7459 // select x, y, false --> x & y
7460 // select x, true, y --> x | y
7461 TTI::OperandValueProperties Op1VP = TTI::OP_None;
7462 TTI::OperandValueProperties Op2VP = TTI::OP_None;
7463 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7464 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7465    assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7466           Op1->getType()->getScalarSizeInBits() == 1);
7467
7468 SmallVector<const Value *, 2> Operands{Op0, Op1};
7469 return TTI.getArithmeticInstrCost(
7470 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7471 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7472 }
7473
7474 Type *CondTy = SI->getCondition()->getType();
7475 if (!ScalarCond)
7476 CondTy = VectorType::get(CondTy, VF);
7477 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7478 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7479 }
7480 case Instruction::ICmp:
7481 case Instruction::FCmp: {
7482 Type *ValTy = I->getOperand(0)->getType();
7483 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7484 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7485 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7486 VectorTy = ToVectorTy(ValTy, VF);
7487 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7488 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7489 }
7490 case Instruction::Store:
7491 case Instruction::Load: {
7492 ElementCount Width = VF;
7493 if (Width.isVector()) {
7494 InstWidening Decision = getWideningDecision(I, Width);
7495      assert(Decision != CM_Unknown &&
7496             "CM decision should be taken at this point");
7497 if (Decision == CM_Scalarize)
7498 Width = ElementCount::getFixed(1);
7499 }
7500 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7501 return getMemoryInstructionCost(I, VF);
7502 }
7503 case Instruction::ZExt:
7504 case Instruction::SExt:
7505 case Instruction::FPToUI:
7506 case Instruction::FPToSI:
7507 case Instruction::FPExt:
7508 case Instruction::PtrToInt:
7509 case Instruction::IntToPtr:
7510 case Instruction::SIToFP:
7511 case Instruction::UIToFP:
7512 case Instruction::Trunc:
7513 case Instruction::FPTrunc:
7514 case Instruction::BitCast: {
7515 // Computes the CastContextHint from a Load/Store instruction.
7516 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7517      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7518             "Expected a load or a store!");
7519
7520 if (VF.isScalar() || !TheLoop->contains(I))
7521 return TTI::CastContextHint::Normal;
7522
7523 switch (getWideningDecision(I, VF)) {
7524 case LoopVectorizationCostModel::CM_GatherScatter:
7525 return TTI::CastContextHint::GatherScatter;
7526 case LoopVectorizationCostModel::CM_Interleave:
7527 return TTI::CastContextHint::Interleave;
7528 case LoopVectorizationCostModel::CM_Scalarize:
7529 case LoopVectorizationCostModel::CM_Widen:
7530 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7531 : TTI::CastContextHint::Normal;
7532 case LoopVectorizationCostModel::CM_Widen_Reverse:
7533 return TTI::CastContextHint::Reversed;
7534 case LoopVectorizationCostModel::CM_Unknown:
7535 llvm_unreachable("Instr did not go through cost modelling?")::llvm::llvm_unreachable_internal("Instr did not go through cost modelling?"
, "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7535)
;
7536 }
7537
7538 llvm_unreachable("Unhandled case!")::llvm::llvm_unreachable_internal("Unhandled case!", "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7538)
;
7539 };
7540
7541 unsigned Opcode = I->getOpcode();
7542 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7543 // For Trunc, the context is the only user, which must be a StoreInst.
7544 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7545 if (I->hasOneUse())
7546 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7547 CCH = ComputeCCH(Store);
7548 }
7549 // For Z/Sext, the context is the operand, which must be a LoadInst.
7550 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7551 Opcode == Instruction::FPExt) {
7552 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7553 CCH = ComputeCCH(Load);
7554 }
7555
7556 // We optimize the truncation of induction variables having constant
7557 // integer steps. The cost of these truncations is the same as the scalar
7558 // operation.
7559 if (isOptimizableIVTruncate(I, VF)) {
7560 auto *Trunc = cast<TruncInst>(I);
7561 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7562 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7563 }
7564
7565 // Detect reduction patterns
7566 InstructionCost RedCost;
7567 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7568 .isValid())
7569 return RedCost;
7570
7571 Type *SrcScalarTy = I->getOperand(0)->getType();
7572 Type *SrcVecTy =
7573 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7574 if (canTruncateToMinimalBitwidth(I, VF)) {
7575 // This cast is going to be shrunk. This may remove the cast or it might
7576      // turn it into a slightly different cast. For example, if MinBW == 16,
7577 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7578 //
7579 // Calculate the modified src and dest types.
7580 Type *MinVecTy = VectorTy;
7581 if (Opcode == Instruction::Trunc) {
7582 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7583 VectorTy =
7584 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7585 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7586 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7587 VectorTy =
7588 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7589 }
7590 }
7591
7592 unsigned N;
7593 if (isScalarAfterVectorization(I, VF)) {
7594      assert(!VF.isScalable() && "VF is assumed to be non scalable");
7595 N = VF.getKnownMinValue();
7596 } else
7597 N = 1;
7598 return N *
7599 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7600 }
7601 case Instruction::Call: {
7602 bool NeedToScalarize;
7603 CallInst *CI = cast<CallInst>(I);
7604 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7605 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7606 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7607 return std::min(CallCost, IntrinsicCost);
7608 }
7609 return CallCost;
7610 }
7611 case Instruction::ExtractValue:
7612 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7613 default:
7614 // The cost of executing VF copies of the scalar instruction. This opcode
7615 // is unknown. Assume that it is the same as 'mul'.
7616 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7617 Instruction::Mul, VectorTy, CostKind) +
7618 getScalarizationOverhead(I, VF);
7619 } // end of switch.
7620}
7621
7622char LoopVectorize::ID = 0;
7623
7624static const char lv_name[] = "Loop Vectorization";
7625
7626INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7627INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7628INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7629INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7630INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7631INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7632INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7633INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7634INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7635INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7636INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7637INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7638INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7639INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7640INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7641INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7642
7643namespace llvm {
7644
7645Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7646
7647Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7648 bool VectorizeOnlyWhenForced) {
7649 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7650}
7651
7652} // end namespace llvm
7653
7654bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7655 // Check if the pointer operand of a load or store instruction is
7656 // consecutive.
7657 if (auto *Ptr = getLoadStorePointerOperand(Inst))
7658 return Legal->isConsecutivePtr(Ptr);
7659 return false;
7660}
7661
7662void LoopVectorizationCostModel::collectValuesToIgnore() {
7663 // Ignore ephemeral values.
7664 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7665
7666 // Ignore type-promoting instructions we identified during reduction
7667 // detection.
7668 for (auto &Reduction : Legal->getReductionVars()) {
7669 RecurrenceDescriptor &RedDes = Reduction.second;
7670 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7671 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7672 }
7673 // Ignore type-casting instructions we identified during induction
7674 // detection.
7675 for (auto &Induction : Legal->getInductionVars()) {
7676 InductionDescriptor &IndDes = Induction.second;
7677 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7678 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7679 }
7680}
7681
7682void LoopVectorizationCostModel::collectInLoopReductions() {
7683 for (auto &Reduction : Legal->getReductionVars()) {
7684 PHINode *Phi = Reduction.first;
7685 RecurrenceDescriptor &RdxDesc = Reduction.second;
7686
7687 // We don't collect reductions that are type promoted (yet).
7688 if (RdxDesc.getRecurrenceType() != Phi->getType())
7689 continue;
7690
7691 // If the target would prefer this reduction to happen "in-loop", then we
7692 // want to record it as such.
7693 unsigned Opcode = RdxDesc.getOpcode();
7694 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7695 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7696 TargetTransformInfo::ReductionFlags()))
7697 continue;
7698
7699 // Check that we can correctly put the reductions into the loop, by
7700 // finding the chain of operations that leads from the phi to the loop
7701 // exit value.
7702 SmallVector<Instruction *, 4> ReductionOperations =
7703 RdxDesc.getReductionOpChain(Phi, TheLoop);
7704 bool InLoop = !ReductionOperations.empty();
7705 if (InLoop) {
7706 InLoopReductionChains[Phi] = ReductionOperations;
7707 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7708 Instruction *LastChain = Phi;
7709 for (auto *I : ReductionOperations) {
7710 InLoopReductionImmediateChains[I] = LastChain;
7711 LastChain = I;
7712 }
7713 }
7714 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7715 << " reduction for phi: " << *Phi << "\n");
7716 }
7717}
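Aside, not part of LoopVectorize.cpp: a minimal standalone sketch (plain STL containers instead of the real LLVM types) of how the immediate-chain map built above links each in-loop reduction operation back to its predecessor in the chain, starting from the phi. The instruction names are hypothetical.

#include <map>
#include <string>
#include <vector>
#include <cstdio>

int main() {
  // Hypothetical reduction chain for "sum += a[i]": phi -> add1 -> add2.
  std::vector<std::string> ReductionOperations = {"add1", "add2"};
  std::map<std::string, std::string> ImmediateChains;

  std::string LastChain = "phi";
  for (const auto &I : ReductionOperations) {
    ImmediateChains[I] = LastChain; // each op remembers the value it reduces into
    LastChain = I;
  }
  for (const auto &KV : ImmediateChains)
    std::printf("%s <- %s\n", KV.first.c_str(), KV.second.c_str());
}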
7718
7719// TODO: we could return a pair of values that specify the max VF and
7720// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7721// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7722// doesn't have a cost model that can choose which plan to execute if
7723// more than one is generated.
7724static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7725 LoopVectorizationCostModel &CM) {
7726 unsigned WidestType;
7727 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7728 return WidestVectorRegBits / WidestType;
7729}
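Aside, not part of LoopVectorize.cpp: determineVPlanVF simply divides the widest vector register width by the widest scalar type the loop uses, so 256-bit registers with a widest element of 32 bits give a VF of 8. A minimal sketch with hypothetical numbers:

#include <cassert>
#include <cstdio>

static unsigned determineVPlanVFSketch(unsigned WidestVectorRegBits,
                                       unsigned WidestTypeBits) {
  assert(WidestTypeBits != 0 && "widest type must be known");
  return WidestVectorRegBits / WidestTypeBits; // e.g. 256 / 32 == 8
}

int main() {
  std::printf("VF = %u\n", determineVPlanVFSketch(256, 32)); // prints "VF = 8"
}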
7730
7731VectorizationFactor
7732LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7733 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7734 ElementCount VF = UserVF;
7735 // Outer loop handling: They may require CFG and instruction level
7736 // transformations before even evaluating whether vectorization is profitable.
7737 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7738 // the vectorization pipeline.
7739 if (!OrigLoop->isInnermost()) {
7740 // If the user doesn't provide a vectorization factor, determine a
7741 // reasonable one.
7742 if (UserVF.isZero()) {
7743 VF = ElementCount::getFixed(determineVPlanVF(
7744 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7745 .getFixedSize(),
7746 CM));
7747 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7748
7749 // Make sure we have a VF > 1 for stress testing.
7750 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7751 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7752 << "overriding computed VF.\n");
7753 VF = ElementCount::getFixed(4);
7754 }
7755 }
7756 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7757 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7758 "VF needs to be a power of two");
7759 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7760 << "VF " << VF << " to build VPlans.\n");
7761 buildVPlans(VF, VF);
7762
7763 // For VPlan build stress testing, we bail out after VPlan construction.
7764 if (VPlanBuildStressTest)
7765 return VectorizationFactor::Disabled();
7766
7767 return {VF, 0 /*Cost*/};
7768 }
7769
7770 LLVM_DEBUG(
7771 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7772 "VPlan-native path.\n");
7773 return VectorizationFactor::Disabled();
7774}
7775
7776Optional<VectorizationFactor>
7777LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7778 assert(OrigLoop->isInnermost() && "Inner loop expected.");
24: Assuming the condition is true
25: '?' condition is true
7779 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7780 if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
26: Taking false branch
7781 return None;
7782
7783 // Invalidate interleave groups if all blocks of loop will be predicated.
7784 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
27: Assuming the condition is false
7785 !useMaskedInterleavedAccesses(*TTI)) {
7786 LLVM_DEBUG(
7787 dbgs()
7788 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7789 "which requires masked-interleaved support.\n");
7790 if (CM.InterleaveInfo.invalidateGroups())
7791 // Invalidating interleave groups also requires invalidating all decisions
7792 // based on them, which includes widening decisions and uniform and scalar
7793 // values.
7794 CM.invalidateCostModelingDecisions();
7795 }
7796
7797 ElementCount MaxVF = MaybeMaxVF.getValue();
7798 assert(MaxVF.isNonZero() && "MaxVF is zero.");
28: '?' condition is true
7799
7800 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
7801 if (!UserVF.isZero() &&
7802 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
28.1: 'UserVFIsLegal' is true
7803 // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
7804 // VFs here, this should be reverted to only use legal UserVFs once the
7805 // loop below supports scalable VFs.
7806 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
28.2: 'UserVFIsLegal' is true
29: '?' condition is true
7807 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7808 << " VF " << VF << ".\n");
30: Assuming 'DebugFlag' is false
31: Loop condition is false. Exiting loop
7809 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7810 "VF needs to be a power of two");
32: '?' condition is true
7811 // Collect the instructions (and their associated costs) that will be more
7812 // profitable to scalarize.
7813 CM.selectUserVectorizationFactor(VF);
7814 CM.collectInLoopReductions();
7815 buildVPlansWithVPRecipes(VF, VF);
33: Calling 'LoopVectorizationPlanner::buildVPlansWithVPRecipes'
7816 LLVM_DEBUG(printPlans(dbgs()));
7817 return {{VF, 0}};
7818 }
7819
7820 assert(!MaxVF.isScalable() &&
7821 "Scalable vectors not yet supported beyond this point");
7822
7823 for (ElementCount VF = ElementCount::getFixed(1);
7824 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7825 // Collect Uniform and Scalar instructions after vectorization with VF.
7826 CM.collectUniformsAndScalars(VF);
7827
7828 // Collect the instructions (and their associated costs) that will be more
7829 // profitable to scalarize.
7830 if (VF.isVector())
7831 CM.collectInstsToScalarize(VF);
7832 }
7833
7834 CM.collectInLoopReductions();
7835
7836 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7837 LLVM_DEBUG(printPlans(dbgs()));
7838 if (MaxVF.isScalar())
7839 return VectorizationFactor::Disabled();
7840
7841 // Select the optimal vectorization factor.
7842 auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
7843
7844 // Check if it is profitable to vectorize with runtime checks.
7845 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7846 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7847 bool PragmaThresholdReached =
7848 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7849 bool ThresholdReached =
7850 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7851 if ((ThresholdReached && !Hints.allowReordering()) ||
7852 PragmaThresholdReached) {
7853 ORE->emit([&]() {
7854 return OptimizationRemarkAnalysisAliasing(
7855 DEBUG_TYPE"loop-vectorize", "CantReorderMemOps", OrigLoop->getStartLoc(),
7856 OrigLoop->getHeader())
7857 << "loop not vectorized: cannot prove it is safe to reorder "
7858 "memory operations";
7859 });
7860 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7861 Hints.emitRemarkWithHints();
7862 return VectorizationFactor::Disabled();
7863 }
7864 }
7865 return SelectedVF;
7866}
7867
7868void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7869 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7870 << '\n');
7871 BestVF = VF;
7872 BestUF = UF;
7873
7874 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7875 return !Plan->hasVF(VF);
7876 });
7877 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7878}
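Aside, not part of LoopVectorize.cpp: setBestPlan keeps only the plan that can execute the chosen VF. A minimal sketch of that filtering with a plain std::vector and the erase/remove_if idiom (llvm::erase_if does the equivalent); the PlanSketch type is hypothetical.

#include <algorithm>
#include <vector>
#include <cstdio>

struct PlanSketch {            // stand-in for a VPlan: just the VFs it covers
  std::vector<unsigned> VFs;
  bool hasVF(unsigned VF) const {
    return std::find(VFs.begin(), VFs.end(), VF) != VFs.end();
  }
};

int main() {
  std::vector<PlanSketch> Plans = {{{1, 2}}, {{4, 8}}};
  unsigned BestVF = 4;
  // Drop every plan that cannot execute BestVF.
  Plans.erase(std::remove_if(Plans.begin(), Plans.end(),
                             [BestVF](const PlanSketch &P) {
                               return !P.hasVF(BestVF);
                             }),
              Plans.end());
  std::printf("plans left: %zu\n", Plans.size()); // prints "plans left: 1"
}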
7879
7880void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7881 DominatorTree *DT) {
7882 // Perform the actual loop transformation.
7883
7884 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7885 assert(BestVF.hasValue() && "Vectorization Factor is missing");
7886 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7887
7888 VPTransformState State{
7889 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7890 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7891 State.TripCount = ILV.getOrCreateTripCount(nullptr);
7892 State.CanonicalIV = ILV.Induction;
7893
7894 ILV.printDebugTracesAtStart();
7895
7896 //===------------------------------------------------===//
7897 //
7898 // Notice: any optimization or new instruction that goes
7899 // into the code below should also be implemented in
7900 // the cost-model.
7901 //
7902 //===------------------------------------------------===//
7903
7904 // 2. Copy and widen instructions from the old loop into the new loop.
7905 VPlans.front()->execute(&State);
7906
7907 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7908 // predication, updating analyses.
7909 ILV.fixVectorizedLoop(State);
7910
7911 ILV.printDebugTracesAtEnd();
7912}
7913
7914#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7915void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7916 for (const auto &Plan : VPlans)
7917 if (PrintVPlansInDotFormat)
7918 Plan->printDOT(O);
7919 else
7920 Plan->print(O);
7921}
7922#endif
7923
7924void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7925 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7926
7927 // We create new control-flow for the vectorized loop, so the original exit
7928 // conditions will be dead after vectorization if they are only used by the
7929 // terminator.
7930 SmallVector<BasicBlock*> ExitingBlocks;
7931 OrigLoop->getExitingBlocks(ExitingBlocks);
7932 for (auto *BB : ExitingBlocks) {
7933 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7934 if (!Cmp || !Cmp->hasOneUse())
7935 continue;
7936
7937 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7938 if (!DeadInstructions.insert(Cmp).second)
7939 continue;
7940
7941 // An operand of the icmp is often a dead trunc, used by IndUpdate.
7942 // TODO: can recurse through operands in general
7943 for (Value *Op : Cmp->operands()) {
7944 if (isa<TruncInst>(Op) && Op->hasOneUse())
7945 DeadInstructions.insert(cast<Instruction>(Op));
7946 }
7947 }
7948
7949 // We create new "steps" for induction variable updates to which the original
7950 // induction variables map. An original update instruction will be dead if
7951 // all its users except the induction variable are dead.
7952 auto *Latch = OrigLoop->getLoopLatch();
7953 for (auto &Induction : Legal->getInductionVars()) {
7954 PHINode *Ind = Induction.first;
7955 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7956
7957 // If the tail is to be folded by masking, the primary induction variable,
7958 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7959 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7960 continue;
7961
7962 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7963 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7964 }))
7965 DeadInstructions.insert(IndUpdate);
7966
7967 // We record as "Dead" also the type-casting instructions we had identified
7968 // during induction analysis. We don't need any handling for them in the
7969 // vectorized loop because we have proven that, under a proper runtime
7970 // test guarding the vectorized loop, the value of the phi, and the casted
7971 // value of the phi, are the same. The last instruction in this casting chain
7972 // will get its scalar/vector/widened def from the scalar/vector/widened def
7973 // of the respective phi node. Any other casts in the induction def-use chain
7974 // have no other uses outside the phi update chain, and will be ignored.
7975 InductionDescriptor &IndDes = Induction.second;
7976 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7977 DeadInstructions.insert(Casts.begin(), Casts.end());
7978 }
7979}
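Aside, not part of LoopVectorize.cpp: a minimal sketch of the "all remaining users are already dead" test used above for an induction update, with plain std::set and made-up instruction names.

#include <set>
#include <string>
#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical users of the induction update "i.next": the phi and the compare.
  std::set<std::string> Dead  = {"icmp"};          // already known dead
  std::set<std::string> Users = {"phi.i", "icmp"};
  std::string Ind = "phi.i";

  bool UpdateIsDead = std::all_of(Users.begin(), Users.end(),
                                  [&](const std::string &U) {
                                    return U == Ind || Dead.count(U) != 0;
                                  });
  if (UpdateIsDead)
    Dead.insert("i.next");
  std::printf("i.next dead: %s\n", UpdateIsDead ? "yes" : "no");
}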
7980
7981Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7982
7983Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7984
7985Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7986 Instruction::BinaryOps BinOp) {
7987 // When unrolling and the VF is 1, we only need to add a simple scalar.
7988 Type *Ty = Val->getType();
7989 assert(!Ty->isVectorTy() && "Val must be a scalar");
7990
7991 if (Ty->isFloatingPointTy()) {
7992 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7993
7994 // Floating-point operations inherit FMF via the builder's flags.
7995 Value *MulOp = Builder.CreateFMul(C, Step);
7996 return Builder.CreateBinOp(BinOp, Val, MulOp);
7997 }
7998 Constant *C = ConstantInt::get(Ty, StartIdx);
7999 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8000}
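Aside, not part of LoopVectorize.cpp: for the unroller (VF = 1) the "step vector" above degenerates to Val + StartIdx * Step, one scalar per unrolled part. A tiny sketch with hypothetical values:

#include <cstdio>

// Scalar analogue of the unrolled step computation: one value per part.
static long stepValue(long Val, int StartIdx, long Step) {
  return Val + StartIdx * Step;
}

int main() {
  long Val = 100, Step = 1;
  for (int Part = 0; Part < 4; ++Part)            // UF == 4 in this sketch
    std::printf("part %d -> %ld\n", Part, stepValue(Val, Part, Step));
  // prints 100, 101, 102, 103
}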
8001
8002static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8003 SmallVector<Metadata *, 4> MDs;
8004 // Reserve first location for self reference to the LoopID metadata node.
8005 MDs.push_back(nullptr);
8006 bool IsUnrollMetadata = false;
8007 MDNode *LoopID = L->getLoopID();
8008 if (LoopID) {
8009 // First find existing loop unrolling disable metadata.
8010 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8011 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8012 if (MD) {
8013 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8014 IsUnrollMetadata =
8015 S && S->getString().startswith("llvm.loop.unroll.disable");
8016 }
8017 MDs.push_back(LoopID->getOperand(i));
8018 }
8019 }
8020
8021 if (!IsUnrollMetadata) {
8022 // Add runtime unroll disable metadata.
8023 LLVMContext &Context = L->getHeader()->getContext();
8024 SmallVector<Metadata *, 1> DisableOperands;
8025 DisableOperands.push_back(
8026 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8027 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8028 MDs.push_back(DisableNode);
8029 MDNode *NewLoopID = MDNode::get(Context, MDs);
8030 // Set operand 0 to refer to the loop id itself.
8031 NewLoopID->replaceOperandWith(0, NewLoopID);
8032 L->setLoopID(NewLoopID);
8033 }
8034}
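Aside, not part of LoopVectorize.cpp: when no unroll metadata was present, the function above leaves the loop with !llvm.loop metadata roughly of the following shape (an illustrative rendering of the usual llvm.loop encoding, not text taken from this report). Operand 0 of the loop-ID node is the required self-reference; the added operand carries the runtime-unroll-disable string.

  br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0
  ...
  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.unroll.runtime.disable"}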
8035
8036//===--------------------------------------------------------------------===//
8037// EpilogueVectorizerMainLoop
8038//===--------------------------------------------------------------------===//
8039
8040/// This function is partially responsible for generating the control flow
8041/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8042BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8043 MDNode *OrigLoopID = OrigLoop->getLoopID();
8044 Loop *Lp = createVectorLoopSkeleton("");
8045
8046 // Generate the code to check the minimum iteration count of the vector
8047 // epilogue (see below).
8048 EPI.EpilogueIterationCountCheck =
8049 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8050 EPI.EpilogueIterationCountCheck->setName("iter.check");
8051
8052 // Generate the code to check any assumptions that we've made for SCEV
8053 // expressions.
8054 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8055
8056 // Generate the code that checks at runtime if arrays overlap. We put the
8057 // checks into a separate block to make the more common case of few elements
8058 // faster.
8059 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8060
8061 // Generate the iteration count check for the main loop, *after* the check
8062 // for the epilogue loop, so that the path-length is shorter for the case
8063 // that goes directly through the vector epilogue. The longer path length for
8064 // the main loop is compensated for by the gain from vectorizing the larger
8065 // trip count. Note: the branch will get updated later on when we vectorize
8066 // the epilogue.
8067 EPI.MainLoopIterationCountCheck =
8068 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8069
8070 // Generate the induction variable.
8071 OldInduction = Legal->getPrimaryInduction();
8072 Type *IdxTy = Legal->getWidestInductionType();
8073 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8074 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8075 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8076 EPI.VectorTripCount = CountRoundDown;
8077 Induction =
8078 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8079 getDebugLocFromInstOrOperands(OldInduction));
8080
8081 // Skip induction resume value creation here because they will be created in
8082 // the second pass. If we created them here, they wouldn't be used anyway,
8083 // because the vplan in the second pass still contains the inductions from the
8084 // original loop.
8085
8086 return completeLoopSkeleton(Lp, OrigLoopID);
8087}
8088
8089void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8090 LLVM_DEBUG({
8091 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8092 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8093 << ", Main Loop UF:" << EPI.MainLoopUF
8094 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8095 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8096 });
8097}
8098
8099void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8100 DEBUG_WITH_TYPE(VerboseDebug, {
8101 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8102 });
8103}
8104
8105BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8106 Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8107 assert(L && "Expected valid Loop.");
8108 assert(Bypass && "Expected valid bypass basic block.");
8109 unsigned VFactor =
8110 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8111 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8112 Value *Count = getOrCreateTripCount(L);
8113 // Reuse existing vector loop preheader for TC checks.
8114 // Note that new preheader block is generated for vector loop.
8115 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8116 IRBuilder<> Builder(TCCheckBlock->getTerminator());
8117
8118 // Generate code to check if the loop's trip count is less than VF * UF of the
8119 // main vector loop.
8120 auto P =
8121 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8122
8123 Value *CheckMinIters = Builder.CreateICmp(
8124 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8125 "min.iters.check");
8126
8127 if (!ForEpilogue)
8128 TCCheckBlock->setName("vector.main.loop.iter.check");
8129
8130 // Create new preheader for vector loop.
8131 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8132 DT, LI, nullptr, "vector.ph");
8133
8134 if (ForEpilogue) {
8135 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8136 DT->getNode(Bypass)->getIDom()) &&
8137 "TC check is expected to dominate Bypass");
8138
8139 // Update dominator for Bypass & LoopExit.
8140 DT->changeImmediateDominator(Bypass, TCCheckBlock);
8141 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8142
8143 LoopBypassBlocks.push_back(TCCheckBlock);
8144
8145 // Save the trip count so we don't have to regenerate it in the
8146 // vec.epilog.iter.check. This is safe to do because the trip count
8147 // generated here dominates the vector epilog iter check.
8148 EPI.TripCount = Count;
8149 }
8150
8151 ReplaceInstWithInst(
8152 TCCheckBlock->getTerminator(),
8153 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8154
8155 return TCCheckBlock;
8156}
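Aside, not part of LoopVectorize.cpp: the check emitted above guards the main vector loop. With, say, a main-loop VF of 8 and UF of 2, any trip count below 16 branches straight to the bypass block. A minimal sketch of the predicate with hypothetical numbers (ignoring the ULE variant used when a scalar epilogue is required):

#include <cstdio>
#include <cstdint>

int main() {
  const uint64_t VF = 8, UF = 2;                 // hypothetical main-loop factors
  for (uint64_t TripCount : {5, 16, 100}) {
    bool TakeBypass = TripCount < VF * UF;       // the "min.iters.check" condition
    std::printf("TC=%llu -> %s\n", (unsigned long long)TripCount,
                TakeBypass ? "bypass (too few iterations)" : "vector main loop");
  }
}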
8157
8158//===--------------------------------------------------------------------===//
8159// EpilogueVectorizerEpilogueLoop
8160//===--------------------------------------------------------------------===//
8161
8162/// This function is partially responsible for generating the control flow
8163/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8164BasicBlock *
8165EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8166 MDNode *OrigLoopID = OrigLoop->getLoopID();
8167 Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8168
8169 // Now, compare the remaining count and, if there aren't enough iterations to
8170 // execute the vectorized epilogue, skip to the scalar part.
8171 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8172 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8173 LoopVectorPreHeader =
8174 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8175 LI, nullptr, "vec.epilog.ph");
8176 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8177 VecEpilogueIterationCountCheck);
8178
8179 // Adjust the control flow taking the state info from the main loop
8180 // vectorization into account.
8181 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8182 "expected this to be saved from the previous pass.");
8183 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8184 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8185
8186 DT->changeImmediateDominator(LoopVectorPreHeader,
8187 EPI.MainLoopIterationCountCheck);
8188
8189 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8190 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8191
8192 if (EPI.SCEVSafetyCheck)
8193 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8194 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8195 if (EPI.MemSafetyCheck)
8196 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8197 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8198
8199 DT->changeImmediateDominator(
8200 VecEpilogueIterationCountCheck,
8201 VecEpilogueIterationCountCheck->getSinglePredecessor());
8202
8203 DT->changeImmediateDominator(LoopScalarPreHeader,
8204 EPI.EpilogueIterationCountCheck);
8205 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8206
8207 // Keep track of bypass blocks, as they feed start values to the induction
8208 // phis in the scalar loop preheader.
8209 if (EPI.SCEVSafetyCheck)
8210 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8211 if (EPI.MemSafetyCheck)
8212 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8213 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8214
8215 // Generate a resume induction for the vector epilogue and put it in the
8216 // vector epilogue preheader
8217 Type *IdxTy = Legal->getWidestInductionType();
8218 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8219 LoopVectorPreHeader->getFirstNonPHI());
8220 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8221 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8222 EPI.MainLoopIterationCountCheck);
8223
8224 // Generate the induction variable.
8225 OldInduction = Legal->getPrimaryInduction();
8226 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8227 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8228 Value *StartIdx = EPResumeVal;
8229 Induction =
8230 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8231 getDebugLocFromInstOrOperands(OldInduction));
8232
8233 // Generate induction resume values. These variables save the new starting
8234 // indexes for the scalar loop. They are used to test if there are any tail
8235 // iterations left once the vector loop has completed.
8236 // Note that when the vectorized epilogue is skipped due to iteration count
8237 // check, then the resume value for the induction variable comes from
8238 // the trip count of the main vector loop, hence passing the AdditionalBypass
8239 // argument.
8240 createInductionResumeValues(Lp, CountRoundDown,
8241 {VecEpilogueIterationCountCheck,
8242 EPI.VectorTripCount} /* AdditionalBypass */);
8243
8244 AddRuntimeUnrollDisableMetaData(Lp);
8245 return completeLoopSkeleton(Lp, OrigLoopID);
8246}
8247
8248BasicBlock *
8249EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8250 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8251
8252 assert(EPI.TripCount &&
8253 "Expected trip count to have been safed in the first pass.");
8254 assert(
8255 (!isa<Instruction>(EPI.TripCount) ||
8256 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8257 "saved trip count does not dominate insertion point.");
8258 Value *TC = EPI.TripCount;
8259 IRBuilder<> Builder(Insert->getTerminator());
8260 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8261
8262 // Generate code to check if the loop's trip count is less than VF * UF of the
8263 // vector epilogue loop.
8264 auto P =
8265 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8266
8267 Value *CheckMinIters = Builder.CreateICmp(
8268 P, Count,
8269 ConstantInt::get(Count->getType(),
8270 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8271 "min.epilog.iters.check");
8272
8273 ReplaceInstWithInst(
8274 Insert->getTerminator(),
8275 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8276
8277 LoopBypassBlocks.push_back(Insert);
8278 return Insert;
8279}
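Aside, not part of LoopVectorize.cpp: the epilogue check above works on what the main vector loop leaves behind: remaining = trip count minus the main loop's vector trip count, and if that remainder is smaller than the epilogue's VF * UF the epilogue is skipped. A sketch with made-up numbers (again ignoring the ULE variant used when a scalar epilogue is required):

#include <cstdio>
#include <cstdint>

int main() {
  // Hypothetical numbers: main loop VF*UF = 16, epilogue VF*UF = 4.
  uint64_t TripCount = 103;
  uint64_t VectorTripCount = (TripCount / 16) * 16; // 96 iterations in the main loop
  uint64_t Remaining = TripCount - VectorTripCount; // "n.vec.remaining" == 7
  bool SkipEpilogue = Remaining < 4;                // "min.epilog.iters.check"
  std::printf("remaining=%llu, %s\n", (unsigned long long)Remaining,
              SkipEpilogue ? "straight to scalar loop" : "run vector epilogue");
}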
8280
8281void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8282 LLVM_DEBUG({
8283 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8284 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8285 << ", Main Loop UF:" << EPI.MainLoopUF
8286 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8287 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8288 });
8289}
8290
8291void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8292 DEBUG_WITH_TYPE(VerboseDebug, {
8293 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8294 });
8295}
8296
8297bool LoopVectorizationPlanner::getDecisionAndClampRange(
8298 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8299 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8300 bool PredicateAtRangeStart = Predicate(Range.Start);
8301
8302 for (ElementCount TmpVF = Range.Start * 2;
8303 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8304 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8305 Range.End = TmpVF;
8306 break;
8307 }
8308
8309 return PredicateAtRangeStart;
8310}
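Aside, not part of LoopVectorize.cpp: a minimal sketch of the range-clamping logic above with plain unsigned VFs. The predicate is evaluated at the start of the range, and Range.End is shrunk to the first power-of-two VF at which the answer flips, so one decision holds uniformly over [Start, End).

#include <functional>
#include <cstdio>

struct RangeSketch { unsigned Start, End; };     // VFs in [Start, End), powers of two

static bool clampRange(const std::function<bool(unsigned)> &Predicate,
                       RangeSketch &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF;                            // decision stays uniform below VF
      break;
    }
  return AtStart;
}

int main() {
  RangeSketch R = {1, 17};                       // candidate VFs 1, 2, 4, 8, 16
  bool AtStart = clampRange([](unsigned VF) { return VF >= 8; }, R);
  std::printf("predicate at start: %d, clamped end: %u\n", AtStart, R.End); // 0, 8
}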
8311
8312/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8313/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8314/// of VF's starting at a given VF and extending it as much as possible. Each
8315/// vectorization decision can potentially shorten this sub-range during
8316/// buildVPlan().
8317void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8318 ElementCount MaxVF) {
8319 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8320 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8321 VFRange SubRange = {VF, MaxVFPlusOne};
8322 VPlans.push_back(buildVPlan(SubRange));
8323 VF = SubRange.End;
8324 }
8325}
8326
8327VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8328 VPlanPtr &Plan) {
8329 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8330
8331 // Look for cached value.
8332 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8333 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8334 if (ECEntryIt != EdgeMaskCache.end())
8335 return ECEntryIt->second;
8336
8337 VPValue *SrcMask = createBlockInMask(Src, Plan);
8338
8339 // The terminator has to be a branch inst!
8340 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8341 assert(BI && "Unexpected terminator found");
8342
8343 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8344 return EdgeMaskCache[Edge] = SrcMask;
8345
8346 // If source is an exiting block, we know the exit edge is dynamically dead
8347 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8348 // adding uses of an otherwise potentially dead instruction.
8349 if (OrigLoop->isLoopExiting(Src))
8350 return EdgeMaskCache[Edge] = SrcMask;
8351
8352 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8353 assert(EdgeMask && "No Edge Mask found for condition");
8354
8355 if (BI->getSuccessor(0) != Dst)
8356 EdgeMask = Builder.createNot(EdgeMask);
8357
8358 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8359 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8360 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8361 // The select version does not introduce new UB if SrcMask is false and
8362 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8363 VPValue *False = Plan->getOrAddVPValue(
8364 ConstantInt::getFalse(BI->getCondition()->getType()));
8365 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8366 }
8367
8368 return EdgeMaskCache[Edge] = EdgeMask;
8369}
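Aside, not part of LoopVectorize.cpp: a per-lane view of the select-based combination above. The branch condition only means something in lanes where the source block actually executes, so combining it with the source mask keeps all other lanes inactive; the lane values below are hypothetical.

#include <vector>
#include <cstdio>

int main() {
  // Lanes where the source block runs, and the branch condition per lane.
  std::vector<bool> SrcMask  = {true, true, false, false};
  std::vector<bool> CondMask = {true, false, true, false};

  std::vector<bool> EdgeMask(4);
  for (size_t L = 0; L < 4; ++L)
    // select(SrcMask, CondMask, false): lanes outside SrcMask stay inactive.
    EdgeMask[L] = SrcMask[L] ? static_cast<bool>(CondMask[L]) : false;

  for (bool B : EdgeMask)
    std::printf("%d ", B ? 1 : 0);   // prints "1 0 0 0"
  std::printf("\n");
}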
8370
8371VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8372 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8373
8374 // Look for cached value.
8375 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8376 if (BCEntryIt != BlockMaskCache.end())
8377 return BCEntryIt->second;
8378
8379 // All-one mask is modelled as no-mask following the convention for masked
8380 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8381 VPValue *BlockMask = nullptr;
8382
8383 if (OrigLoop->getHeader() == BB) {
8384 if (!CM.blockNeedsPredication(BB))
8385 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8386
8387 // Create the block in mask as the first non-phi instruction in the block.
8388 VPBuilder::InsertPointGuard Guard(Builder);
8389 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8390 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8391
8392 // Introduce the early-exit compare IV <= BTC to form header block mask.
8393 // This is used instead of IV < TC because TC may wrap, unlike BTC.
8394 // Start by constructing the desired canonical IV.
8395 VPValue *IV = nullptr;
8396 if (Legal->getPrimaryInduction())
8397 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8398 else {
8399 auto IVRecipe = new VPWidenCanonicalIVRecipe();
8400 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8401 IV = IVRecipe->getVPValue();
8402 }
8403 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8404 bool TailFolded = !CM.isScalarEpilogueAllowed();
8405
8406 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8407 // While ActiveLaneMask is a binary op that consumes the loop tripcount
8408 // as a second argument, we only pass the IV here and extract the
8409 // tripcount from the transform state where codegen of the VP instructions
8410 // happen.
8411 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8412 } else {
8413 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8414 }
8415 return BlockMaskCache[BB] = BlockMask;
8416 }
8417
8418 // This is the block mask. We OR all incoming edges.
8419 for (auto *Predecessor : predecessors(BB)) {
8420 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8421 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8422 return BlockMaskCache[BB] = EdgeMask;
8423
8424 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8425 BlockMask = EdgeMask;
8426 continue;
8427 }
8428
8429 BlockMask = Builder.createOr(BlockMask, EdgeMask);
8430 }
8431
8432 return BlockMaskCache[BB] = BlockMask;
8433}
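Aside, not part of LoopVectorize.cpp: for a tail-folded header the mask above compares the per-lane IV against the backedge-taken count (BTC = trip count - 1) rather than the trip count itself, which avoids trouble when TC could wrap. A worked example with hypothetical TC = 10 and VF = 4, looking at the last vector iteration:

#include <cstdio>
#include <cstdint>

int main() {
  const uint64_t TC = 10, VF = 4;
  const uint64_t BTC = TC - 1;                   // backedge-taken count == 9
  for (uint64_t Lane = 0; Lane < VF; ++Lane) {
    uint64_t IV = 8 + Lane;                      // lanes 8..11 of the last iteration
    bool Active = IV <= BTC;                     // the "ICmpULE {IV, BTC}" lane mask
    std::printf("lane IV=%llu active=%d\n", (unsigned long long)IV, Active ? 1 : 0);
  }
  // lanes 8 and 9 are active, 10 and 11 are masked off
}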
8434
8435VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8436 ArrayRef<VPValue *> Operands,
8437 VFRange &Range,
8438 VPlanPtr &Plan) {
8439 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8440 "Must be called with either a load or store");
8441
8442 auto willWiden = [&](ElementCount VF) -> bool {
8443 if (VF.isScalar())
8444 return false;
8445 LoopVectorizationCostModel::InstWidening Decision =
8446 CM.getWideningDecision(I, VF);
8447 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8448 "CM decision should be taken at this point.");
8449 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8450 return true;
8451 if (CM.isScalarAfterVectorization(I, VF) ||
8452 CM.isProfitableToScalarize(I, VF))
8453 return false;
8454 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8455 };
8456
8457 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8458 return nullptr;
8459
8460 VPValue *Mask = nullptr;
8461 if (Legal->isMaskRequired(I))
8462 Mask = createBlockInMask(I->getParent(), Plan);
8463
8464 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8465 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
8466
8467 StoreInst *Store = cast<StoreInst>(I);
8468 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8469 Mask);
8470}
8471
8472VPWidenIntOrFpInductionRecipe *
8473VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8474 ArrayRef<VPValue *> Operands) const {
8475 // Check if this is an integer or fp induction. If so, build the recipe that
8476 // produces its scalar and vector values.
8477 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8478 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8479 II.getKind() == InductionDescriptor::IK_FpInduction) {
8480 assert(II.getStartValue() ==
8481 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8482 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8483 return new VPWidenIntOrFpInductionRecipe(
8484 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8485 }
8486
8487 return nullptr;
8488}
8489
8490VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8491 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8492 VPlan &Plan) const {
8493 // Optimize the special case where the source is a constant integer
8494 // induction variable. Notice that we can only optimize the 'trunc' case
8495 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8496 // (c) other casts depend on pointer size.
8497
8498 // Determine whether \p K is a truncation based on an induction variable that
8499 // can be optimized.
8500 auto isOptimizableIVTruncate =
8501 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8502 return [=](ElementCount VF) -> bool {
8503 return CM.isOptimizableIVTruncate(K, VF);
8504 };
8505 };
8506
8507 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8508 isOptimizableIVTruncate(I), Range)) {
8509
8510 InductionDescriptor II =
8511 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8512 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8513 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8514 Start, nullptr, I);
8515 }
8516 return nullptr;
8517}
8518
8519VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8520 ArrayRef<VPValue *> Operands,
8521 VPlanPtr &Plan) {
8522 // If all incoming values are equal, the incoming VPValue can be used directly
8523 // instead of creating a new VPBlendRecipe.
8524 VPValue *FirstIncoming = Operands[0];
8525 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8526 return FirstIncoming == Inc;
8527 })) {
8528 return Operands[0];
8529 }
8530
8531 // We know that all PHIs in non-header blocks are converted into selects, so
8532 // we don't have to worry about the insertion order and we can just use the
8533 // builder. At this point we generate the predication tree. There may be
8534 // duplications since this is a simple recursive scan, but future
8535 // optimizations will clean it up.
8536 SmallVector<VPValue *, 2> OperandsWithMask;
8537 unsigned NumIncoming = Phi->getNumIncomingValues();
8538
8539 for (unsigned In = 0; In < NumIncoming; In++) {
8540 VPValue *EdgeMask =
8541 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8542     assert((EdgeMask || NumIncoming == 1) &&
8543            "Multiple predecessors with one having a full mask");
8544 OperandsWithMask.push_back(Operands[In]);
8545 if (EdgeMask)
8546 OperandsWithMask.push_back(EdgeMask);
8547 }
8548 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8549}
8550
8551VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8552 ArrayRef<VPValue *> Operands,
8553 VFRange &Range) const {
8554
8555 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8556 [this, CI](ElementCount VF) {
8557 return CM.isScalarWithPredication(CI, VF);
8558 },
8559 Range);
8560
8561 if (IsPredicated)
8562 return nullptr;
8563
8564 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8565 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8566 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8567 ID == Intrinsic::pseudoprobe ||
8568 ID == Intrinsic::experimental_noalias_scope_decl))
8569 return nullptr;
8570
8571 auto willWiden = [&](ElementCount VF) -> bool {
8572 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8573 // The following case may be scalarized depending on the VF.
8574 // The flag shows whether we use Intrinsic or a usual Call for vectorized
8575 // version of the instruction.
8576 // Is it beneficial to perform intrinsic call compared to lib call?
8577 bool NeedToScalarize = false;
8578 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8579 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8580 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8581     assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
8582            "Either the intrinsic cost or vector call cost must be valid");
8583 return UseVectorIntrinsic || !NeedToScalarize;
8584 };
8585
8586 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8587 return nullptr;
8588
8589 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
8590 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8591}
8592
8593bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8594   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8595          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8596 // Instruction should be widened, unless it is scalar after vectorization,
8597 // scalarization is profitable or it is predicated.
8598 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8599 return CM.isScalarAfterVectorization(I, VF) ||
8600 CM.isProfitableToScalarize(I, VF) ||
8601 CM.isScalarWithPredication(I, VF);
8602 };
8603 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8604 Range);
8605}
8606
8607VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8608 ArrayRef<VPValue *> Operands) const {
8609 auto IsVectorizableOpcode = [](unsigned Opcode) {
8610 switch (Opcode) {
8611 case Instruction::Add:
8612 case Instruction::And:
8613 case Instruction::AShr:
8614 case Instruction::BitCast:
8615 case Instruction::FAdd:
8616 case Instruction::FCmp:
8617 case Instruction::FDiv:
8618 case Instruction::FMul:
8619 case Instruction::FNeg:
8620 case Instruction::FPExt:
8621 case Instruction::FPToSI:
8622 case Instruction::FPToUI:
8623 case Instruction::FPTrunc:
8624 case Instruction::FRem:
8625 case Instruction::FSub:
8626 case Instruction::ICmp:
8627 case Instruction::IntToPtr:
8628 case Instruction::LShr:
8629 case Instruction::Mul:
8630 case Instruction::Or:
8631 case Instruction::PtrToInt:
8632 case Instruction::SDiv:
8633 case Instruction::Select:
8634 case Instruction::SExt:
8635 case Instruction::Shl:
8636 case Instruction::SIToFP:
8637 case Instruction::SRem:
8638 case Instruction::Sub:
8639 case Instruction::Trunc:
8640 case Instruction::UDiv:
8641 case Instruction::UIToFP:
8642 case Instruction::URem:
8643 case Instruction::Xor:
8644 case Instruction::ZExt:
8645 return true;
8646 }
8647 return false;
8648 };
8649
8650 if (!IsVectorizableOpcode(I->getOpcode()))
8651 return nullptr;
8652
8653 // Success: widen this instruction.
8654 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8655}
8656
8657VPBasicBlock *VPRecipeBuilder::handleReplication(
8658 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8659 VPlanPtr &Plan) {
8660 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8661 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8662 Range);
8663
8664 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8665 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8666 Range);
8667
8668 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8669 IsUniform, IsPredicated);
8670 setRecipe(I, Recipe);
8671 Plan->addVPValue(I, Recipe);
8672
8673 // Find if I uses a predicated instruction. If so, it will use its scalar
8674 // value. Avoid hoisting the insert-element which packs the scalar value into
8675 // a vector value, as that happens iff all users use the vector value.
8676 for (VPValue *Op : Recipe->operands()) {
45
Assuming '__begin1' is not equal to '__end1'
8677 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
46
Assuming the object is a 'VPPredInstPHIRecipe'
8678     if (!PredR)
46.1
'PredR' is non-null
47
Taking false branch
8679 continue;
8680 auto *RepR =
49
'RepR' initialized to a null pointer value
8681 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
48
Assuming null pointer is passed into cast
8682     assert(RepR->isPredicated() &&
50
Called C++ object pointer is null
8683            "expected Replicate recipe to be predicated");
8684 RepR->setAlsoPack(false);
8685 }
8686
8687 // Finalize the recipe for Instr, first if it is not predicated.
8688 if (!IsPredicated) {
8689     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8690 VPBB->appendRecipe(Recipe);
8691 return VPBB;
8692 }
8693   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8694   assert(VPBB->getSuccessors().empty() &&
8695          "VPBB has successors when handling predicated replication.");
8696 // Record predicated instructions for above packing optimizations.
8697 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8698 VPBlockUtils::insertBlockAfter(Region, VPBB);
8699 auto *RegSucc = new VPBasicBlock();
8700 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8701 return RegSucc;
8702}
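
The analyzer's path through handleReplication (steps 45-50 above) is the crux of the reported warning: dyn_cast_or_null yields a non-null VPPredInstPHIRecipe at line 8677, but cast_or_null at line 8681 returns a null RepR when the PHI's first operand has no defining VPReplicateRecipe, and the assert at line 8682 then calls RepR->isPredicated() on that null pointer. The snippet below is only an illustrative sketch, not the analyzed source and not a proposed upstream patch: it reuses the names from the listing (Recipe, VPPredInstPHIRecipe, VPReplicateRecipe) and deliberately skips rather than asserts when RepR is null, which changes behavior relative to the original assert but keeps the dereference guarded on the path the analyzer describes.

  // Hypothetical defensive variant of the loop at lines 8676-8685 (sketch only).
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    // dyn_cast_or_null<> returns nullptr both when its argument is null and
    // when the cast fails, so the result is checked before it is dereferenced.
    auto *RepR =
        dyn_cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    if (!RepR || !RepR->isPredicated())
      continue;
    RepR->setAlsoPack(false);
  }
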
8703
8704VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8705 VPRecipeBase *PredRecipe,
8706 VPlanPtr &Plan) {
8707 // Instructions marked for predication are replicated and placed under an
8708 // if-then construct to prevent side-effects.
8709
8710 // Generate recipes to compute the block mask for this region.
8711 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8712
8713 // Build the triangular if-then region.
8714 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8715   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8716 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8717 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8718 auto *PHIRecipe = Instr->getType()->isVoidTy()
8719 ? nullptr
8720 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8721 if (PHIRecipe) {
8722 Plan->removeVPValueFor(Instr);
8723 Plan->addVPValue(Instr, PHIRecipe);
8724 }
8725 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8726 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8727 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8728
8729 // Note: first set Entry as region entry and then connect successors starting
8730 // from it in order, to propagate the "parent" of each VPBasicBlock.
8731 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8732 VPBlockUtils::connectBlocks(Pred, Exit);
8733
8734 return Region;
8735}
8736
8737VPRecipeOrVPValueTy
8738VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8739 ArrayRef<VPValue *> Operands,
8740 VFRange &Range, VPlanPtr &Plan) {
8741 // First, check for specific widening recipes that deal with calls, memory
8742 // operations, inductions and Phi nodes.
8743 if (auto *CI = dyn_cast<CallInst>(Instr))
8744 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8745
8746 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8747 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8748
8749 VPRecipeBase *Recipe;
8750 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8751 if (Phi->getParent() != OrigLoop->getHeader())
8752 return tryToBlend(Phi, Operands, Plan);
8753 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8754 return toVPRecipeResult(Recipe);
8755
8756 if (Legal->isReductionVariable(Phi)) {
8757 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8758       assert(RdxDesc.getRecurrenceStartValue() ==
8759              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8760 VPValue *StartV = Operands[0];
8761 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8762 }
8763
8764 return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8765 }
8766
8767 if (isa<TruncInst>(Instr) &&
8768 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8769 Range, *Plan)))
8770 return toVPRecipeResult(Recipe);
8771
8772 if (!shouldWiden(Instr, Range))
8773 return nullptr;
8774
8775 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8776 return toVPRecipeResult(new VPWidenGEPRecipe(
8777 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8778
8779 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8780 bool InvariantCond =
8781 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8782 return toVPRecipeResult(new VPWidenSelectRecipe(
8783 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8784 }
8785
8786 return toVPRecipeResult(tryToWiden(Instr, Operands));
8787}
8788
8789void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8790 ElementCount MaxVF) {
8791   assert(OrigLoop->isInnermost() && "Inner loop expected.");
34
Assuming the condition is true
35
'?' condition is true
8792
8793 // Collect instructions from the original loop that will become trivially dead
8794 // in the vectorized loop. We don't need to vectorize these instructions. For
8795 // example, original induction update instructions can become dead because we
8796 // separately emit induction "steps" when generating code for the new loop.
8797 // Similarly, we create a new latch condition when setting up the structure
8798 // of the new loop, so the old one can become dead.
8799 SmallPtrSet<Instruction *, 4> DeadInstructions;
8800 collectTriviallyDeadInstructions(DeadInstructions);
8801
8802 // Add assume instructions we need to drop to DeadInstructions, to prevent
8803 // them from being added to the VPlan.
8804   // TODO: We only need to drop assumes in blocks that get flattened. If the
8805 // control flow is preserved, we should keep them.
8806 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8807 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8808
8809 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8810 // Dead instructions do not need sinking. Remove them from SinkAfter.
8811 for (Instruction *I : DeadInstructions)
8812 SinkAfter.erase(I);
8813
8814 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8815 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
36
Loop condition is true. Entering loop body
8816 VFRange SubRange = {VF, MaxVFPlusOne};
8817 VPlans.push_back(
8818 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
37
Calling 'LoopVectorizationPlanner::buildVPlanWithVPRecipes'
8819 VF = SubRange.End;
8820 }
8821}
8822
8823VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8824 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8825 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8826
8827 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8828
8829 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8830
8831 // ---------------------------------------------------------------------------
8832 // Pre-construction: record ingredients whose recipes we'll need to further
8833 // process after constructing the initial VPlan.
8834 // ---------------------------------------------------------------------------
8835
8836 // Mark instructions we'll need to sink later and their targets as
8837 // ingredients whose recipe we'll need to record.
8838 for (auto &Entry : SinkAfter) {
8839 RecipeBuilder.recordRecipeOf(Entry.first);
8840 RecipeBuilder.recordRecipeOf(Entry.second);
8841 }
8842 for (auto &Reduction : CM.getInLoopReductionChains()) {
38
Assuming '__begin1' is equal to '__end1'
8843 PHINode *Phi = Reduction.first;
8844 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8845 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8846
8847 RecipeBuilder.recordRecipeOf(Phi);
8848 for (auto &R : ReductionOperations) {
8849 RecipeBuilder.recordRecipeOf(R);
8850       // For min/max reductions, where we have a pair of icmp/select, we also
8851 // need to record the ICmp recipe, so it can be removed later.
8852 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8853 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8854 }
8855 }
8856
8857 // For each interleave group which is relevant for this (possibly trimmed)
8858 // Range, add it to the set of groups to be later applied to the VPlan and add
8859 // placeholders for its members' Recipes which we'll be replacing with a
8860 // single VPInterleaveRecipe.
8861 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8862 auto applyIG = [IG, this](ElementCount VF) -> bool {
8863 return (VF.isVector() && // Query is illegal for VF == 1
8864 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8865 LoopVectorizationCostModel::CM_Interleave);
8866 };
8867 if (!getDecisionAndClampRange(applyIG, Range))
8868 continue;
8869 InterleaveGroups.insert(IG);
8870 for (unsigned i = 0; i < IG->getFactor(); i++)
8871 if (Instruction *Member = IG->getMember(i))
8872 RecipeBuilder.recordRecipeOf(Member);
8873 };
8874
8875 // ---------------------------------------------------------------------------
8876 // Build initial VPlan: Scan the body of the loop in a topological order to
8877 // visit each basic block after having visited its predecessor basic blocks.
8878 // ---------------------------------------------------------------------------
8879
8880 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8881 auto Plan = std::make_unique<VPlan>();
8882 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8883 Plan->setEntry(VPBB);
8884
8885 // Scan the body of the loop in a topological order to visit each basic block
8886 // after having visited its predecessor basic blocks.
8887 LoopBlocksDFS DFS(OrigLoop);
8888 DFS.perform(LI);
8889
8890 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8891 // Relevant instructions from basic block BB will be grouped into VPRecipe
8892 // ingredients and fill a new VPBasicBlock.
8893 unsigned VPBBsForBB = 0;
8894 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8895 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8896 VPBB = FirstVPBBForBB;
8897 Builder.setInsertPoint(VPBB);
8898
8899 // Introduce each ingredient into VPlan.
8900     // TODO: Model and preserve debug intrinsics in VPlan.
8901 for (Instruction &I : BB->instructionsWithoutDebug()) {
8902 Instruction *Instr = &I;
8903
8904 // First filter out irrelevant instructions, to ensure no recipes are
8905 // built for them.
8906 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
39
Assuming 'Instr' is not a 'BranchInst'
40
Assuming the condition is false
41
Taking false branch
8907 continue;
8908
8909 SmallVector<VPValue *, 4> Operands;
8910 auto *Phi = dyn_cast<PHINode>(Instr);
42
Assuming 'Instr' is not a 'PHINode'
8911       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
42.1
'Phi' is null
8912 Operands.push_back(Plan->getOrAddVPValue(
8913 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8914 } else {
8915 auto OpRange = Plan->mapToVPValues(Instr->operands());
8916 Operands = {OpRange.begin(), OpRange.end()};
8917 }
8918 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
43
Taking false branch
8919 Instr, Operands, Range, Plan)) {
8920 // If Instr can be simplified to an existing VPValue, use it.
8921 if (RecipeOrValue.is<VPValue *>()) {
8922 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8923 continue;
8924 }
8925 // Otherwise, add the new recipe.
8926 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8927 for (auto *Def : Recipe->definedValues()) {
8928 auto *UV = Def->getUnderlyingValue();
8929 Plan->addVPValue(UV, Def);
8930 }
8931
8932 RecipeBuilder.setRecipe(Instr, Recipe);
8933 VPBB->appendRecipe(Recipe);
8934 continue;
8935 }
8936
8937 // Otherwise, if all widening options failed, Instruction is to be
8938 // replicated. This may create a successor for VPBB.
8939 VPBasicBlock *NextVPBB =
8940 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
44
Calling 'VPRecipeBuilder::handleReplication'
8941 if (NextVPBB != VPBB) {
8942 VPBB = NextVPBB;
8943 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8944 : "");
8945 }
8946 }
8947 }
8948
8949 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8950 // may also be empty, such as the last one VPBB, reflecting original
8951 // basic-blocks with no recipes.
8952 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8953   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8954 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8955 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8956 delete PreEntry;
8957
8958 // ---------------------------------------------------------------------------
8959 // Transform initial VPlan: Apply previously taken decisions, in order, to
8960 // bring the VPlan to its final state.
8961 // ---------------------------------------------------------------------------
8962
8963 // Apply Sink-After legal constraints.
8964 for (auto &Entry : SinkAfter) {
8965 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8966 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8967 // If the target is in a replication region, make sure to move Sink to the
8968 // block after it, not into the replication region itself.
8969 if (auto *Region =
8970 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
8971 if (Region->isReplicator()) {
8972         assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
8973 VPBasicBlock *NextBlock =
8974 cast<VPBasicBlock>(Region->getSuccessors().front());
8975 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8976 continue;
8977 }
8978 }
8979 Sink->moveAfter(Target);
8980 }
8981
8982 // Interleave memory: for each Interleave Group we marked earlier as relevant
8983 // for this VPlan, replace the Recipes widening its memory instructions with a
8984 // single VPInterleaveRecipe at its insertion point.
8985 for (auto IG : InterleaveGroups) {
8986 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8987 RecipeBuilder.getRecipe(IG->getInsertPos()));
8988 SmallVector<VPValue *, 4> StoredValues;
8989 for (unsigned i = 0; i < IG->getFactor(); ++i)
8990 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8991 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8992
8993 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8994 Recipe->getMask());
8995 VPIG->insertBefore(Recipe);
8996 unsigned J = 0;
8997 for (unsigned i = 0; i < IG->getFactor(); ++i)
8998 if (Instruction *Member = IG->getMember(i)) {
8999 if (!Member->getType()->isVoidTy()) {
9000 VPValue *OriginalV = Plan->getVPValue(Member);
9001 Plan->removeVPValueFor(Member);
9002 Plan->addVPValue(Member, VPIG->getVPValue(J));
9003 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9004 J++;
9005 }
9006 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9007 }
9008 }
9009
9010 // Adjust the recipes for any inloop reductions.
9011 if (Range.Start.isVector())
9012 adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
9013
9014 // Finally, if tail is folded by masking, introduce selects between the phi
9015 // and the live-out instruction of each reduction, at the end of the latch.
9016 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
9017 Builder.setInsertPoint(VPBB);
9018 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9019 for (auto &Reduction : Legal->getReductionVars()) {
9020 if (CM.isInLoopReduction(Reduction.first))
9021 continue;
9022 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
9023 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
9024 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
9025 }
9026 }
9027
9028 std::string PlanName;
9029 raw_string_ostream RSO(PlanName);
9030 ElementCount VF = Range.Start;
9031 Plan->addVF(VF);
9032 RSO << "Initial VPlan for VF={" << VF;
9033 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9034 Plan->addVF(VF);
9035 RSO << "," << VF;
9036 }
9037 RSO << "},UF>=1";
9038 RSO.flush();
9039 Plan->setName(PlanName);
9040
9041 return Plan;
9042}
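
For reference, the closing lines of buildVPlanWithVPRecipes (9028-9039) derive the plan name by starting at Range.Start and doubling the VF until it reaches Range.End. The following is a minimal standalone sketch, assuming fixed-width VFs modeled as plain unsigned values in place of ElementCount; the helper name planName is hypothetical and exists only for illustration.

#include <iostream>
#include <string>

// Sketch of the name-building loop above, with unsigned standing in for
// ElementCount (assumption: fixed-width, power-of-two VFs only).
static std::string planName(unsigned Start, unsigned End) {
  std::string Name = "Initial VPlan for VF={" + std::to_string(Start);
  for (unsigned VF = Start * 2; VF < End; VF *= 2)
    Name += "," + std::to_string(VF);
  return Name + "},UF>=1";
}

int main() {
  // A sub-range [2, 16) covers VF = 2, 4 and 8.
  std::cout << planName(2, 16) << "\n"; // prints: Initial VPlan for VF={2,4,8},UF>=1
  return 0;
}
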
9043
9044VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9045 // Outer loop handling: They may require CFG and instruction level
9046 // transformations before even evaluating whether vectorization is profitable.
9047 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9048 // the vectorization pipeline.
9049   assert(!OrigLoop->isInnermost());
9050   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9051
9052 // Create new empty VPlan
9053 auto Plan = std::make_unique<VPlan>();
9054
9055 // Build hierarchical CFG
9056 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9057 HCFGBuilder.buildHierarchicalCFG();
9058
9059 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9060 VF *= 2)
9061 Plan->addVF(VF);
9062
9063 if (EnableVPlanPredication) {
9064 VPlanPredicator VPP(*Plan);
9065 VPP.predicate();
9066
9067 // Avoid running transformation to recipes until masked code generation in
9068 // VPlan-native path is in place.
9069 return Plan;
9070 }
9071
9072 SmallPtrSet<Instruction *, 1> DeadInstructions;
9073 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9074 Legal->getInductionVars(),
9075 DeadInstructions, *PSE.getSE());
9076 return Plan;
9077}
9078
9079// Adjust the recipes for any inloop reductions. The chain of instructions
9080// leading from the loop exit instr to the phi need to be converted to
9081// reductions, with one operand being vector and the other being the scalar
9082// reduction chain.
9083void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9084 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9085 for (auto &Reduction : CM.getInLoopReductionChains()) {
9086 PHINode *Phi = Reduction.first;
9087 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9088 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9089
9090     // ReductionOperations are ordered top-down from the phi's use to the
9091     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9092 // which of the two operands will remain scalar and which will be reduced.
9093 // For minmax the chain will be the select instructions.
9094 Instruction *Chain = Phi;
9095 for (Instruction *R : ReductionOperations) {
9096 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9097 RecurKind Kind = RdxDesc.getRecurrenceKind();
9098
9099 VPValue *ChainOp = Plan->getVPValue(Chain);
9100 unsigned FirstOpId;
9101 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9102         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9103                "Expected to replace a VPWidenSelectSC");
9104 FirstOpId = 1;
9105 } else {
9106         assert(isa<VPWidenRecipe>(WidenRecipe) &&
9107                "Expected to replace a VPWidenSC");
9108 FirstOpId = 0;
9109 }
9110 unsigned VecOpId =
9111 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9112 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9113
9114 auto *CondOp = CM.foldTailByMasking()
9115 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9116 : nullptr;
9117 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9118 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9119 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
9120 Plan->removeVPValueFor(R);
9121 Plan->addVPValue(R, RedRecipe);
9122 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9123 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
9124 WidenRecipe->eraseFromParent();
9125
9126 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9127 VPRecipeBase *CompareRecipe =
9128 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9129         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9130                "Expected to replace a VPWidenSC");
9131         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9132                "Expected no remaining users");
9133 CompareRecipe->eraseFromParent();
9134 }
9135 Chain = R;
9136 }
9137 }
9138}
9139
9140#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9141void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9142 VPSlotTracker &SlotTracker) const {
9143 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9144 IG->getInsertPos()->printAsOperand(O, false);
9145 O << ", ";
9146 getAddr()->printAsOperand(O, SlotTracker);
9147 VPValue *Mask = getMask();
9148 if (Mask) {
9149 O << ", ";
9150 Mask->printAsOperand(O, SlotTracker);
9151 }
9152 for (unsigned i = 0; i < IG->getFactor(); ++i)
9153 if (Instruction *I = IG->getMember(i))
9154 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
9155}
9156#endif
9157
9158void VPWidenCallRecipe::execute(VPTransformState &State) {
9159 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9160 *this, State);
9161}
9162
9163void VPWidenSelectRecipe::execute(VPTransformState &State) {
9164 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9165 this, *this, InvariantCond, State);
9166}
9167
9168void VPWidenRecipe::execute(VPTransformState &State) {
9169 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9170}
9171
9172void VPWidenGEPRecipe::execute(VPTransformState &State) {
9173 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9174 *this, State.UF, State.VF, IsPtrLoopInvariant,
9175 IsIndexLoopInvariant, State);
9176}
9177
9178void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9179   assert(!State.Instance && "Int or FP induction being replicated.");
9180 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9181 getTruncInst(), getVPValue(0),
9182 getCastValue(), State);
9183}
9184
9185void VPWidenPHIRecipe::execute(VPTransformState &State) {
9186 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
9187 this, State);
9188}
9189
9190void VPBlendRecipe::execute(VPTransformState &State) {
9191 State.ILV->setDebugLocFromInst(State.Builder, Phi);
9192 // We know that all PHIs in non-header blocks are converted into
9193 // selects, so we don't have to worry about the insertion order and we
9194 // can just use the builder.
9195 // At this point we generate the predication tree. There may be
9196 // duplications since this is a simple recursive scan, but future
9197 // optimizations will clean it up.
9198
9199 unsigned NumIncoming = getNumIncomingValues();
9200
9201 // Generate a sequence of selects of the form:
9202 // SELECT(Mask3, In3,
9203 // SELECT(Mask2, In2,
9204 // SELECT(Mask1, In1,
9205 // In0)))
9206 // Note that Mask0 is never used: lanes for which no path reaches this phi and
9207 // are essentially undef are taken from In0.
9208 InnerLoopVectorizer::VectorParts Entry(State.UF);
9209 for (unsigned In = 0; In < NumIncoming; ++In) {
9210 for (unsigned Part = 0; Part < State.UF; ++Part) {
9211 // We might have single edge PHIs (blocks) - use an identity
9212 // 'select' for the first PHI operand.
9213 Value *In0 = State.get(getIncomingValue(In), Part);
9214 if (In == 0)
9215 Entry[Part] = In0; // Initialize with the first incoming value.
9216 else {
9217 // Select between the current value and the previous incoming edge
9218 // based on the incoming mask.
9219 Value *Cond = State.get(getMask(In), Part);
9220 Entry[Part] =
9221 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9222 }
9223 }
9224 }
9225 for (unsigned Part = 0; Part < State.UF; ++Part)
9226 State.set(this, Entry[Part], Part);
9227}
9228
9229void VPInterleaveRecipe::execute(VPTransformState &State) {
9230   assert(!State.Instance && "Interleave group being replicated.");
;
9231 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9232 getStoredValues(), getMask());
9233}
9234
9235void VPReductionRecipe::execute(VPTransformState &State) {
9236   assert(!State.Instance && "Reduction being replicated.");
9237 Value *PrevInChain = State.get(getChainOp(), 0);
9238 for (unsigned Part = 0; Part < State.UF; ++Part) {
9239 RecurKind Kind = RdxDesc->getRecurrenceKind();
9240 bool IsOrdered = useOrderedReductions(*RdxDesc);
9241 Value *NewVecOp = State.get(getVecOp(), Part);
9242 if (VPValue *Cond = getCondOp()) {
9243 Value *NewCond = State.get(Cond, Part);
9244 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9245 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9246 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9247 Constant *IdenVec =
9248 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9249 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9250 NewVecOp = Select;
9251 }
9252 Value *NewRed;
9253 Value *NextInChain;
9254 if (IsOrdered) {
9255 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9256 PrevInChain);
9257 PrevInChain = NewRed;
9258 } else {
9259 PrevInChain = State.get(getChainOp(), Part);
9260 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9261 }
9262 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9263 NextInChain =
9264 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9265 NewRed, PrevInChain);
9266 } else if (IsOrdered)
9267 NextInChain = NewRed;
9268 else {
9269 NextInChain = State.Builder.CreateBinOp(
9270 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9271 PrevInChain);
9272 }
9273 State.set(this, NextInChain, Part);
9274 }
9275}
9276
9277void VPReplicateRecipe::execute(VPTransformState &State) {
9278 if (State.Instance) { // Generate a single instance.
9279     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9280 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9281 *State.Instance, IsPredicated, State);
9282 // Insert scalar instance packing it into a vector.
9283 if (AlsoPack && State.VF.isVector()) {
9284 // If we're constructing lane 0, initialize to start from poison.
9285 if (State.Instance->Lane.isFirstLane()) {
9286         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9287 Value *Poison = PoisonValue::get(
9288 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9289 State.set(this, Poison, State.Instance->Part);
9290 }
9291 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9292 }
9293 return;
9294 }
9295
9296 // Generate scalar instances for all VF lanes of all UF parts, unless the
9297   // instruction is uniform, in which case generate only the first lane for each
9298 // of the UF parts.
9299 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9300   assert((!State.VF.isScalable() || IsUniform) &&
9301          "Can't scalarize a scalable vector");
9302 for (unsigned Part = 0; Part < State.UF; ++Part)
9303 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9304 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9305 VPIteration(Part, Lane), IsPredicated,
9306 State);
9307}
9308
9309void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9310   assert(State.Instance && "Branch on Mask works only on single instance.");
9311
9312 unsigned Part = State.Instance->Part;
9313 unsigned Lane = State.Instance->Lane.getKnownLane();
9314
9315 Value *ConditionBit = nullptr;
9316 VPValue *BlockInMask = getMask();
9317 if (BlockInMask) {
9318 ConditionBit = State.get(BlockInMask, Part);
9319 if (ConditionBit->getType()->isVectorTy())
9320 ConditionBit = State.Builder.CreateExtractElement(
9321 ConditionBit, State.Builder.getInt32(Lane));
9322 } else // Block in mask is all-one.
9323 ConditionBit = State.Builder.getTrue();
9324
9325 // Replace the temporary unreachable terminator with a new conditional branch,
9326 // whose two destinations will be set later when they are created.
9327 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9328   assert(isa<UnreachableInst>(CurrentTerminator) &&
9329          "Expected to replace unreachable terminator with conditional branch.");
9330 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9331 CondBr->setSuccessor(0, nullptr);
9332 ReplaceInstWithInst(CurrentTerminator, CondBr);
9333}
9334
9335void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9336   assert(State.Instance && "Predicated instruction PHI works per instance.");
9337 Instruction *ScalarPredInst =
9338 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9339 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9340 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9341   assert(PredicatingBB && "Predicated block has no single predecessor.");
9342   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9343          "operand must be VPReplicateRecipe");
9344
9345 // By current pack/unpack logic we need to generate only a single phi node: if
9346 // a vector value for the predicated instruction exists at this point it means
9347 // the instruction has vector users only, and a phi for the vector value is
9348 // needed. In this case the recipe of the predicated instruction is marked to
9349 // also do that packing, thereby "hoisting" the insert-element sequence.
9350 // Otherwise, a phi node for the scalar value is needed.
9351 unsigned Part = State.Instance->Part;
9352 if (State.hasVectorValue(getOperand(0), Part)) {
9353 Value *VectorValue = State.get(getOperand(0), Part);
9354 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9355 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9356 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9357 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9358 if (State.hasVectorValue(this, Part))
9359 State.reset(this, VPhi, Part);
9360 else
9361 State.set(this, VPhi, Part);
9362 // NOTE: Currently we need to update the value of the operand, so the next
9363 // predicated iteration inserts its generated value in the correct vector.
9364 State.reset(getOperand(0), VPhi, Part);
9365 } else {
9366 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9367 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9368 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9369 PredicatingBB);
9370 Phi->addIncoming(ScalarPredInst, PredicatedBB);
9371 if (State.hasScalarValue(this, *State.Instance))
9372 State.reset(this, Phi, *State.Instance);
9373 else
9374 State.set(this, Phi, *State.Instance);
9375 // NOTE: Currently we need to update the value of the operand, so the next
9376 // predicated iteration inserts its generated value in the correct vector.
9377 State.reset(getOperand(0), Phi, *State.Instance);
9378 }
9379}
9380
9381void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9382 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9383 State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
9384 StoredValue ? nullptr : getVPValue(),
9385 getAddr(), StoredValue, getMask());
9386}
9387
9388// Determine how to lower the scalar epilogue, which depends on 1) optimising
9389// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9390// predication, and 4) a TTI hook that analyses whether the loop is suitable
9391// for predication.
9392static ScalarEpilogueLowering getScalarEpilogueLowering(
9393 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9394 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9395 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9396 LoopVectorizationLegality &LVL) {
9397 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9398 // don't look at hints or options, and don't request a scalar epilogue.
9399 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9400 // LoopAccessInfo (due to code dependency and not being able to reliably get
9401 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9402 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9403 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9404 // back to the old way and vectorize with versioning when forced. See D81345.)
9405 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9406 PGSOQueryType::IRPass) &&
9407 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9408 return CM_ScalarEpilogueNotAllowedOptSize;
9409
9410 // 2) If set, obey the directives
9411 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9412 switch (PreferPredicateOverEpilogue) {
9413 case PreferPredicateTy::ScalarEpilogue:
9414 return CM_ScalarEpilogueAllowed;
9415 case PreferPredicateTy::PredicateElseScalarEpilogue:
9416 return CM_ScalarEpilogueNotNeededUsePredicate;
9417 case PreferPredicateTy::PredicateOrDontVectorize:
9418 return CM_ScalarEpilogueNotAllowedUsePredicate;
9419 };
9420 }
9421
9422 // 3) If set, obey the hints
9423 switch (Hints.getPredicate()) {
9424 case LoopVectorizeHints::FK_Enabled:
9425 return CM_ScalarEpilogueNotNeededUsePredicate;
9426 case LoopVectorizeHints::FK_Disabled:
9427 return CM_ScalarEpilogueAllowed;
9428 };
9429
9430 // 4) if the TTI hook indicates this is profitable, request predication.
9431 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9432 LVL.getLAI()))
9433 return CM_ScalarEpilogueNotNeededUsePredicate;
9434
9435 return CM_ScalarEpilogueAllowed;
9436}
9437
9438Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9439 // If Values have been set for this Def return the one relevant for \p Part.
9440 if (hasVectorValue(Def, Part))
9441 return Data.PerPartOutput[Def][Part];
9442
9443 if (!hasScalarValue(Def, {Part, 0})) {
9444 Value *IRV = Def->getLiveInIRValue();
9445 Value *B = ILV->getBroadcastInstrs(IRV);
9446 set(Def, B, Part);
9447 return B;
9448 }
9449
9450 Value *ScalarValue = get(Def, {Part, 0});
9451 // If we aren't vectorizing, we can just copy the scalar map values over
9452 // to the vector map.
9453 if (VF.isScalar()) {
9454 set(Def, ScalarValue, Part);
9455 return ScalarValue;
9456 }
9457
9458 auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9459 bool IsUniform = RepR && RepR->isUniform();
9460
9461 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9462 // Check if there is a scalar value for the selected lane.
9463 if (!hasScalarValue(Def, {Part, LastLane})) {
9464 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9465 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9466 "unexpected recipe found to be invariant");
9467 IsUniform = true;
9468 LastLane = 0;
9469 }
9470
9471 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9472
9473 // Set the insert point after the last scalarized instruction. This
9474 // ensures the insertelement sequence will directly follow the scalar
9475 // definitions.
9476 auto OldIP = Builder.saveIP();
9477 auto NewIP = std::next(BasicBlock::iterator(LastInst));
9478 Builder.SetInsertPoint(&*NewIP);
9479
9480 // However, if we are vectorizing, we need to construct the vector values.
9481 // If the value is known to be uniform after vectorization, we can just
9482 // broadcast the scalar value corresponding to lane zero for each unroll
9483 // iteration. Otherwise, we construct the vector values using
9484 // insertelement instructions. Since the resulting vectors are stored in
9485 // State, we will only generate the insertelements once.
9486 Value *VectorValue = nullptr;
9487 if (IsUniform) {
9488 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9489 set(Def, VectorValue, Part);
9490 } else {
9491 // Initialize packing with insertelements to start from undef.
9492 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9493 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9494 set(Def, Undef, Part);
9495 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9496 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9497 VectorValue = get(Def, Part);
9498 }
9499 Builder.restoreIP(OldIP);
9500 return VectorValue;
9501}
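// Illustrative sketch, not from this file: the broadcast-vs-pack decision made
// in VPTransformState::get() above, written as a standalone IRBuilder helper.
// The helper name and the Scalars parameter are assumptions for illustration;
// assumes #include "llvm/IR/IRBuilder.h" and #include "llvm/ADT/ArrayRef.h".
static llvm::Value *packLanes(llvm::IRBuilder<> &B,
                              llvm::ArrayRef<llvm::Value *> Scalars,
                              bool IsUniform) {
  if (IsUniform)
    // Uniform after vectorization: splatting lane 0 is enough.
    return B.CreateVectorSplat(Scalars.size(), Scalars.front());
  // Otherwise start from poison and insert one lane at a time, mirroring the
  // packScalarIntoVectorValue() loop above.
  llvm::Value *Vec = llvm::PoisonValue::get(
      llvm::FixedVectorType::get(Scalars.front()->getType(), Scalars.size()));
  for (unsigned Lane = 0; Lane < Scalars.size(); ++Lane)
    Vec = B.CreateInsertElement(Vec, Scalars[Lane], B.getInt32(Lane));
  return Vec;
}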
9502
9503// Process the loop in the VPlan-native vectorization path. This path builds
9504 // VPlan upfront in the vectorization pipeline, which allows applying
9505// VPlan-to-VPlan transformations from the very beginning without modifying the
9506// input LLVM IR.
9507static bool processLoopInVPlanNativePath(
9508 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9509 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9510 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9511 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9512 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9513 LoopVectorizationRequirements &Requirements) {
9514
9515 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9516 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9517 return false;
9518 }
9519 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9520 Function *F = L->getHeader()->getParent();
9521 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9522
9523 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9524 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9525
9526 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9527 &Hints, IAI);
9528 // Use the planner for outer loop vectorization.
9529 // TODO: CM is not used at this point inside the planner. Turn CM into an
9530 // optional argument if we don't need it in the future.
9531 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
9532 Requirements, ORE);
9533
9534 // Get user vectorization factor.
9535 ElementCount UserVF = Hints.getWidth();
9536
9537 // Plan how to best vectorize, return the best VF and its cost.
9538 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9539
9540 // If we are stress testing VPlan builds, do not attempt to generate vector
9541 // code. Masked vector code generation support will follow soon.
9542 // Also, do not attempt to vectorize if no vector code will be produced.
9543 if (VPlanBuildStressTest || EnableVPlanPredication ||
9544 VectorizationFactor::Disabled() == VF)
9545 return false;
9546
9547 LVP.setBestPlan(VF.Width, 1);
9548
9549 {
9550 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9551 F->getParent()->getDataLayout());
9552 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9553 &CM, BFI, PSI, Checks);
9554 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9555 << L->getHeader()->getParent()->getName() << "\"\n");
9556 LVP.executePlan(LB, DT);
9557 }
9558
9559 // Mark the loop as already vectorized to avoid vectorizing again.
9560 Hints.setAlreadyVectorized();
9561 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9562 return true;
9563}
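// Illustrative sketch, not from this file: the kind of input the VPlan-native
// path above targets -- an explicitly annotated *outer* loop. The function and
// the pragma usage are assumptions for illustration; the path itself is
// typically gated behind the -enable-vplan-native-path option.
void outerLoopCandidate(float **A, int n, int m) {
#pragma clang loop vectorize(enable)
  for (int i = 0; i < n; ++i)     // outer loop: handled by the native path
    for (int j = 0; j < m; ++j)   // inner loop body
      A[i][j] += 1.0f;
}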
9564
9565// Emit a remark if there are stores to floats that required a floating point
9566 // extension. If the vectorized loop was generated with floating point, there
9567// will be a performance penalty from the conversion overhead and the change in
9568// the vector width.
9569static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9570 SmallVector<Instruction *, 4> Worklist;
9571 for (BasicBlock *BB : L->getBlocks()) {
9572 for (Instruction &Inst : *BB) {
9573 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9574 if (S->getValueOperand()->getType()->isFloatTy())
9575 Worklist.push_back(S);
9576 }
9577 }
9578 }
9579
9580 // Traverse the floating point stores upwards, searching for floating point
9581 // conversions.
9582 SmallPtrSet<const Instruction *, 4> Visited;
9583 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9584 while (!Worklist.empty()) {
9585 auto *I = Worklist.pop_back_val();
9586 if (!L->contains(I))
9587 continue;
9588 if (!Visited.insert(I).second)
9589 continue;
9590
9591 // Emit a remark if the floating point store required a floating
9592 // point conversion.
9593 // TODO: More work could be done to identify the root cause such as a
9594 // constant or a function return type and point the user to it.
9595 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9596 ORE->emit([&]() {
9597 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9598 I->getDebugLoc(), L->getHeader())
9599 << "floating point conversion changes vector width. "
9600 << "Mixed floating point precision requires an up/down "
9601 << "cast that will negatively impact performance.";
9602 });
9603
9604 for (Use &Op : I->operands())
9605 if (auto *OpI = dyn_cast<Instruction>(Op))
9606 Worklist.push_back(OpI);
9607 }
9608}
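// Illustrative sketch, not from this file: a loop of the shape the walk above
// is looking for. The float element is widened to double for the arithmetic
// (an fpext feeding the chain of the float store) and truncated back, so the
// VectorMixedPrecision remark would be expected to fire for it.
void scaleMixedPrecision(float *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = a[i] * 2.0; // 2.0 is a double: fpext + double fmul + fptrunc
}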
9609
9610LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9611 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9612 !EnableLoopInterleaving),
9613 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9614 !EnableLoopVectorization) {}
9615
9616bool LoopVectorizePass::processLoop(Loop *L) {
9617 assert((EnableVPlanNativePath || L->isInnermost()) &&
9618 "VPlan-native path is not enabled. Only process inner loops.");
    1) Assuming the condition is false
    2) Assuming the condition is true
    3) '?' condition is true
9619
9620#ifndef NDEBUG
9621 const std::string DebugLocStr = getDebugLocString(L);
9622#endif /* NDEBUG */
9623
9624 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9625 << L->getHeader()->getParent()->getName() << "\" from "
9626 << DebugLocStr << "\n");
    4) Assuming 'DebugFlag' is false
    5) Loop condition is false. Exiting loop
9627
9628 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9629
9630 LLVM_DEBUG(
9631 dbgs() << "LV: Loop hints:"
9632 << " force="
9633 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9634 ? "disabled"
9635 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9636 ? "enabled"
9637 : "?"))
9638 << " width=" << Hints.getWidth()
9639 << " unroll=" << Hints.getInterleave() << "\n");
    6) Assuming 'DebugFlag' is false
    7) Loop condition is false. Exiting loop
9640
9641 // Function containing loop
9642 Function *F = L->getHeader()->getParent();
9643
9644 // Looking at the diagnostic output is the only way to determine if a loop
9645 // was vectorized (other than looking at the IR or machine code), so it
9646 // is important to generate an optimization remark for each loop. Most of
9647 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9648 // generated as OptimizationRemark and OptimizationRemarkMissed are less
9649 // verbose and report vectorized loops and unvectorized loops that may
9650 // benefit from vectorization, respectively.
9651
9652 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    8) Assuming the condition is false
    9) Taking false branch
9653 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9654 return false;
9655 }
9656
9657 PredicatedScalarEvolution PSE(*SE, *L);
9658
9659 // Check if it is legal to vectorize the loop.
9660 LoopVectorizationRequirements Requirements;
9661 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9662 &Requirements, &Hints, DB, AC, BFI, PSI);
9663 if (!LVL.canVectorize(EnableVPlanNativePath)) {
    10) Assuming the condition is false
    11) Taking false branch
9664 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9665 Hints.emitRemarkWithHints();
9666 return false;
9667 }
9668
9669 // Check the function attributes and profiles to find out if this function
9670 // should be optimized for size.
9671 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9672 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9673
9674 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9675 // here. They may require CFG and instruction level transformations before
9676 // even evaluating whether vectorization is profitable. Since we cannot modify
9677 // the incoming IR, we need to build VPlan upfront in the vectorization
9678 // pipeline.
9679 if (!L->isInnermost())
    12) Assuming the condition is false
    13) Taking false branch
9680 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9681 ORE, BFI, PSI, Hints, Requirements);
9682
9683 assert(L->isInnermost() && "Inner loop expected.");
    14) Assuming the condition is true
    15) '?' condition is true
9684
9685 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9686 // count by optimizing for size, to minimize overheads.
9687 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9688 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9689 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9690 << "This loop is worth vectorizing only if no scalar "
9691 << "iteration overheads are incurred.");
9692 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9693 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9694 else {
9695 LLVM_DEBUG(dbgs() << "\n");
9696 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9697 }
9698 }
9699
9700 // Check the function attributes to see if implicit floats are allowed.
9701 // FIXME: This check doesn't seem possibly correct -- what if the loop is
9702 // an integer loop and the vector instructions selected are purely integer
9703 // vector instructions?
9704 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    16) Assuming the condition is false
    17) Taking false branch
9705 reportVectorizationFailure(
9706 "Can't vectorize when the NoImplicitFloat attribute is used",
9707 "loop not vectorized due to NoImplicitFloat attribute",
9708 "NoImplicitFloat", ORE, L);
9709 Hints.emitRemarkWithHints();
9710 return false;
9711 }
9712
9713 // Check if the target supports potentially unsafe FP vectorization.
9714 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9715 // for the target we're vectorizing for, to make sure none of the
9716 // additional fp-math flags can help.
9717 if (Hints.isPotentiallyUnsafe() &&
9718 TTI->isFPVectorizationPotentiallyUnsafe()) {
9719 reportVectorizationFailure(
9720 "Potentially unsafe FP op prevents vectorization",
9721 "loop not vectorized due to unsafe FP support.",
9722 "UnsafeFP", ORE, L);
9723 Hints.emitRemarkWithHints();
9724 return false;
9725 }
9726
9727 if (!Requirements.canVectorizeFPMath(Hints)) {
    18) Taking false branch
9728 ORE->emit([&]() {
9729 auto *ExactFPMathInst = Requirements.getExactFPInst();
9730 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9731 ExactFPMathInst->getDebugLoc(),
9732 ExactFPMathInst->getParent())
9733 << "loop not vectorized: cannot prove it is safe to reorder "
9734 "floating-point operations";
9735 });
9736 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9737 "reorder floating-point operations\n");
9738 Hints.emitRemarkWithHints();
9739 return false;
9740 }
9741
9742 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9743 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9744
9745 // If an override option has been passed in for interleaved accesses, use it.
9746 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    19) Assuming the condition is false
    20) Taking false branch
9747 UseInterleaved = EnableInterleavedMemAccesses;
9748
9749 // Analyze interleaved memory accesses.
9750 if (UseInterleaved) {
    21) Assuming 'UseInterleaved' is false
    22) Taking false branch
9751 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9752 }
9753
9754 // Use the cost model.
9755 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9756 F, &Hints, IAI);
9757 CM.collectValuesToIgnore();
9758
9759 // Use the planner for vectorization.
9760 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
9761 Requirements, ORE);
9762
9763 // Get user vectorization factor and interleave count.
9764 ElementCount UserVF = Hints.getWidth();
9765 unsigned UserIC = Hints.getInterleave();
9766
9767 // Plan how to best vectorize, return the best VF and its cost.
9768 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
    23) Calling 'LoopVectorizationPlanner::plan'
9769
9770 VectorizationFactor VF = VectorizationFactor::Disabled();
9771 unsigned IC = 1;
9772
9773 if (MaybeVF) {
9774 VF = *MaybeVF;
9775 // Select the interleave count.
9776 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9777 }
9778
9779 // Identify the diagnostic messages that should be produced.
9780 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9781 bool VectorizeLoop = true, InterleaveLoop = true;
9782 if (VF.Width.isScalar()) {
9783 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9784 VecDiagMsg = std::make_pair(
9785 "VectorizationNotBeneficial",
9786 "the cost-model indicates that vectorization is not beneficial");
9787 VectorizeLoop = false;
9788 }
9789
9790 if (!MaybeVF && UserIC > 1) {
9791 // Tell the user interleaving was avoided up-front, despite being explicitly
9792 // requested.
9793 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9794 "interleaving should be avoided up front\n");
9795 IntDiagMsg = std::make_pair(
9796 "InterleavingAvoided",
9797 "Ignoring UserIC, because interleaving was avoided up front");
9798 InterleaveLoop = false;
9799 } else if (IC == 1 && UserIC <= 1) {
9800 // Tell the user interleaving is not beneficial.
9801 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9802 IntDiagMsg = std::make_pair(
9803 "InterleavingNotBeneficial",
9804 "the cost-model indicates that interleaving is not beneficial");
9805 InterleaveLoop = false;
9806 if (UserIC == 1) {
9807 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9808 IntDiagMsg.second +=
9809 " and is explicitly disabled or interleave count is set to 1";
9810 }
9811 } else if (IC > 1 && UserIC == 1) {
9812 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9813 LLVM_DEBUG(
9814 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9815 IntDiagMsg = std::make_pair(
9816 "InterleavingBeneficialButDisabled",
9817 "the cost-model indicates that interleaving is beneficial "
9818 "but is explicitly disabled or interleave count is set to 1");
9819 InterleaveLoop = false;
9820 }
9821
9822 // Override IC if user provided an interleave count.
9823 IC = UserIC > 0 ? UserIC : IC;
9824
9825 // Emit diagnostic messages, if any.
9826 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9827 if (!VectorizeLoop && !InterleaveLoop) {
9828 // Do not vectorize or interleave the loop.
9829 ORE->emit([&]() {
9830 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9831 L->getStartLoc(), L->getHeader())
9832 << VecDiagMsg.second;
9833 });
9834 ORE->emit([&]() {
9835 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9836 L->getStartLoc(), L->getHeader())
9837 << IntDiagMsg.second;
9838 });
9839 return false;
9840 } else if (!VectorizeLoop && InterleaveLoop) {
9841 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9842 ORE->emit([&]() {
9843 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9844 L->getStartLoc(), L->getHeader())
9845 << VecDiagMsg.second;
9846 });
9847 } else if (VectorizeLoop && !InterleaveLoop) {
9848 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9849 << ") in " << DebugLocStr << '\n');
9850 ORE->emit([&]() {
9851 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9852 L->getStartLoc(), L->getHeader())
9853 << IntDiagMsg.second;
9854 });
9855 } else if (VectorizeLoop && InterleaveLoop) {
9856 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9857 << ") in " << DebugLocStr << '\n');
9858 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9859 }
9860
9861 bool DisableRuntimeUnroll = false;
9862 MDNode *OrigLoopID = L->getLoopID();
9863 {
9864 // Optimistically generate runtime checks. Drop them if they turn out to not
9865 // be profitable. Limit the scope of Checks, so the cleanup happens
9866 // immediately after vector code generation is done.
9867 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9868 F->getParent()->getDataLayout());
9869 if (!VF.Width.isScalar() || IC > 1)
9870 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
9871 LVP.setBestPlan(VF.Width, IC);
9872
9873 using namespace ore;
9874 if (!VectorizeLoop) {
9875 assert(IC > 1 && "interleave count should not be 1 or 0");
9876 // If we decided that it is not legal to vectorize the loop, then
9877 // interleave it.
9878 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9879 &CM, BFI, PSI, Checks);
9880 LVP.executePlan(Unroller, DT);
9881
9882 ORE->emit([&]() {
9883 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9884 L->getHeader())
9885 << "interleaved loop (interleaved count: "
9886 << NV("InterleaveCount", IC) << ")";
9887 });
9888 } else {
9889 // If we decided that it is *legal* to vectorize the loop, then do it.
9890
9891 // Consider vectorizing the epilogue too if it's profitable.
9892 VectorizationFactor EpilogueVF =
9893 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9894 if (EpilogueVF.Width.isVector()) {
9895
9896 // The first pass vectorizes the main loop and creates a scalar epilogue
9897 // to be vectorized by executing the plan (potentially with a different
9898 // factor) again shortly afterwards.
9899 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9900 EpilogueVF.Width.getKnownMinValue(),
9901 1);
9902 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9903 EPI, &LVL, &CM, BFI, PSI, Checks);
9904
9905 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9906 LVP.executePlan(MainILV, DT);
9907 ++LoopsVectorized;
9908
9909 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9910 formLCSSARecursively(*L, *DT, LI, SE);
9911
9912 // Second pass vectorizes the epilogue and adjusts the control flow
9913 // edges from the first pass.
9914 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9915 EPI.MainLoopVF = EPI.EpilogueVF;
9916 EPI.MainLoopUF = EPI.EpilogueUF;
9917 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9918 ORE, EPI, &LVL, &CM, BFI, PSI,
9919 Checks);
9920 LVP.executePlan(EpilogILV, DT);
9921 ++LoopsEpilogueVectorized;
9922
9923 if (!MainILV.areSafetyChecksAdded())
9924 DisableRuntimeUnroll = true;
9925 } else {
9926 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9927 &LVL, &CM, BFI, PSI, Checks);
9928 LVP.executePlan(LB, DT);
9929 ++LoopsVectorized;
9930
9931 // Add metadata to disable runtime unrolling of the scalar loop when there
9932 // are no runtime checks about strides and memory. A scalar loop that is
9933 // rarely used is not worth unrolling.
9934 if (!LB.areSafetyChecksAdded())
9935 DisableRuntimeUnroll = true;
9936 }
9937 // Report the vectorization decision.
9938 ORE->emit([&]() {
9939 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9940 L->getHeader())
9941 << "vectorized loop (vectorization width: "
9942 << NV("VectorizationFactor", VF.Width)
9943 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9944 });
9945 }
9946
9947 if (ORE->allowExtraAnalysis(LV_NAME))
9948 checkMixedPrecision(L, ORE);
9949 }
9950
9951 Optional<MDNode *> RemainderLoopID =
9952 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9953 LLVMLoopVectorizeFollowupEpilogue});
9954 if (RemainderLoopID.hasValue()) {
9955 L->setLoopID(RemainderLoopID.getValue());
9956 } else {
9957 if (DisableRuntimeUnroll)
9958 AddRuntimeUnrollDisableMetaData(L);
9959
9960 // Mark the loop as already vectorized to avoid vectorizing again.
9961 Hints.setAlreadyVectorized();
9962 }
9963
9964 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9965 return true;
9966}
9967
9968LoopVectorizeResult LoopVectorizePass::runImpl(
9969 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9970 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9971 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9972 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9973 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9974 SE = &SE_;
9975 LI = &LI_;
9976 TTI = &TTI_;
9977 DT = &DT_;
9978 BFI = &BFI_;
9979 TLI = TLI_;
9980 AA = &AA_;
9981 AC = &AC_;
9982 GetLAA = &GetLAA_;
9983 DB = &DB_;
9984 ORE = &ORE_;
9985 PSI = PSI_;
9986
9987 // Don't attempt if
9988 // 1. the target claims to have no vector registers, and
9989 // 2. interleaving won't help ILP.
9990 //
9991 // The second condition is necessary because, even if the target has no
9992 // vector registers, loop vectorization may still enable scalar
9993 // interleaving.
9994 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9995 TTI->getMaxInterleaveFactor(1) < 2)
9996 return LoopVectorizeResult(false, false);
9997
9998 bool Changed = false, CFGChanged = false;
9999
10000 // The vectorizer requires loops to be in simplified form.
10001 // Since simplification may add new inner loops, it has to run before the
10002 // legality and profitability checks. This means running the loop vectorizer
10003 // will simplify all loops, regardless of whether anything ends up being
10004 // vectorized.
10005 for (auto &L : *LI)
10006 Changed |= CFGChanged |=
10007 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10008
10009 // Build up a worklist of inner-loops to vectorize. This is necessary as
10010 // the act of vectorizing or partially unrolling a loop creates new loops
10011 // and can invalidate iterators across the loops.
10012 SmallVector<Loop *, 8> Worklist;
10013
10014 for (Loop *L : *LI)
10015 collectSupportedLoops(*L, LI, ORE, Worklist);
10016
10017 LoopsAnalyzed += Worklist.size();
10018
10019 // Now walk the identified inner loops.
10020 while (!Worklist.empty()) {
10021 Loop *L = Worklist.pop_back_val();
10022
10023 // For the inner loops we actually process, form LCSSA to simplify the
10024 // transform.
10025 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10026
10027 Changed |= CFGChanged |= processLoop(L);
10028 }
10029
10030 // Process each loop nest in the function.
10031 return LoopVectorizeResult(Changed, CFGChanged);
10032}
10033
10034PreservedAnalyses LoopVectorizePass::run(Function &F,
10035 FunctionAnalysisManager &AM) {
10036 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10037 auto &LI = AM.getResult<LoopAnalysis>(F);
10038 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10039 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10040 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10041 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10042 auto &AA = AM.getResult<AAManager>(F);
10043 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10044 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10045 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10046 MemorySSA *MSSA = EnableMSSALoopDependency
10047 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
10048 : nullptr;
10049
10050 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10051 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10052 [&](Loop &L) -> const LoopAccessInfo & {
10053 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10054 TLI, TTI, nullptr, MSSA};
10055 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10056 };
10057 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10058 ProfileSummaryInfo *PSI =
10059 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10060 LoopVectorizeResult Result =
10061 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10062 if (!Result.MadeAnyChange)
10063 return PreservedAnalyses::all();
10064 PreservedAnalyses PA;
10065
10066 // We currently do not preserve loopinfo/dominator analyses with outer loop
10067 // vectorization. Until this is addressed, mark these analyses as preserved
10068 // only for the non-VPlan-native path.
10069 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10070 if (!EnableVPlanNativePath) {
10071 PA.preserve<LoopAnalysis>();
10072 PA.preserve<DominatorTreeAnalysis>();
10073 }
10074 PA.preserve<BasicAA>();
10075 PA.preserve<GlobalsAA>();
10076 if (!Result.MadeCFGChange)
10077 PA.preserveSet<CFGAnalyses>();
10078 return PA;
10079}
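// For reference, a minimal sketch (not from this file) of driving the pass
// through the new pass manager from a standalone tool; on the command line the
// same entry point is reachable as 'opt -passes=loop-vectorize -S input.ll'.
// Assumes #include "llvm/Passes/PassBuilder.h" and
// #include "llvm/Transforms/Vectorize/LoopVectorize.h"; the default-constructed
// LoopVectorizeOptions below is an assumption about the header defaults.
static void vectorizeFunctionStandalone(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LoopVectorizePass(llvm::LoopVectorizeOptions()));
  FPM.run(F, FAM); // dispatches to LoopVectorizePass::run() defined above
}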