Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8310, column 35
Potential leak of memory pointed to by 'BlockMask'
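The report flags a path on which the object pointed to by 'BlockMask' is allocated but is neither released nor handed off before the last reference to it is lost. The flagged statement (line 8310) lies outside the excerpt below; the following is only a minimal, hypothetical sketch of the general pattern this class of checker diagnoses — the names example, Cond and P are placeholders and are not taken from LoopVectorize.cpp:

  void example(bool Cond) {
    int *P = new int(42);   // the analyzer starts tracking this allocation
    if (Cond)
      return;               // warning here: potential leak of memory pointed to by 'P'
    delete P;               // only this path releases the allocation
  }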

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2021-03-07-153333-19403-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/MemorySSA.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/LLVMContext.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/Type.h"
120#include "llvm/IR/Use.h"
121#include "llvm/IR/User.h"
122#include "llvm/IR/Value.h"
123#include "llvm/IR/ValueHandle.h"
124#include "llvm/IR/Verifier.h"
125#include "llvm/InitializePasses.h"
126#include "llvm/Pass.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/InstructionCost.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME "loop-vectorize"
158#define DEBUG_TYPE LV_NAME
159
160#ifndef NDEBUG
161const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162#endif
163
164/// @{
165/// Metadata attribute names
166const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167const char LLVMLoopVectorizeFollowupVectorized[] =
168 "llvm.loop.vectorize.followup_vectorized";
169const char LLVMLoopVectorizeFollowupEpilogue[] =
170 "llvm.loop.vectorize.followup_epilogue";
171/// @}
172
173STATISTIC(LoopsVectorized, "Number of loops vectorized");
174STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176
177static cl::opt<bool> EnableEpilogueVectorization(
178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179 cl::desc("Enable vectorization of epilogue loops."));
180
181static cl::opt<unsigned> EpilogueVectorizationForceVF(
182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183 cl::desc("When epilogue vectorization is enabled, and a value greater than "
184 "1 is specified, forces the given VF for all applicable epilogue "
185 "loops."));
186
187static cl::opt<unsigned> EpilogueVectorizationMinVF(
188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189 cl::desc("Only loops with vectorization factor equal to or larger than "
190 "the specified value are considered for epilogue vectorization."));
191
192/// Loops with a known constant trip count below this number are vectorized only
193/// if no scalar iteration overheads are incurred.
194static cl::opt<unsigned> TinyTripCountVectorThreshold(
195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196 cl::desc("Loops with a constant trip count that is smaller than this "
197 "value are vectorized only if no scalar iteration overheads "
198 "are incurred."));
199
200// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
201// that predication is preferred, and this lists all options. I.e., the
202// vectorizer will try to fold the tail-loop (epilogue) into the vector body
203// and predicate the instructions accordingly. If tail-folding fails, there are
204// different fallback strategies depending on these values:
205namespace PreferPredicateTy {
206 enum Option {
207 ScalarEpilogue = 0,
208 PredicateElseScalarEpilogue,
209 PredicateOrDontVectorize
210 };
211} // namespace PreferPredicateTy
212
213static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214 "prefer-predicate-over-epilogue",
215 cl::init(PreferPredicateTy::ScalarEpilogue),
216 cl::Hidden,
217 cl::desc("Tail-folding and predication preferences over creating a scalar "
218 "epilogue loop."),
219 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220 "scalar-epilogue",
221 "Don't tail-predicate loops, create scalar epilogue"),
222 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223 "predicate-else-scalar-epilogue",
224 "prefer tail-folding, create scalar epilogue if tail "
225 "folding fails."),
226 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227 "predicate-dont-vectorize",
228 "prefers tail-folding, don't attempt vectorization if "
229 "tail-folding fails.")));
230
231static cl::opt<bool> MaximizeBandwidth(
232 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233 cl::desc("Maximize bandwidth when selecting vectorization factor which "
234 "will be determined by the smallest type in loop."));
235
236static cl::opt<bool> EnableInterleavedMemAccesses(
237 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239
240/// An interleave-group may need masking if it resides in a block that needs
241/// predication, or in order to mask away gaps.
242static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245
246static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248 cl::desc("We don't interleave loops with a estimated constant trip count "
249 "below this number"));
250
251static cl::opt<unsigned> ForceTargetNumScalarRegs(
252 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253 cl::desc("A flag that overrides the target's number of scalar registers."));
254
255static cl::opt<unsigned> ForceTargetNumVectorRegs(
256 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257 cl::desc("A flag that overrides the target's number of vector registers."));
258
259static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261 cl::desc("A flag that overrides the target's max interleave factor for "
262 "scalar loops."));
263
264static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266 cl::desc("A flag that overrides the target's max interleave factor for "
267 "vectorized loops."));
268
269static cl::opt<unsigned> ForceTargetInstructionCost(
270 "force-target-instruction-cost", cl::init(0), cl::Hidden,
271 cl::desc("A flag that overrides the target's expected cost for "
272 "an instruction to a single constant value. Mostly "
273 "useful for getting consistent testing."));
274
275static cl::opt<bool> ForceTargetSupportsScalableVectors(
276 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
277 cl::desc(
278 "Pretend that scalable vectors are supported, even if the target does "
279 "not support them. This flag should only be used for testing."));
280
281static cl::opt<unsigned> SmallLoopCost(
282 "small-loop-cost", cl::init(20), cl::Hidden,
283 cl::desc(
284 "The cost of a loop that is considered 'small' by the interleaver."));
285
286static cl::opt<bool> LoopVectorizeWithBlockFrequency(
287 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
288 cl::desc("Enable the use of the block frequency analysis to access PGO "
289 "heuristics minimizing code growth in cold regions and being more "
290 "aggressive in hot regions."));
291
292// Runtime interleave loops for load/store throughput.
293static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
294 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
295 cl::desc(
296 "Enable runtime interleaving until load/store ports are saturated"));
297
298/// Interleave small loops with scalar reductions.
299static cl::opt<bool> InterleaveSmallLoopScalarReduction(
300 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
301 cl::desc("Enable interleaving for loops with small iteration counts that "
302 "contain scalar reductions to expose ILP."));
303
304/// The number of stores in a loop that are allowed to need predication.
305static cl::opt<unsigned> NumberOfStoresToPredicate(
306 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
307 cl::desc("Max number of stores to be predicated behind an if."));
308
309static cl::opt<bool> EnableIndVarRegisterHeur(
310 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
311 cl::desc("Count the induction variable only once when interleaving"));
312
313static cl::opt<bool> EnableCondStoresVectorization(
314 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
315 cl::desc("Enable if predication of stores during vectorization."));
316
317static cl::opt<unsigned> MaxNestedScalarReductionIC(
318 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
319 cl::desc("The maximum interleave count to use when interleaving a scalar "
320 "reduction in a nested loop."));
321
322static cl::opt<bool>
323 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
324 cl::Hidden,
325 cl::desc("Prefer in-loop vector reductions, "
326 "overriding the targets preference."));
327
328static cl::opt<bool> PreferPredicatedReductionSelect(
329 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
330 cl::desc(
331 "Prefer predicating a reduction operation over an after loop select."));
332
333cl::opt<bool> EnableVPlanNativePath(
334 "enable-vplan-native-path", cl::init(false), cl::Hidden,
335 cl::desc("Enable VPlan-native vectorization path with "
336 "support for outer loop vectorization."));
337
338// FIXME: Remove this switch once we have divergence analysis. Currently we
339// assume divergent non-backedge branches when this switch is true.
340cl::opt<bool> EnableVPlanPredication(
341 "enable-vplan-predication", cl::init(false), cl::Hidden,
342 cl::desc("Enable VPlan-native vectorization path predicator with "
343 "support for outer loop vectorization."));
344
345// This flag enables the stress testing of the VPlan H-CFG construction in the
346// VPlan-native vectorization path. It must be used in conjunction with
347// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
348// verification of the H-CFGs built.
349static cl::opt<bool> VPlanBuildStressTest(
350 "vplan-build-stress-test", cl::init(false), cl::Hidden,
351 cl::desc(
352 "Build VPlan for every supported loop nest in the function and bail "
353 "out right after the build (stress test the VPlan H-CFG construction "
354 "in the VPlan-native vectorization path)."));
355
356cl::opt<bool> llvm::EnableLoopInterleaving(
357 "interleave-loops", cl::init(true), cl::Hidden,
358 cl::desc("Enable loop interleaving in Loop vectorization passes"));
359cl::opt<bool> llvm::EnableLoopVectorization(
360 "vectorize-loops", cl::init(true), cl::Hidden,
361 cl::desc("Run the Loop vectorization passes"));
362
363/// A helper function that returns the type of loaded or stored value.
364static Type *getMemInstValueType(Value *I) {
365 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
366 "Expected Load or Store instruction");
367 if (auto *LI = dyn_cast<LoadInst>(I))
368 return LI->getType();
369 return cast<StoreInst>(I)->getValueOperand()->getType();
370}
371
372/// A helper function that returns true if the given type is irregular. The
373/// type is irregular if its allocated size doesn't equal the store size of an
374/// element of the corresponding vector type at the given vectorization factor.
375static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
376 // Determine if an array of VF elements of type Ty is "bitcast compatible"
377 // with a <VF x Ty> vector.
378 if (VF.isVector()) {
379 auto *VectorTy = VectorType::get(Ty, VF);
380 return TypeSize::get(VF.getKnownMinValue() *
381 DL.getTypeAllocSize(Ty).getFixedValue(),
382 VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
383 }
384
385 // If the vectorization factor is one, we just check if an array of type Ty
386 // requires padding between elements.
387 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
388}
389
390/// A helper function that returns the reciprocal of the block probability of
391/// predicated blocks. If we return X, we are assuming the predicated block
392/// will execute once for every X iterations of the loop header.
393///
394/// TODO: We should use actual block probability here, if available. Currently,
395/// we always assume predicated blocks have a 50% chance of executing.
396static unsigned getReciprocalPredBlockProb() { return 2; }
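// For example: with the 50% assumption above, a predicated block whose
// instructions cost 8 is charged roughly 8 / getReciprocalPredBlockProb() == 4
// units per iteration of the loop header.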
397
398/// A helper function that returns an integer or floating-point constant with
399/// value C.
400static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
401 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
402 : ConstantFP::get(Ty, C);
403}
404
405/// Returns "best known" trip count for the specified loop \p L as defined by
406/// the following procedure:
407/// 1) Returns exact trip count if it is known.
408/// 2) Returns expected trip count according to profile data if any.
409/// 3) Returns upper bound estimate if it is known.
410/// 4) Returns None if all of the above failed.
411static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
412 // Check if exact trip count is known.
413 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
414 return ExpectedTC;
415
416 // Check if there is an expected trip count available from profile data.
417 if (LoopVectorizeWithBlockFrequency)
418 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
419 return EstimatedTC;
420
421 // Check if upper bound estimate is known.
422 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
423 return ExpectedTC;
424
425 return None;
426}
427
428// Forward declare GeneratedRTChecks.
429class GeneratedRTChecks;
430
431namespace llvm {
432
433/// InnerLoopVectorizer vectorizes loops which contain only one basic
434/// block to a specified vectorization factor (VF).
435/// This class performs the widening of scalars into vectors, or multiple
436/// scalars. This class also implements the following features:
437/// * It inserts an epilogue loop for handling loops that don't have iteration
438/// counts that are known to be a multiple of the vectorization factor.
439/// * It handles the code generation for reduction variables.
440/// * Scalarization (implementation using scalars) of un-vectorizable
441/// instructions.
442/// InnerLoopVectorizer does not perform any vectorization-legality
443/// checks, and relies on the caller to check for the different legality
444/// aspects. The InnerLoopVectorizer relies on the
445/// LoopVectorizationLegality class to provide information about the induction
446/// and reduction variables that were found to a given vectorization factor.
447class InnerLoopVectorizer {
448public:
449 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450 LoopInfo *LI, DominatorTree *DT,
451 const TargetLibraryInfo *TLI,
452 const TargetTransformInfo *TTI, AssumptionCache *AC,
453 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460 PSI(PSI), RTChecks(RTChecks) {
461 // Query this against the original loop and save it here because the profile
462 // of the original loop header may change as the transformation happens.
463 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465 }
466
467 virtual ~InnerLoopVectorizer() = default;
468
469 /// Create a new empty loop that will contain vectorized instructions later
470 /// on, while the old loop will be used as the scalar remainder. Control flow
471 /// is generated around the vectorized (and scalar epilogue) loops consisting
472 /// of various checks and bypasses. Return the pre-header block of the new
473 /// loop.
474 /// In the case of epilogue vectorization, this function is overridden to
475 /// handle the more complex control flow around the loops.
476 virtual BasicBlock *createVectorizedLoopSkeleton();
477
478 /// Widen a single instruction within the innermost loop.
479 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
480 VPTransformState &State);
481
482 /// Widen a single call instruction within the innermost loop.
483 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
484 VPTransformState &State);
485
486 /// Widen a single select instruction within the innermost loop.
487 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
488 bool InvariantCond, VPTransformState &State);
489
490 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
491 void fixVectorizedLoop(VPTransformState &State);
492
493 // Return true if any runtime check is added.
494 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
495
496 /// A type for vectorized values in the new loop. Each value from the
497 /// original loop, when vectorized, is represented by UF vector values in the
498 /// new unrolled loop, where UF is the unroll factor.
499 using VectorParts = SmallVector<Value *, 2>;
500
501 /// Vectorize a single GetElementPtrInst based on information gathered and
502 /// decisions taken during planning.
503 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
504 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
505 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
506
507 /// Vectorize a single PHINode in a block. This method handles the induction
508 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
509 /// arbitrary length vectors.
510 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
511 VPValue *StartV, VPValue *Def,
512 VPTransformState &State);
513
514 /// A helper function to scalarize a single Instruction in the innermost loop.
515 /// Generates a sequence of scalar instances for each lane between \p MinLane
516 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
517 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
518 /// Instr's operands.
519 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
520 const VPIteration &Instance, bool IfPredicateInstr,
521 VPTransformState &State);
522
523 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
524 /// is provided, the integer induction variable will first be truncated to
525 /// the corresponding type.
526 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
527 VPValue *Def, VPValue *CastDef,
528 VPTransformState &State);
529
530 /// Construct the vector value of a scalarized value \p V one lane at a time.
531 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
532 VPTransformState &State);
533
534 /// Try to vectorize interleaved access group \p Group with the base address
535 /// given in \p Addr, optionally masking the vector operations if \p
536 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
537 /// values in the vectorized loop.
538 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
539 ArrayRef<VPValue *> VPDefs,
540 VPTransformState &State, VPValue *Addr,
541 ArrayRef<VPValue *> StoredValues,
542 VPValue *BlockInMask = nullptr);
543
544 /// Vectorize Load and Store instructions with the base address given in \p
545 /// Addr, optionally masking the vector operations if \p BlockInMask is
546 /// non-null. Use \p State to translate given VPValues to IR values in the
547 /// vectorized loop.
548 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
549 VPValue *Def, VPValue *Addr,
550 VPValue *StoredValue, VPValue *BlockInMask);
551
552 /// Set the debug location in the builder using the debug location in
553 /// the instruction.
554 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
555
556 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
557 void fixNonInductionPHIs(VPTransformState &State);
558
559 /// Create a broadcast instruction. This method generates a broadcast
560 /// instruction (shuffle) for loop invariant values and for the induction
561 /// value. If this is the induction variable then we extend it to N, N+1, ...
562 /// this is needed because each iteration in the loop corresponds to a SIMD
563 /// element.
564 virtual Value *getBroadcastInstrs(Value *V);
565
566protected:
567 friend class LoopVectorizationPlanner;
568
569 /// A small list of PHINodes.
570 using PhiVector = SmallVector<PHINode *, 4>;
571
572 /// A type for scalarized values in the new loop. Each value from the
573 /// original loop, when scalarized, is represented by UF x VF scalar values
574 /// in the new unrolled loop, where UF is the unroll factor and VF is the
575 /// vectorization factor.
576 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
577
578 /// Set up the values of the IVs correctly when exiting the vector loop.
579 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
580 Value *CountRoundDown, Value *EndValue,
581 BasicBlock *MiddleBlock);
582
583 /// Create a new induction variable inside L.
584 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
585 Value *Step, Instruction *DL);
586
587 /// Handle all cross-iteration phis in the header.
588 void fixCrossIterationPHIs(VPTransformState &State);
589
590 /// Fix a first-order recurrence. This is the second phase of vectorizing
591 /// this phi node.
592 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
593
594 /// Fix a reduction cross-iteration phi. This is the second phase of
595 /// vectorizing this phi node.
596 void fixReduction(PHINode *Phi, VPTransformState &State);
597
598 /// Clear NSW/NUW flags from reduction instructions if necessary.
599 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
600 VPTransformState &State);
601
602 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
603 /// means we need to add the appropriate incoming value from the middle
604 /// block as exiting edges from the scalar epilogue loop (if present) are
605 /// already in place, and we exit the vector loop exclusively to the middle
606 /// block.
607 void fixLCSSAPHIs(VPTransformState &State);
608
609 /// Iteratively sink the scalarized operands of a predicated instruction into
610 /// the block that was created for it.
611 void sinkScalarOperands(Instruction *PredInst);
612
613 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
614 /// represented as.
615 void truncateToMinimalBitwidths(VPTransformState &State);
616
617 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
618 /// to each vector element of Val. The sequence starts at StartIndex.
619 /// \p Opcode is relevant for FP induction variable.
620 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
621 Instruction::BinaryOps Opcode =
622 Instruction::BinaryOpsEnd);
623
624 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
625 /// variable on which to base the steps, \p Step is the size of the step, and
626 /// \p EntryVal is the value from the original loop that maps to the steps.
627 /// Note that \p EntryVal doesn't have to be an induction variable - it
628 /// can also be a truncate instruction.
629 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
630 const InductionDescriptor &ID, VPValue *Def,
631 VPValue *CastDef, VPTransformState &State);
632
633 /// Create a vector induction phi node based on an existing scalar one. \p
634 /// EntryVal is the value from the original loop that maps to the vector phi
635 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
636 /// truncate instruction, instead of widening the original IV, we widen a
637 /// version of the IV truncated to \p EntryVal's type.
638 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
639 Value *Step, Value *Start,
640 Instruction *EntryVal, VPValue *Def,
641 VPValue *CastDef,
642 VPTransformState &State);
643
644 /// Returns true if an instruction \p I should be scalarized instead of
645 /// vectorized for the chosen vectorization factor.
646 bool shouldScalarizeInstruction(Instruction *I) const;
647
648 /// Returns true if we should generate a scalar version of \p IV.
649 bool needsScalarInduction(Instruction *IV) const;
650
651 /// If there is a cast involved in the induction variable \p ID, which should
652 /// be ignored in the vectorized loop body, this function records the
653 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
654 /// cast. We had already proved that the casted Phi is equal to the uncasted
655 /// Phi in the vectorized loop (under a runtime guard), and therefore
656 /// there is no need to vectorize the cast - the same value can be used in the
657 /// vector loop for both the Phi and the cast.
658 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
659 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
660 ///
661 /// \p EntryVal is the value from the original loop that maps to the vector
662 /// phi node and is used to distinguish what is the IV currently being
663 /// processed - original one (if \p EntryVal is a phi corresponding to the
664 /// original IV) or the "newly-created" one based on the proof mentioned above
665 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
666 /// latter case \p EntryVal is a TruncInst and we must not record anything for
667 /// that IV, but it's error-prone to expect callers of this routine to care
668 /// about that, hence this explicit parameter.
669 void recordVectorLoopValueForInductionCast(
670 const InductionDescriptor &ID, const Instruction *EntryVal,
671 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
672 unsigned Part, unsigned Lane = UINT_MAX);
673
674 /// Generate a shuffle sequence that will reverse the vector Vec.
675 virtual Value *reverseVector(Value *Vec);
676
677 /// Returns (and creates if needed) the original loop trip count.
678 Value *getOrCreateTripCount(Loop *NewLoop);
679
680 /// Returns (and creates if needed) the trip count of the widened loop.
681 Value *getOrCreateVectorTripCount(Loop *NewLoop);
682
683 /// Returns a bitcasted value to the requested vector type.
684 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
685 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
686 const DataLayout &DL);
687
688 /// Emit a bypass check to see if the vector trip count is zero, including if
689 /// it overflows.
690 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
691
692 /// Emit a bypass check to see if all of the SCEV assumptions we've
693 /// had to make are correct. Returns the block containing the checks or
694 /// nullptr if no checks have been added.
695 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
696
697 /// Emit bypass checks to check any memory assumptions we may have made.
698 /// Returns the block containing the checks or nullptr if no checks have been
699 /// added.
700 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
701
702 /// Compute the transformed value of Index at offset StartValue using step
703 /// StepValue.
704 /// For integer induction, returns StartValue + Index * StepValue.
705 /// For pointer induction, returns StartValue[Index * StepValue].
706 /// FIXME: The newly created binary instructions should contain nsw/nuw
707 /// flags, which can be found from the original scalar operations.
708 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
709 const DataLayout &DL,
710 const InductionDescriptor &ID) const;
711
712 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
713 /// vector loop preheader, middle block and scalar preheader. Also
714 /// allocate a loop object for the new vector loop and return it.
715 Loop *createVectorLoopSkeleton(StringRef Prefix);
716
717 /// Create new phi nodes for the induction variables to resume iteration count
718 /// in the scalar epilogue, from where the vectorized loop left off (given by
719 /// \p VectorTripCount).
720 /// In cases where the loop skeleton is more complicated (eg. epilogue
721 /// vectorization) and the resume values can come from an additional bypass
722 /// block, the \p AdditionalBypass pair provides information about the bypass
723 /// block and the end value on the edge from bypass to this loop.
724 void createInductionResumeValues(
725 Loop *L, Value *VectorTripCount,
726 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
727
728 /// Complete the loop skeleton by adding debug MDs, creating appropriate
729 /// conditional branches in the middle block, preparing the builder and
730 /// running the verifier. Take in the vector loop \p L as argument, and return
731 /// the preheader of the completed vector loop.
732 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
733
734 /// Add additional metadata to \p To that was not present on \p Orig.
735 ///
736 /// Currently this is used to add the noalias annotations based on the
737 /// inserted memchecks. Use this for instructions that are *cloned* into the
738 /// vector loop.
739 void addNewMetadata(Instruction *To, const Instruction *Orig);
740
741 /// Add metadata from one instruction to another.
742 ///
743 /// This includes both the original MDs from \p From and additional ones (\see
744 /// addNewMetadata). Use this for *newly created* instructions in the vector
745 /// loop.
746 void addMetadata(Instruction *To, Instruction *From);
747
748 /// Similar to the previous function but it adds the metadata to a
749 /// vector of instructions.
750 void addMetadata(ArrayRef<Value *> To, Instruction *From);
751
752 /// Allow subclasses to override and print debug traces before/after vplan
753 /// execution, when trace information is requested.
754 virtual void printDebugTracesAtStart(){};
755 virtual void printDebugTracesAtEnd(){};
756
757 /// The original loop.
758 Loop *OrigLoop;
759
760 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
761 /// dynamic knowledge to simplify SCEV expressions and converts them to a
762 /// more usable form.
763 PredicatedScalarEvolution &PSE;
764
765 /// Loop Info.
766 LoopInfo *LI;
767
768 /// Dominator Tree.
769 DominatorTree *DT;
770
771 /// Alias Analysis.
772 AAResults *AA;
773
774 /// Target Library Info.
775 const TargetLibraryInfo *TLI;
776
777 /// Target Transform Info.
778 const TargetTransformInfo *TTI;
779
780 /// Assumption Cache.
781 AssumptionCache *AC;
782
783 /// Interface to emit optimization remarks.
784 OptimizationRemarkEmitter *ORE;
785
786 /// LoopVersioning. It's only set up (non-null) if memchecks were
787 /// used.
788 ///
789 /// This is currently only used to add no-alias metadata based on the
790 /// memchecks. The actual versioning is performed manually.
791 std::unique_ptr<LoopVersioning> LVer;
792
793 /// The vectorization SIMD factor to use. Each vector will have this many
794 /// vector elements.
795 ElementCount VF;
796
797 /// The vectorization unroll factor to use. Each scalar is vectorized to this
798 /// many different vector instructions.
799 unsigned UF;
800
801 /// The builder that we use
802 IRBuilder<> Builder;
803
804 // --- Vectorization state ---
805
806 /// The vector-loop preheader.
807 BasicBlock *LoopVectorPreHeader;
808
809 /// The scalar-loop preheader.
810 BasicBlock *LoopScalarPreHeader;
811
812 /// Middle Block between the vector and the scalar.
813 BasicBlock *LoopMiddleBlock;
814
815 /// The (unique) ExitBlock of the scalar loop. Note that
816 /// there can be multiple exiting edges reaching this block.
817 BasicBlock *LoopExitBlock;
818
819 /// The vector loop body.
820 BasicBlock *LoopVectorBody;
821
822 /// The scalar loop body.
823 BasicBlock *LoopScalarBody;
824
825 /// A list of all bypass blocks. The first block is the entry of the loop.
826 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
827
828 /// The new Induction variable which was added to the new block.
829 PHINode *Induction = nullptr;
830
831 /// The induction variable of the old basic block.
832 PHINode *OldInduction = nullptr;
833
834 /// Store instructions that were predicated.
835 SmallVector<Instruction *, 4> PredicatedInstructions;
836
837 /// Trip count of the original loop.
838 Value *TripCount = nullptr;
839
840 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
841 Value *VectorTripCount = nullptr;
842
843 /// The legality analysis.
844 LoopVectorizationLegality *Legal;
845
846 /// The profitability analysis.
847 LoopVectorizationCostModel *Cost;
848
849 // Record whether runtime checks are added.
850 bool AddedSafetyChecks = false;
851
852 // Holds the end values for each induction variable. We save the end values
853 // so we can later fix-up the external users of the induction variables.
854 DenseMap<PHINode *, Value *> IVEndValues;
855
856 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
857 // fixed up at the end of vector code generation.
858 SmallVector<PHINode *, 8> OrigPHIsToFix;
859
860 /// BFI and PSI are used to check for profile guided size optimizations.
861 BlockFrequencyInfo *BFI;
862 ProfileSummaryInfo *PSI;
863
864 // Whether this loop should be optimized for size based on profile guided size
865 // optimizations.
866 bool OptForSizeBasedOnProfile;
867
868 /// Structure to hold information about generated runtime checks, responsible
869 /// for cleaning the checks, if vectorization turns out unprofitable.
870 GeneratedRTChecks &RTChecks;
871};
872
873class InnerLoopUnroller : public InnerLoopVectorizer {
874public:
875 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
876 LoopInfo *LI, DominatorTree *DT,
877 const TargetLibraryInfo *TLI,
878 const TargetTransformInfo *TTI, AssumptionCache *AC,
879 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
880 LoopVectorizationLegality *LVL,
881 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
882 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
883 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
884 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
885 BFI, PSI, Check) {}
886
887private:
888 Value *getBroadcastInstrs(Value *V) override;
889 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
890 Instruction::BinaryOps Opcode =
891 Instruction::BinaryOpsEnd) override;
892 Value *reverseVector(Value *Vec) override;
893};
894
895/// Encapsulate information regarding vectorization of a loop and its epilogue.
896/// This information is meant to be updated and used across two stages of
897/// epilogue vectorization.
898struct EpilogueLoopVectorizationInfo {
899 ElementCount MainLoopVF = ElementCount::getFixed(0);
900 unsigned MainLoopUF = 0;
901 ElementCount EpilogueVF = ElementCount::getFixed(0);
902 unsigned EpilogueUF = 0;
903 BasicBlock *MainLoopIterationCountCheck = nullptr;
904 BasicBlock *EpilogueIterationCountCheck = nullptr;
905 BasicBlock *SCEVSafetyCheck = nullptr;
906 BasicBlock *MemSafetyCheck = nullptr;
907 Value *TripCount = nullptr;
908 Value *VectorTripCount = nullptr;
909
910 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
911 unsigned EUF)
912 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
913 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
914 assert(EUF == 1 &&
915 "A high UF for the epilogue loop is likely not beneficial.");
917};
918
919/// An extension of the inner loop vectorizer that creates a skeleton for a
920/// vectorized loop that has its epilogue (residual) also vectorized.
921/// The idea is to run the vplan on a given loop twice, firstly to set up the
922/// skeleton and vectorize the main loop, and secondly to complete the skeleton
923/// from the first step and vectorize the epilogue. This is achieved by
924/// deriving two concrete strategy classes from this base class and invoking
925/// them in succession from the loop vectorizer planner.
926class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
927public:
928 InnerLoopAndEpilogueVectorizer(
929 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
930 DominatorTree *DT, const TargetLibraryInfo *TLI,
931 const TargetTransformInfo *TTI, AssumptionCache *AC,
932 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
933 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
934 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
935 GeneratedRTChecks &Checks)
936 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
937 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
938 Checks),
939 EPI(EPI) {}
940
941 // Override this function to handle the more complex control flow around the
942 // three loops.
943 BasicBlock *createVectorizedLoopSkeleton() final override {
944 return createEpilogueVectorizedLoopSkeleton();
945 }
946
947 /// The interface for creating a vectorized skeleton using one of two
948 /// different strategies, each corresponding to one execution of the vplan
949 /// as described above.
950 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
951
952 /// Holds and updates state information required to vectorize the main loop
953 /// and its epilogue in two separate passes. This setup helps us avoid
954 /// regenerating and recomputing runtime safety checks. It also helps us to
955 /// shorten the iteration-count-check path length for the cases where the
956 /// iteration count of the loop is so small that the main vector loop is
957 /// completely skipped.
958 EpilogueLoopVectorizationInfo &EPI;
959};
960
961/// A specialized derived class of inner loop vectorizer that performs
962/// vectorization of *main* loops in the process of vectorizing loops and their
963/// epilogues.
964class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
965public:
966 EpilogueVectorizerMainLoop(
967 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
968 DominatorTree *DT, const TargetLibraryInfo *TLI,
969 const TargetTransformInfo *TTI, AssumptionCache *AC,
970 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
971 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
972 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
973 GeneratedRTChecks &Check)
974 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
975 EPI, LVL, CM, BFI, PSI, Check) {}
976 /// Implements the interface for creating a vectorized skeleton using the
977 /// *main loop* strategy (ie the first pass of vplan execution).
978 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
979
980protected:
981 /// Emits an iteration count bypass check once for the main loop (when \p
982 /// ForEpilogue is false) and once for the epilogue loop (when \p
983 /// ForEpilogue is true).
984 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
985 bool ForEpilogue);
986 void printDebugTracesAtStart() override;
987 void printDebugTracesAtEnd() override;
988};
989
990// A specialized derived class of inner loop vectorizer that performs
991// vectorization of *epilogue* loops in the process of vectorizing loops and
992// their epilogues.
993class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
994public:
995 EpilogueVectorizerEpilogueLoop(
996 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
997 DominatorTree *DT, const TargetLibraryInfo *TLI,
998 const TargetTransformInfo *TTI, AssumptionCache *AC,
999 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1000 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1001 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1002 GeneratedRTChecks &Checks)
1003 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1004 EPI, LVL, CM, BFI, PSI, Checks) {}
1005 /// Implements the interface for creating a vectorized skeleton using the
1006 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1007 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1008
1009protected:
1010 /// Emits an iteration count bypass check after the main vector loop has
1011 /// finished to see if there are any iterations left to execute by either
1012 /// the vector epilogue or the scalar epilogue.
1013 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1014 BasicBlock *Bypass,
1015 BasicBlock *Insert);
1016 void printDebugTracesAtStart() override;
1017 void printDebugTracesAtEnd() override;
1018};
1019} // end namespace llvm
1020
1021/// Look for a meaningful debug location on the instruction or its
1022/// operands.
1023static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1024 if (!I)
1025 return I;
1026
1027 DebugLoc Empty;
1028 if (I->getDebugLoc() != Empty)
1029 return I;
1030
1031 for (Use &Op : I->operands()) {
1032 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1033 if (OpInst->getDebugLoc() != Empty)
1034 return OpInst;
1035 }
1036
1037 return I;
1038}
1039
1040void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1041 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1042 const DILocation *DIL = Inst->getDebugLoc();
1043 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1044 !isa<DbgInfoIntrinsic>(Inst)) {
1045 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1046 auto NewDIL =
1047 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1048 if (NewDIL)
1049 B.SetCurrentDebugLocation(NewDIL.getValue());
1050 else
1051 LLVM_DEBUG(dbgs()
1052 << "Failed to create new discriminator: "
1053 << DIL->getFilename() << " Line: " << DIL->getLine());
1054 }
1055 else
1056 B.SetCurrentDebugLocation(DIL);
1057 } else
1058 B.SetCurrentDebugLocation(DebugLoc());
1059}
1060
1061/// Write a record \p DebugMsg about vectorization failure to the debug
1062/// output stream. If \p I is passed, it is an instruction that prevents
1063/// vectorization.
1064#ifndef NDEBUG
1065static void debugVectorizationFailure(const StringRef DebugMsg,
1066 Instruction *I) {
1067 dbgs() << "LV: Not vectorizing: " << DebugMsg;
1068 if (I != nullptr)
1069 dbgs() << " " << *I;
1070 else
1071 dbgs() << '.';
1072 dbgs() << '\n';
1073}
1074#endif
1075
1076/// Create an analysis remark that explains why vectorization failed
1077///
1078/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1079/// RemarkName is the identifier for the remark. If \p I is passed it is an
1080/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1081/// the location of the remark. \return the remark object that can be
1082/// streamed to.
1083static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1084 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1085 Value *CodeRegion = TheLoop->getHeader();
1086 DebugLoc DL = TheLoop->getStartLoc();
1087
1088 if (I) {
1089 CodeRegion = I->getParent();
1090 // If there is no debug location attached to the instruction, revert back to
1091 // using the loop's.
1092 if (I->getDebugLoc())
1093 DL = I->getDebugLoc();
1094 }
1095
1096 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1097 R << "loop not vectorized: ";
1098 return R;
1099}
1100
1101/// Return a value for Step multiplied by VF.
1102static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1103 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1104 Constant *StepVal = ConstantInt::get(
1105 Step->getType(),
1106 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1107 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1108}
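// A minimal standalone sketch (assumed values, not part of LoopVectorize.cpp)
// of the scaling done by createStepForVF: the step is multiplied by the
// known-minimum lane count; for scalable VFs the emitted value is additionally
// multiplied by vscale at runtime.
#include <cstdint>
#include <cstdio>

static int64_t stepForFixedVF(int64_t Step, unsigned KnownMinLanes) {
  return Step * static_cast<int64_t>(KnownMinLanes);
}

int main() {
  // A unit-stride loop vectorized with VF = 4 advances its counter by 4.
  std::printf("%lld\n", static_cast<long long>(stepForFixedVF(1, 4))); // 4
  // A stride of 2 with VF = 8 advances by 16 per vector iteration.
  std::printf("%lld\n", static_cast<long long>(stepForFixedVF(2, 8))); // 16
  return 0;
}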
1109
1110namespace llvm {
1111
1112/// Return the runtime value for VF.
1113Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1114 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1115 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1116}
1117
1118void reportVectorizationFailure(const StringRef DebugMsg,
1119 const StringRef OREMsg, const StringRef ORETag,
1120 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1121 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1122 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1123 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1124 ORETag, TheLoop, I) << OREMsg);
1125}
1126
1127} // end namespace llvm
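// A hedged usage sketch (the message strings and remark tag are placeholders,
// not taken from this file): a legality or cost-model bailout would typically
// call
//
//   reportVectorizationFailure("Cannot compute trip count",
//                              "could not determine number of loop iterations",
//                              "CantComputeNumberOfIterations", ORE, TheLoop,
//                              /*I=*/nullptr);
//
// which prints the LLVM_DEBUG trace via debugVectorizationFailure and emits
// the OptimizationRemarkAnalysis built by createLVAnalysis.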
1128
1129#ifndef NDEBUG
1130/// \return string containing a file name and a line # for the given loop.
1131static std::string getDebugLocString(const Loop *L) {
1132 std::string Result;
1133 if (L) {
1134 raw_string_ostream OS(Result);
1135 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1136 LoopDbgLoc.print(OS);
1137 else
1138 // Just print the module name.
1139 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1140 OS.flush();
1141 }
1142 return Result;
1143}
1144#endif
1145
1146void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1147 const Instruction *Orig) {
1148 // If the loop was versioned with memchecks, add the corresponding no-alias
1149 // metadata.
1150 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1151 LVer->annotateInstWithNoAlias(To, Orig);
1152}
1153
1154void InnerLoopVectorizer::addMetadata(Instruction *To,
1155 Instruction *From) {
1156 propagateMetadata(To, From);
1157 addNewMetadata(To, From);
1158}
1159
1160void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1161 Instruction *From) {
1162 for (Value *V : To) {
1163 if (Instruction *I = dyn_cast<Instruction>(V))
1164 addMetadata(I, From);
1165 }
1166}
1167
1168namespace llvm {
1169
1170// Loop vectorization cost-model hints how the scalar epilogue loop should be
1171// lowered.
1172enum ScalarEpilogueLowering {
1173
1174 // The default: allowing scalar epilogues.
1175 CM_ScalarEpilogueAllowed,
1176
1177 // Vectorization with OptForSize: don't allow epilogues.
1178 CM_ScalarEpilogueNotAllowedOptSize,
1179
1180 // A special case of vectorisation with OptForSize: loops with a very small
1181 // trip count are considered for vectorization under OptForSize, thereby
1182 // making sure the cost of their loop body is dominant, free of runtime
1183 // guards and scalar iteration overheads.
1184 CM_ScalarEpilogueNotAllowedLowTripLoop,
1185
1186 // Loop hint predicate indicating an epilogue is undesired.
1187 CM_ScalarEpilogueNotNeededUsePredicate,
1188
1189 // Directive indicating we must either tail fold or not vectorize
1190 CM_ScalarEpilogueNotAllowedUsePredicate
1191};
1192
1193/// LoopVectorizationCostModel - estimates the expected speedups due to
1194/// vectorization.
1195/// In many cases vectorization is not profitable. This can happen because of
1196/// a number of reasons. In this class we mainly attempt to predict the
1197/// expected speedup/slowdowns due to the supported instruction set. We use the
1198/// TargetTransformInfo to query the different backends for the cost of
1199/// different operations.
1200class LoopVectorizationCostModel {
1201public:
1202 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1203 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1204 LoopVectorizationLegality *Legal,
1205 const TargetTransformInfo &TTI,
1206 const TargetLibraryInfo *TLI, DemandedBits *DB,
1207 AssumptionCache *AC,
1208 OptimizationRemarkEmitter *ORE, const Function *F,
1209 const LoopVectorizeHints *Hints,
1210 InterleavedAccessInfo &IAI)
1211 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1212 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1213 Hints(Hints), InterleaveInfo(IAI) {}
1214
1215 /// \return An upper bound for the vectorization factor, or None if
1216 /// vectorization and interleaving should be avoided up front.
1217 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1218
1219 /// \return True if runtime checks are required for vectorization, and false
1220 /// otherwise.
1221 bool runtimeChecksRequired();
1222
1223 /// \return The most profitable vectorization factor and the cost of that VF.
1224 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1225 /// then this vectorization factor will be selected if vectorization is
1226 /// possible.
1227 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1228 VectorizationFactor
1229 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1230 const LoopVectorizationPlanner &LVP);
1231
1232 /// Setup cost-based decisions for user vectorization factor.
1233 void selectUserVectorizationFactor(ElementCount UserVF) {
1234 collectUniformsAndScalars(UserVF);
1235 collectInstsToScalarize(UserVF);
1236 }
1237
1238 /// \return The size (in bits) of the smallest and widest types in the code
1239 /// that needs to be vectorized. We ignore values that remain scalar such as
1240 /// 64 bit loop indices.
1241 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1242
1243 /// \return The desired interleave count.
1244 /// If interleave count has been specified by metadata it will be returned.
1245 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1246 /// are the selected vectorization factor and the cost of the selected VF.
1247 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1248
1249 /// Memory access instruction may be vectorized in more than one way.
1250 /// Form of instruction after vectorization depends on cost.
1251 /// This function takes cost-based decisions for Load/Store instructions
1252 /// and collects them in a map. This decisions map is used for building
1253 /// the lists of loop-uniform and loop-scalar instructions.
1254 /// The calculated cost is saved with widening decision in order to
1255 /// avoid redundant calculations.
1256 void setCostBasedWideningDecision(ElementCount VF);
1257
1258 /// A struct that represents some properties of the register usage
1259 /// of a loop.
1260 struct RegisterUsage {
1261 /// Holds the number of loop invariant values that are used in the loop.
1262 /// The key is ClassID of target-provided register class.
1263 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1264 /// Holds the maximum number of concurrent live intervals in the loop.
1265 /// The key is ClassID of target-provided register class.
1266 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1267 };
1268
1269 /// \return Returns information about the register usages of the loop for the
1270 /// given vectorization factors.
1271 SmallVector<RegisterUsage, 8>
1272 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1273
1274 /// Collect values we want to ignore in the cost model.
1275 void collectValuesToIgnore();
1276
1277 /// Split reductions into those that happen in the loop, and those that happen
1278 /// outside. In-loop reductions are collected into InLoopReductionChains.
1279 void collectInLoopReductions();
1280
1281 /// \returns The smallest bitwidth each instruction can be represented with.
1282 /// The vector equivalents of these instructions should be truncated to this
1283 /// type.
1284 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1285 return MinBWs;
1286 }
1287
1288 /// \returns True if it is more profitable to scalarize instruction \p I for
1289 /// vectorization factor \p VF.
1290 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1291 assert(VF.isVector() &&
1292 "Profitable to scalarize relevant only for VF > 1.");
1293
1294 // Cost model is not run in the VPlan-native path - return conservative
1295 // result until this changes.
1296 if (EnableVPlanNativePath)
1297 return false;
1298
1299 auto Scalars = InstsToScalarize.find(VF);
1300 assert(Scalars != InstsToScalarize.end() &&
1301 "VF not yet analyzed for scalarization profitability");
1302 return Scalars->second.find(I) != Scalars->second.end();
1303 }
1304
1305 /// Returns true if \p I is known to be uniform after vectorization.
1306 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1307 if (VF.isScalar())
1308 return true;
1309
1310 // Cost model is not run in the VPlan-native path - return conservative
1311 // result until this changes.
1312 if (EnableVPlanNativePath)
1313 return false;
1314
1315 auto UniformsPerVF = Uniforms.find(VF);
1316 assert(UniformsPerVF != Uniforms.end() &&
1317 "VF not yet analyzed for uniformity");
1318 return UniformsPerVF->second.count(I);
1319 }
1320
1321 /// Returns true if \p I is known to be scalar after vectorization.
1322 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1323 if (VF.isScalar())
1324 return true;
1325
1326 // Cost model is not run in the VPlan-native path - return conservative
1327 // result until this changes.
1328 if (EnableVPlanNativePath)
1329 return false;
1330
1331 auto ScalarsPerVF = Scalars.find(VF);
1332 assert(ScalarsPerVF != Scalars.end() &&
1333 "Scalar values are not calculated for VF");
1334 return ScalarsPerVF->second.count(I);
1335 }
1336
1337 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1338 /// for vectorization factor \p VF.
1339 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1340 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1341 !isProfitableToScalarize(I, VF) &&
1342 !isScalarAfterVectorization(I, VF);
1343 }
1344
1345 /// Decision that was taken during cost calculation for memory instruction.
1346 enum InstWidening {
1347 CM_Unknown,
1348 CM_Widen, // For consecutive accesses with stride +1.
1349 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1350 CM_Interleave,
1351 CM_GatherScatter,
1352 CM_Scalarize
1353 };
1354
1355 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1356 /// instruction \p I and vector width \p VF.
1357 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1358 InstructionCost Cost) {
1359 assert(VF.isVector() && "Expected VF >=2");
1360 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1361 }
1362
1363 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1364 /// interleaving group \p Grp and vector width \p VF.
1365 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1366 ElementCount VF, InstWidening W,
1367 InstructionCost Cost) {
1368 assert(VF.isVector() && "Expected VF >=2");
1369 /// Broadcast this decision to all instructions inside the group.
1370 /// But the cost will be assigned to one instruction only.
1371 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1372 if (auto *I = Grp->getMember(i)) {
1373 if (Grp->getInsertPos() == I)
1374 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1375 else
1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1377 }
1378 }
1379 }
1380
1381 /// Return the cost model decision for the given instruction \p I and vector
1382 /// width \p VF. Return CM_Unknown if this instruction did not pass
1383 /// through the cost modeling.
1384 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1385 assert(VF.isVector() && "Expected VF to be a vector VF");
1386 // Cost model is not run in the VPlan-native path - return conservative
1387 // result until this changes.
1388 if (EnableVPlanNativePath)
1389 return CM_GatherScatter;
1390
1391 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1392 auto Itr = WideningDecisions.find(InstOnVF);
1393 if (Itr == WideningDecisions.end())
1394 return CM_Unknown;
1395 return Itr->second.first;
1396 }
1397
1398 /// Return the vectorization cost for the given instruction \p I and vector
1399 /// width \p VF.
1400 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1401 assert(VF.isVector() && "Expected VF >=2");
1402 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1403 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1404 "The cost is not calculated");
1405 return WideningDecisions[InstOnVF].second;
1406 }
1407
1408 /// Return True if instruction \p I is an optimizable truncate whose operand
1409 /// is an induction variable. Such a truncate will be removed by adding a new
1410 /// induction variable with the destination type.
1411 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1412 // If the instruction is not a truncate, return false.
1413 auto *Trunc = dyn_cast<TruncInst>(I);
1414 if (!Trunc)
1415 return false;
1416
1417 // Get the source and destination types of the truncate.
1418 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1419 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1420
1421 // If the truncate is free for the given types, return false. Replacing a
1422 // free truncate with an induction variable would add an induction variable
1423 // update instruction to each iteration of the loop. We exclude from this
1424 // check the primary induction variable since it will need an update
1425 // instruction regardless.
1426 Value *Op = Trunc->getOperand(0);
1427 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1428 return false;
1429
1430 // If the truncated value is not an induction variable, return false.
1431 return Legal->isInductionPhi(Op);
1432 }
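  // A concrete illustration (hypothetical source, not from this file): in
  //
  //   for (int64_t i = 0; i != n; ++i)
  //     a[i] = (int32_t)i;
  //
  // the truncate of the 64-bit induction to 32 bits feeds off an induction
  // phi, so when it is not free for the chosen vector types it can be removed
  // by introducing a second, 32-bit induction variable instead.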
1433
1434 /// Collects the instructions to scalarize for each predicated instruction in
1435 /// the loop.
1436 void collectInstsToScalarize(ElementCount VF);
1437
1438 /// Collect Uniform and Scalar values for the given \p VF.
1439 /// The sets depend on CM decision for Load/Store instructions
1440 /// that may be vectorized as interleave, gather-scatter or scalarized.
1441 void collectUniformsAndScalars(ElementCount VF) {
1442 // Do the analysis once.
1443 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1444 return;
1445 setCostBasedWideningDecision(VF);
1446 collectLoopUniforms(VF);
1447 collectLoopScalars(VF);
1448 }
1449
1450 /// Returns true if the target machine supports masked store operation
1451 /// for the given \p DataType and kind of access to \p Ptr.
1452 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1453 return Legal->isConsecutivePtr(Ptr) &&
1454 TTI.isLegalMaskedStore(DataType, Alignment);
1455 }
1456
1457 /// Returns true if the target machine supports masked load operation
1458 /// for the given \p DataType and kind of access to \p Ptr.
1459 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1460 return Legal->isConsecutivePtr(Ptr) &&
1461 TTI.isLegalMaskedLoad(DataType, Alignment);
1462 }
1463
1464 /// Returns true if the target machine supports masked scatter operation
1465 /// for the given \p DataType.
1466 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1467 return TTI.isLegalMaskedScatter(DataType, Alignment);
1468 }
1469
1470 /// Returns true if the target machine supports masked gather operation
1471 /// for the given \p DataType.
1472 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1473 return TTI.isLegalMaskedGather(DataType, Alignment);
1474 }
1475
1476 /// Returns true if the target machine can represent \p V as a masked gather
1477 /// or scatter operation.
1478 bool isLegalGatherOrScatter(Value *V) {
1479 bool LI = isa<LoadInst>(V);
1480 bool SI = isa<StoreInst>(V);
1481 if (!LI && !SI)
1482 return false;
1483 auto *Ty = getMemInstValueType(V);
1484 Align Align = getLoadStoreAlignment(V);
1485 return (LI && isLegalMaskedGather(Ty, Align)) ||
1486 (SI && isLegalMaskedScatter(Ty, Align));
1487 }
1488
1489 /// Returns true if the target machine supports all of the reduction
1490 /// variables found for the given VF.
1491 bool canVectorizeReductions(ElementCount VF) {
1492 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1493 RecurrenceDescriptor RdxDesc = Reduction.second;
1494 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1495 }));
1496 }
1497
1498 /// Returns true if \p I is an instruction that will be scalarized with
1499 /// predication. Such instructions include conditional stores and
1500 /// instructions that may divide by zero.
1501 /// If a non-zero VF has been calculated, we check if I will be scalarized
1502 /// with predication for that VF.
1503 bool isScalarWithPredication(Instruction *I,
1504 ElementCount VF = ElementCount::getFixed(1));
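  // A concrete illustration (hypothetical source, not from this file): in
  //
  //   if (b[i] != 0)
  //     a[i] = c[i] / b[i];
  //
  // the division could trap for lanes whose predicate is false, so it is
  // scalarized and executed under its predicate rather than as a single wide
  // vector division.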
1505
1506 // Returns true if \p I is an instruction that will be predicated either
1507 // through scalar predication or masked load/store or masked gather/scatter.
1508 // Superset of instructions that return true for isScalarWithPredication.
1509 bool isPredicatedInst(Instruction *I) {
1510 if (!blockNeedsPredication(I->getParent()))
1511 return false;
1512 // Loads and stores that need some form of masked operation are predicated
1513 // instructions.
1514 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1515 return Legal->isMaskRequired(I);
1516 return isScalarWithPredication(I);
1517 }
1518
1519 /// Returns true if \p I is a memory instruction with consecutive memory
1520 /// access that can be widened.
1521 bool
1522 memoryInstructionCanBeWidened(Instruction *I,
1523 ElementCount VF = ElementCount::getFixed(1));
1524
1525 /// Returns true if \p I is a memory instruction in an interleaved-group
1526 /// of memory accesses that can be vectorized with wide vector loads/stores
1527 /// and shuffles.
1528 bool
1529 interleavedAccessCanBeWidened(Instruction *I,
1530 ElementCount VF = ElementCount::getFixed(1));
1531
1532 /// Check if \p Instr belongs to any interleaved access group.
1533 bool isAccessInterleaved(Instruction *Instr) {
1534 return InterleaveInfo.isInterleaved(Instr);
1535 }
1536
1537 /// Get the interleaved access group that \p Instr belongs to.
1538 const InterleaveGroup<Instruction> *
1539 getInterleavedAccessGroup(Instruction *Instr) {
1540 return InterleaveInfo.getInterleaveGroup(Instr);
1541 }
1542
1543 /// Returns true if we're required to use a scalar epilogue for at least
1544 /// the final iteration of the original loop.
1545 bool requiresScalarEpilogue() const {
1546 if (!isScalarEpilogueAllowed())
1547 return false;
1548 // If we might exit from anywhere but the latch, must run the exiting
1549 // iteration in scalar form.
1550 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1551 return true;
1552 return InterleaveInfo.requiresScalarEpilogue();
1553 }
1554
1555 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1556 /// loop hint annotation.
1557 bool isScalarEpilogueAllowed() const {
1558 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1559 }
1560
1561 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1562 bool foldTailByMasking() const { return FoldTailByMasking; }
1563
1564 bool blockNeedsPredication(BasicBlock *BB) {
1565 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1566 }
1567
1568 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1569 /// nodes to the chain of instructions representing the reductions. Uses a
1570 /// MapVector to ensure deterministic iteration order.
1571 using ReductionChainMap =
1572 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1573
1574 /// Return the chain of instructions representing an inloop reduction.
1575 const ReductionChainMap &getInLoopReductionChains() const {
1576 return InLoopReductionChains;
1577 }
1578
1579 /// Returns true if the Phi is part of an inloop reduction.
1580 bool isInLoopReduction(PHINode *Phi) const {
1581 return InLoopReductionChains.count(Phi);
1582 }
1583
1584 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1585 /// with factor VF. Return the cost of the instruction, including
1586 /// scalarization overhead if it's needed.
1587 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1588
1589 /// Estimate cost of a call instruction CI if it were vectorized with factor
1590 /// VF. Return the cost of the instruction, including scalarization overhead
1591 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1592 /// scalarized -
1593 /// i.e. either vector version isn't available, or is too expensive.
1594 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1595 bool &NeedToScalarize);
1596
1597 /// Invalidates decisions already taken by the cost model.
1598 void invalidateCostModelingDecisions() {
1599 WideningDecisions.clear();
1600 Uniforms.clear();
1601 Scalars.clear();
1602 }
1603
1604private:
1605 unsigned NumPredStores = 0;
1606
1607 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1608 /// than zero. One is returned if vectorization should best be avoided due
1609 /// to cost.
1610 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1611 ElementCount UserVF);
1612
1613 /// The vectorization cost is a combination of the cost itself and a boolean
1614 /// indicating whether any of the contributing operations will actually
1615 /// operate on
1616 /// vector values after type legalization in the backend. If this latter value
1617 /// is
1618 /// false, then all operations will be scalarized (i.e. no vectorization has
1619 /// actually taken place).
1620 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1621
1622 /// Returns the expected execution cost. The unit of the cost does
1623 /// not matter because we use the 'cost' units to compare different
1624 /// vector widths. The cost that is returned is *not* normalized by
1625 /// the factor width.
1626 VectorizationCostTy expectedCost(ElementCount VF);
1627
1628 /// Returns the execution time cost of an instruction for a given vector
1629 /// width. Vector width of one means scalar.
1630 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1631
1632 /// The cost-computation logic from getInstructionCost which provides
1633 /// the vector type as an output parameter.
1634 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1635 Type *&VectorTy);
1636
1637 /// Return the cost of instructions in an inloop reduction pattern, if I is
1638 /// part of that pattern.
1639 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1640 Type *VectorTy,
1641 TTI::TargetCostKind CostKind);
1642
1643 /// Calculate vectorization cost of memory instruction \p I.
1644 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1645
1646 /// The cost computation for scalarized memory instruction.
1647 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1648
1649 /// The cost computation for interleaving group of memory instructions.
1650 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1651
1652 /// The cost computation for Gather/Scatter instruction.
1653 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1654
1655 /// The cost computation for widening instruction \p I with consecutive
1656 /// memory access.
1657 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1658
1659 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1660 /// Load: scalar load + broadcast.
1661 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1662 /// element)
1663 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1664
1665 /// Estimate the overhead of scalarizing an instruction. This is a
1666 /// convenience wrapper for the type-based getScalarizationOverhead API.
1667 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1668
1669 /// Returns whether the instruction is a load or store and will be emitted
1670 /// as a vector operation.
1671 bool isConsecutiveLoadOrStore(Instruction *I);
1672
1673 /// Returns true if an artificially high cost for emulated masked memrefs
1674 /// should be used.
1675 bool useEmulatedMaskMemRefHack(Instruction *I);
1676
1677 /// Map of scalar integer values to the smallest bitwidth they can be legally
1678 /// represented as. The vector equivalents of these values should be truncated
1679 /// to this type.
1680 MapVector<Instruction *, uint64_t> MinBWs;
1681
1682 /// A type representing the costs for instructions if they were to be
1683 /// scalarized rather than vectorized. The entries are Instruction-Cost
1684 /// pairs.
1685 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1686
1687 /// A set containing all BasicBlocks that are known to be present after
1688 /// vectorization as predicated blocks.
1689 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1690
1691 /// Records whether it is allowed to have the original scalar loop execute at
1692 /// least once. This may be needed as a fallback loop in case runtime
1693 /// aliasing/dependence checks fail, or to handle the tail/remainder
1694 /// iterations when the trip count is unknown or doesn't divide by the VF,
1695 /// or as a peel-loop to handle gaps in interleave-groups.
1696 /// Under optsize and when the trip count is very small we don't allow any
1697 /// iterations to execute in the scalar loop.
1698 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1699
1700 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1701 bool FoldTailByMasking = false;
1702
1703 /// A map holding scalar costs for different vectorization factors. The
1704 /// presence of a cost for an instruction in the mapping indicates that the
1705 /// instruction will be scalarized when vectorizing with the associated
1706 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1707 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1708
1709 /// Holds the instructions known to be uniform after vectorization.
1710 /// The data is collected per VF.
1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1712
1713 /// Holds the instructions known to be scalar after vectorization.
1714 /// The data is collected per VF.
1715 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1716
1717 /// Holds the instructions (address computations) that are forced to be
1718 /// scalarized.
1719 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1720
1721 /// PHINodes of the reductions that should be expanded in-loop along with
1722 /// their associated chains of reduction operations, in program order from top
1723 /// (PHI) to bottom.
1724 ReductionChainMap InLoopReductionChains;
1725
1726 /// A Map of inloop reduction operations and their immediate chain operand.
1727 /// FIXME: This can be removed once reductions can be costed correctly in
1728 /// vplan. This was added to allow quick lookup to the inloop operations,
1729 /// without having to loop through InLoopReductionChains.
1730 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1731
1732 /// Returns the expected difference in cost from scalarizing the expression
1733 /// feeding a predicated instruction \p PredInst. The instructions to
1734 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1735 /// non-negative return value implies the expression will be scalarized.
1736 /// Currently, only single-use chains are considered for scalarization.
1737 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1738 ElementCount VF);
1739
1740 /// Collect the instructions that are uniform after vectorization. An
1741 /// instruction is uniform if we represent it with a single scalar value in
1742 /// the vectorized loop corresponding to each vector iteration. Examples of
1743 /// uniform instructions include pointer operands of consecutive or
1744 /// interleaved memory accesses. Note that although uniformity implies an
1745 /// instruction will be scalar, the reverse is not true. In general, a
1746 /// scalarized instruction will be represented by VF scalar values in the
1747 /// vectorized loop, each corresponding to an iteration of the original
1748 /// scalar loop.
1749 void collectLoopUniforms(ElementCount VF);
1750
1751 /// Collect the instructions that are scalar after vectorization. An
1752 /// instruction is scalar if it is known to be uniform or will be scalarized
1753 /// during vectorization. Non-uniform scalarized instructions will be
1754 /// represented by VF values in the vectorized loop, each corresponding to an
1755 /// iteration of the original scalar loop.
1756 void collectLoopScalars(ElementCount VF);
1757
1758 /// Keeps cost model vectorization decision and cost for instructions.
1759 /// Right now it is used for memory instructions only.
1760 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1761 std::pair<InstWidening, InstructionCost>>;
1762
1763 DecisionList WideningDecisions;
1764
1765 /// Returns true if \p V is expected to be vectorized and it needs to be
1766 /// extracted.
1767 bool needsExtract(Value *V, ElementCount VF) const {
1768 Instruction *I = dyn_cast<Instruction>(V);
1769 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1770 TheLoop->isLoopInvariant(I))
1771 return false;
1772
1773 // Assume we can vectorize V (and hence we need extraction) if the
1774 // scalars are not computed yet. This can happen, because it is called
1775 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1776 // the scalars are collected. That should be a safe assumption in most
1777 // cases, because we check if the operands have vectorizable types
1778 // beforehand in LoopVectorizationLegality.
1779 return Scalars.find(VF) == Scalars.end() ||
1780 !isScalarAfterVectorization(I, VF);
1781 };
1782
1783 /// Returns a range containing only operands needing to be extracted.
1784 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1785 ElementCount VF) {
1786 return SmallVector<Value *, 4>(make_filter_range(
1787 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1788 }
1789
1790 /// Determines if we have the infrastructure to vectorize loop \p L and its
1791 /// epilogue, assuming the main loop is vectorized by \p VF.
1792 bool isCandidateForEpilogueVectorization(const Loop &L,
1793 const ElementCount VF) const;
1794
1795 /// Returns true if epilogue vectorization is considered profitable, and
1796 /// false otherwise.
1797 /// \p VF is the vectorization factor chosen for the original loop.
1798 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1799
1800public:
1801 /// The loop that we evaluate.
1802 Loop *TheLoop;
1803
1804 /// Predicated scalar evolution analysis.
1805 PredicatedScalarEvolution &PSE;
1806
1807 /// Loop Info analysis.
1808 LoopInfo *LI;
1809
1810 /// Vectorization legality.
1811 LoopVectorizationLegality *Legal;
1812
1813 /// Vector target information.
1814 const TargetTransformInfo &TTI;
1815
1816 /// Target Library Info.
1817 const TargetLibraryInfo *TLI;
1818
1819 /// Demanded bits analysis.
1820 DemandedBits *DB;
1821
1822 /// Assumption cache.
1823 AssumptionCache *AC;
1824
1825 /// Interface to emit optimization remarks.
1826 OptimizationRemarkEmitter *ORE;
1827
1828 const Function *TheFunction;
1829
1830 /// Loop Vectorize Hint.
1831 const LoopVectorizeHints *Hints;
1832
1833 /// The interleave access information contains groups of interleaved accesses
1834 /// with the same stride and close to each other.
1835 InterleavedAccessInfo &InterleaveInfo;
1836
1837 /// Values to ignore in the cost model.
1838 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1839
1840 /// Values to ignore in the cost model when VF > 1.
1841 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1842
1843 /// Profitable vector factors.
1844 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1845};
1846} // end namespace llvm
1847
1848/// Helper struct to manage generating runtime checks for vectorization.
1849///
1850/// The runtime checks are created up-front in temporary blocks to allow better
1851/// estimation of their cost, and are un-linked from the existing IR. After deciding to
1852/// vectorize, the checks are moved back. If deciding not to vectorize, the
1853/// temporary blocks are completely removed.
1854class GeneratedRTChecks {
1855 /// Basic block which contains the generated SCEV checks, if any.
1856 BasicBlock *SCEVCheckBlock = nullptr;
1857
1858 /// The value representing the result of the generated SCEV checks. If it is
1859 /// nullptr, either no SCEV checks have been generated or they have been used.
1860 Value *SCEVCheckCond = nullptr;
1861
1862 /// Basic block which contains the generated memory runtime checks, if any.
1863 BasicBlock *MemCheckBlock = nullptr;
1864
1865 /// The value representing the result of the generated memory runtime checks.
1866 /// If it is nullptr, either no memory runtime checks have been generated or
1867 /// they have been used.
1868 Instruction *MemRuntimeCheckCond = nullptr;
1869
1870 DominatorTree *DT;
1871 LoopInfo *LI;
1872
1873 SCEVExpander SCEVExp;
1874 SCEVExpander MemCheckExp;
1875
1876public:
1877 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1878 const DataLayout &DL)
1879 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1880 MemCheckExp(SE, DL, "scev.check") {}
1881
1882 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1883 /// accurately estimate the cost of the runtime checks. The blocks are
1884 /// un-linked from the IR and are added back during vector code generation. If
1885 /// there is no vector code generation, the check blocks are removed
1886 /// completely.
1887 void Create(Loop *L, const LoopAccessInfo &LAI,
1888 const SCEVUnionPredicate &UnionPred) {
1889
1890 BasicBlock *LoopHeader = L->getHeader();
1891 BasicBlock *Preheader = L->getLoopPreheader();
1892
1893 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1894 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1895 // may be used by SCEVExpander. The blocks will be un-linked from their
1896 // predecessors and removed from LI & DT at the end of the function.
1897 if (!UnionPred.isAlwaysTrue()) {
1898 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1899 nullptr, "vector.scevcheck");
1900
1901 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1902 &UnionPred, SCEVCheckBlock->getTerminator());
1903 }
1904
1905 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1906 if (RtPtrChecking.Need) {
1907 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1908 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1909 "vector.memcheck");
1910
1911 std::tie(std::ignore, MemRuntimeCheckCond) =
1912 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1913 RtPtrChecking.getChecks(), MemCheckExp);
1914 assert(MemRuntimeCheckCond &&
1915 "no RT checks generated although RtPtrChecking "
1916 "claimed checks are required");
1917 }
1918
1919 if (!MemCheckBlock && !SCEVCheckBlock)
1920 return;
1921
1922 // Unhook the temporary block with the checks, update various places
1923 // accordingly.
1924 if (SCEVCheckBlock)
1925 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1926 if (MemCheckBlock)
1927 MemCheckBlock->replaceAllUsesWith(Preheader);
1928
1929 if (SCEVCheckBlock) {
1930 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1931 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1932 Preheader->getTerminator()->eraseFromParent();
1933 }
1934 if (MemCheckBlock) {
1935 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1936 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1937 Preheader->getTerminator()->eraseFromParent();
1938 }
1939
1940 DT->changeImmediateDominator(LoopHeader, Preheader);
1941 if (MemCheckBlock) {
1942 DT->eraseNode(MemCheckBlock);
1943 LI->removeBlock(MemCheckBlock);
1944 }
1945 if (SCEVCheckBlock) {
1946 DT->eraseNode(SCEVCheckBlock);
1947 LI->removeBlock(SCEVCheckBlock);
1948 }
1949 }
1950
1951 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1952 /// unused.
1953 ~GeneratedRTChecks() {
1954 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1955 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1956 if (!SCEVCheckCond)
1957 SCEVCleaner.markResultUsed();
1958
1959 if (!MemRuntimeCheckCond)
1960 MemCheckCleaner.markResultUsed();
1961
1962 if (MemRuntimeCheckCond) {
1963 auto &SE = *MemCheckExp.getSE();
1964 // Memory runtime check generation creates compares that use expanded
1965 // values. Remove them before running the SCEVExpanderCleaners.
1966 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1967 if (MemCheckExp.isInsertedInstruction(&I))
1968 continue;
1969 SE.forgetValue(&I);
1970 SE.eraseValueFromMap(&I);
1971 I.eraseFromParent();
1972 }
1973 }
1974 MemCheckCleaner.cleanup();
1975 SCEVCleaner.cleanup();
1976
1977 if (SCEVCheckCond)
1978 SCEVCheckBlock->eraseFromParent();
1979 if (MemRuntimeCheckCond)
1980 MemCheckBlock->eraseFromParent();
1981 }
1982
1983 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
1984 /// adjusts the branches to branch to the vector preheader or \p Bypass,
1985 /// depending on the generated condition.
1986 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
1987 BasicBlock *LoopVectorPreHeader,
1988 BasicBlock *LoopExitBlock) {
1989 if (!SCEVCheckCond)
1990 return nullptr;
1991 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
1992 if (C->isZero())
1993 return nullptr;
1994
1995 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
1996
1997 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
1998 // Create new preheader for vector loop.
1999 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2000 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2001
2002 SCEVCheckBlock->getTerminator()->eraseFromParent();
2003 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2004 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2005 SCEVCheckBlock);
2006
2007 DT->addNewBlock(SCEVCheckBlock, Pred);
2008 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2009
2010 ReplaceInstWithInst(
2011 SCEVCheckBlock->getTerminator(),
2012 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2013 // Mark the check as used, to prevent it from being removed during cleanup.
2014 SCEVCheckCond = nullptr;
2015 return SCEVCheckBlock;
2016 }
2017
2018 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2019 /// the branches to branch to the vector preheader or \p Bypass, depending on
2020 /// the generated condition.
2021 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2022 BasicBlock *LoopVectorPreHeader) {
2023 // Check if we generated code that checks in runtime if arrays overlap.
2024 if (!MemRuntimeCheckCond)
2025 return nullptr;
2026
2027 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2028 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2029 MemCheckBlock);
2030
2031 DT->addNewBlock(MemCheckBlock, Pred);
2032 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2033 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2034
2035 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2036 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2037
2038 ReplaceInstWithInst(
2039 MemCheckBlock->getTerminator(),
2040 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2041 MemCheckBlock->getTerminator()->setDebugLoc(
2042 Pred->getTerminator()->getDebugLoc());
2043
2044 // Mark the check as used, to prevent it from being removed during cleanup.
2045 MemRuntimeCheckCond = nullptr;
2046 return MemCheckBlock;
2047 }
2048};
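// A hedged summary of the intended lifecycle of the helper above: Create() is
// invoked up front so the SCEV and memory checks can be cost-estimated on real
// IR; if vectorization goes ahead, emitSCEVChecks() and emitMemRuntimeChecks()
// splice the temporary blocks back in front of the vector preheader, otherwise
// the destructor erases the unused blocks and their expanded instructions.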
2049
2050// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2051// vectorization. The loop needs to be annotated with #pragma omp simd
2052// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2053// vector length information is not provided, vectorization is not considered
2054// explicit. Interleave hints are not allowed either. These limitations will be
2055// relaxed in the future.
2056// Please, note that we are currently forced to abuse the pragma 'clang
2057// vectorize' semantics. This pragma provides *auto-vectorization hints*
2058// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2059// provides *explicit vectorization hints* (LV can bypass legal checks and
2060// assume that vectorization is legal). However, both hints are implemented
2061// using the same metadata (llvm.loop.vectorize, processed by
2062// LoopVectorizeHints). This will be fixed in the future when the native IR
2063// representation for pragma 'omp simd' is introduced.
2064static bool isExplicitVecOuterLoop(Loop *OuterLp,
2065 OptimizationRemarkEmitter *ORE) {
2066 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2067 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2068
2069 // Only outer loops with an explicit vectorization hint are supported.
2070 // Unannotated outer loops are ignored.
2071 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2072 return false;
2073
2074 Function *Fn = OuterLp->getHeader()->getParent();
2075 if (!Hints.allowVectorization(Fn, OuterLp,
2076 true /*VectorizeOnlyWhenForced*/)) {
2077 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2078 return false;
2079 }
2080
2081 if (Hints.getInterleave() > 1) {
2082 // TODO: Interleave support is future work.
2083 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2084 "outer loops.\n");
2085 Hints.emitRemarkWithHints();
2086 return false;
2087 }
2088
2089 return true;
2090}
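// A concrete illustration (hypothetical source, not from this file): an outer
// loop annotated as
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// supplies an explicit vector length and no interleave hint, so it satisfies
// the checks above; an unannotated outer loop is simply ignored.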
2091
2092static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2093 OptimizationRemarkEmitter *ORE,
2094 SmallVectorImpl<Loop *> &V) {
2095 // Collect inner loops and outer loops without irreducible control flow. For
2096 // now, only collect outer loops that have explicit vectorization hints. If we
2097 // are stress testing the VPlan H-CFG construction, we collect the outermost
2098 // loop of every loop nest.
2099 if (L.isInnermost() || VPlanBuildStressTest ||
2100 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2101 LoopBlocksRPO RPOT(&L);
2102 RPOT.perform(LI);
2103 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2104 V.push_back(&L);
2105 // TODO: Collect inner loops inside marked outer loops in case
2106 // vectorization fails for the outer loop. Do not invoke
2107 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2108 // already known to be reducible. We can use an inherited attribute for
2109 // that.
2110 return;
2111 }
2112 }
2113 for (Loop *InnerL : L)
2114 collectSupportedLoops(*InnerL, LI, ORE, V);
2115}
2116
2117namespace {
2118
2119/// The LoopVectorize Pass.
2120struct LoopVectorize : public FunctionPass {
2121 /// Pass identification, replacement for typeid
2122 static char ID;
2123
2124 LoopVectorizePass Impl;
2125
2126 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2127 bool VectorizeOnlyWhenForced = false)
2128 : FunctionPass(ID),
2129 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2130 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2131 }
2132
2133 bool runOnFunction(Function &F) override {
2134 if (skipFunction(F))
2135 return false;
2136
2137 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2138 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2139 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2140 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2141 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2142 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2143 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2144 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2145 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2146 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2147 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2148 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2149 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2150
2151 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2152 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2153
2154 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2155 GetLAA, *ORE, PSI).MadeAnyChange;
2156 }
2157
2158 void getAnalysisUsage(AnalysisUsage &AU) const override {
2159 AU.addRequired<AssumptionCacheTracker>();
2160 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2161 AU.addRequired<DominatorTreeWrapperPass>();
2162 AU.addRequired<LoopInfoWrapperPass>();
2163 AU.addRequired<ScalarEvolutionWrapperPass>();
2164 AU.addRequired<TargetTransformInfoWrapperPass>();
2165 AU.addRequired<AAResultsWrapperPass>();
2166 AU.addRequired<LoopAccessLegacyAnalysis>();
2167 AU.addRequired<DemandedBitsWrapperPass>();
2168 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2169 AU.addRequired<InjectTLIMappingsLegacy>();
2170
2171 // We currently do not preserve loopinfo/dominator analyses with outer loop
2172 // vectorization. Until this is addressed, mark these analyses as preserved
2173 // only for non-VPlan-native path.
2174 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2175 if (!EnableVPlanNativePath) {
2176 AU.addPreserved<LoopInfoWrapperPass>();
2177 AU.addPreserved<DominatorTreeWrapperPass>();
2178 }
2179
2180 AU.addPreserved<BasicAAWrapperPass>();
2181 AU.addPreserved<GlobalsAAWrapperPass>();
2182 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2183 }
2184};
2185
2186} // end anonymous namespace
2187
2188//===----------------------------------------------------------------------===//
2189// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2190// LoopVectorizationCostModel and LoopVectorizationPlanner.
2191//===----------------------------------------------------------------------===//
2192
2193Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2194 // We need to place the broadcast of invariant variables outside the loop,
2195 // but only if it's proven safe to do so. Else, broadcast will be inside
2196 // vector loop body.
2197 Instruction *Instr = dyn_cast<Instruction>(V);
2198 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2199 (!Instr ||
2200 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2201 // Place the code for broadcasting invariant variables in the new preheader.
2202 IRBuilder<>::InsertPointGuard Guard(Builder);
2203 if (SafeToHoist)
2204 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2205
2206 // Broadcast the scalar into all locations in the vector.
2207 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2208
2209 return Shuf;
2210}
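// A minimal standalone sketch (not part of LoopVectorize.cpp) of the broadcast
// built above: a loop-invariant scalar is replicated into every lane of a
// vector value, and the splat is hoisted into the preheader when that is safe.
#include <array>
#include <cstdio>

template <unsigned VF> static std::array<int, VF> broadcast(int Scalar) {
  std::array<int, VF> Lanes{};
  Lanes.fill(Scalar); // every lane holds the same invariant value
  return Lanes;
}

int main() {
  auto Splat = broadcast<4>(7);
  for (int Lane : Splat)
    std::printf("%d ", Lane); // prints: 7 7 7 7
  std::printf("\n");
  return 0;
}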
2211
2212void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2213 const InductionDescriptor &II, Value *Step, Value *Start,
2214 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2215 VPTransformState &State) {
2216 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2217 "Expected either an induction phi-node or a truncate of it!");
2218
2219 // Construct the initial value of the vector IV in the vector loop preheader
2220 auto CurrIP = Builder.saveIP();
2221 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2222 if (isa<TruncInst>(EntryVal)) {
2223 assert(Start->getType()->isIntegerTy() &&
2224 "Truncation requires an integer type");
2225 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2226 Step = Builder.CreateTrunc(Step, TruncType);
2227 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2228 }
2229 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2230 Value *SteppedStart =
2231 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2232
2233 // We create vector phi nodes for both integer and floating-point induction
2234 // variables. Here, we determine the kind of arithmetic we will perform.
2235 Instruction::BinaryOps AddOp;
2236 Instruction::BinaryOps MulOp;
2237 if (Step->getType()->isIntegerTy()) {
2238 AddOp = Instruction::Add;
2239 MulOp = Instruction::Mul;
2240 } else {
2241 AddOp = II.getInductionOpcode();
2242 MulOp = Instruction::FMul;
2243 }
2244
2245 // Multiply the vectorization factor by the step using integer or
2246 // floating-point arithmetic as appropriate.
2247 Value *ConstVF =
2248 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2249 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF);
2250
2251 // Create a vector splat to use in the induction update.
2252 //
2253 // FIXME: If the step is non-constant, we create the vector splat with
2254 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2255 // handle a constant vector splat.
2256 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2257 Value *SplatVF = isa<Constant>(Mul)
2258 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2259 : Builder.CreateVectorSplat(VF, Mul);
2260 Builder.restoreIP(CurrIP);
2261
2262 // We may need to add the step a number of times, depending on the unroll
2263 // factor. The last of those goes into the PHI.
2264 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2265 &*LoopVectorBody->getFirstInsertionPt());
2266 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2267 Instruction *LastInduction = VecInd;
2268 for (unsigned Part = 0; Part < UF; ++Part) {
2269 State.set(Def, LastInduction, Part);
2270
2271 if (isa<TruncInst>(EntryVal))
2272 addMetadata(LastInduction, EntryVal);
2273 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2274 State, Part);
2275
2276 LastInduction = cast<Instruction>(
2277 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2278 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2279 }
2280
2281 // Move the last step to the end of the latch block. This ensures consistent
2282 // placement of all induction updates.
2283 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2284 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2285 auto *ICmp = cast<Instruction>(Br->getCondition());
2286 LastInduction->moveBefore(ICmp);
2287 LastInduction->setName("vec.ind.next");
2288
2289 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2290 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2291}
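// --- Illustrative sketch, not part of LoopVectorize.cpp ---------------------
// A minimal scalar model of the vector IV built above, assuming a fixed VF,
// an integer induction, and UF unroll parts: part P holds the lanes
// Start + (P*VF + lane)*Step, and the value fed back through the latch phi is
// the whole thing advanced by UF*VF*Step.
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const long Start = 0, Step = 1;
  for (unsigned Part = 0; Part < UF; ++Part) {
    printf("part %u:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      printf(" %ld", Start + (long)(Part * VF + Lane) * Step); // SteppedStart + Part*SplatVF
    printf("\n");                                              // part 0: 0 1 2 3 / part 1: 4 5 6 7
  }
  printf("latch increment: %ld\n", (long)(UF * VF) * Step);    // vec.ind.next advances by 8
  return 0;
}
// -----------------------------------------------------------------------------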
2292
2293bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2294 return Cost->isScalarAfterVectorization(I, VF) ||
2295 Cost->isProfitableToScalarize(I, VF);
2296}
2297
2298bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2299 if (shouldScalarizeInstruction(IV))
2300 return true;
2301 auto isScalarInst = [&](User *U) -> bool {
2302 auto *I = cast<Instruction>(U);
2303 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2304 };
2305 return llvm::any_of(IV->users(), isScalarInst);
2306}
2307
2308void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2309 const InductionDescriptor &ID, const Instruction *EntryVal,
2310 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2311 unsigned Part, unsigned Lane) {
2312 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2313        "Expected either an induction phi-node or a truncate of it!");
2314
2315 // This induction variable is not the phi from the original loop but the
2316 // newly-created IV based on the proof that the casted Phi is equal to the
2317 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2318 // reuses the same InductionDescriptor as the original IV, but we don't
2319 // have to do any recording in this case - that is done when the original IV
2320 // is processed.
2321 if (isa<TruncInst>(EntryVal))
2322 return;
2323
2324 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2325 if (Casts.empty())
2326 return;
2327 // Only the first Cast instruction in the Casts vector is of interest.
2328 // The rest of the Casts (if any exist) have no uses outside the
2329 // induction update chain itself.
2330 if (Lane < UINT_MAX)
2331 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2332 else
2333 State.set(CastDef, VectorLoopVal, Part);
2334}
2335
2336void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2337 TruncInst *Trunc, VPValue *Def,
2338 VPValue *CastDef,
2339 VPTransformState &State) {
2340 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2341        "Primary induction variable must have an integer type");
2342
2343 auto II = Legal->getInductionVars().find(IV);
2344 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2345
2346 auto ID = II->second;
2347 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2348
2349 // The value from the original loop to which we are mapping the new induction
2350 // variable.
2351 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2352
2353 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2354
2355 // Generate code for the induction step. Note that induction steps are
2356 // required to be loop-invariant.
2357 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2358 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2359        "Induction step should be loop invariant");
2360 if (PSE.getSE()->isSCEVable(IV->getType())) {
2361 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2362 return Exp.expandCodeFor(Step, Step->getType(),
2363 LoopVectorPreHeader->getTerminator());
2364 }
2365 return cast<SCEVUnknown>(Step)->getValue();
2366 };
2367
2368 // The scalar value to broadcast. This is derived from the canonical
2369 // induction variable. If a truncation type is given, truncate the canonical
2370 // induction variable and step. Otherwise, derive these values from the
2371 // induction descriptor.
2372 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2373 Value *ScalarIV = Induction;
2374 if (IV != OldInduction) {
2375 ScalarIV = IV->getType()->isIntegerTy()
2376 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2377 : Builder.CreateCast(Instruction::SIToFP, Induction,
2378 IV->getType());
2379 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2380 ScalarIV->setName("offset.idx");
2381 }
2382 if (Trunc) {
2383 auto *TruncType = cast<IntegerType>(Trunc->getType());
2384 assert(Step->getType()->isIntegerTy() &&
2385        "Truncation requires an integer step");
2386 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2387 Step = Builder.CreateTrunc(Step, TruncType);
2388 }
2389 return ScalarIV;
2390 };
2391
2392 // Create the vector values from the scalar IV, in the absence of creating a
2393 // vector IV.
2394 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2395 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2396 for (unsigned Part = 0; Part < UF; ++Part) {
2397 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2398 Value *EntryPart =
2399 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2400 ID.getInductionOpcode());
2401 State.set(Def, EntryPart, Part);
2402 if (Trunc)
2403 addMetadata(EntryPart, Trunc);
2404 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2405 State, Part);
2406 }
2407 };
2408
2409 // Fast-math-flags propagate from the original induction instruction.
2410 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2411 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2412 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2413
2414 // Now do the actual transformations, and start with creating the step value.
2415 Value *Step = CreateStepValue(ID.getStep());
2416 if (VF.isZero() || VF.isScalar()) {
2417 Value *ScalarIV = CreateScalarIV(Step);
2418 CreateSplatIV(ScalarIV, Step);
2419 return;
2420 }
2421
2422 // Determine if we want a scalar version of the induction variable. This is
2423 // true if the induction variable itself is not widened, or if it has at
2424 // least one user in the loop that is not widened.
2425 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2426 if (!NeedsScalarIV) {
2427 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2428 State);
2429 return;
2430 }
2431
2432 // Try to create a new independent vector induction variable. If we can't
2433 // create the phi node, we will splat the scalar induction variable in each
2434 // loop iteration.
2435 if (!shouldScalarizeInstruction(EntryVal)) {
2436 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2437 State);
2438 Value *ScalarIV = CreateScalarIV(Step);
2439 // Create scalar steps that can be used by instructions we will later
2440 // scalarize. Note that the addition of the scalar steps will not increase
2441 // the number of instructions in the loop in the common case prior to
2442 // InstCombine. We will be trading one vector extract for each scalar step.
2443 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2444 return;
2445 }
2446
2447 // All IV users are scalar instructions, so only emit a scalar IV, not a
2448 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2449 // predicate used by the masked loads/stores.
2450 Value *ScalarIV = CreateScalarIV(Step);
2451 if (!Cost->isScalarEpilogueAllowed())
2452 CreateSplatIV(ScalarIV, Step);
2453 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2454}
2455
2456Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2457 Instruction::BinaryOps BinOp) {
2458 // Create and check the types.
2459 auto *ValVTy = cast<FixedVectorType>(Val->getType());
2460 int VLen = ValVTy->getNumElements();
2461
2462 Type *STy = Val->getType()->getScalarType();
2463 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2464        "Induction Step must be an integer or FP");
2465 assert(Step->getType() == STy && "Step has wrong type");
2466
2467 SmallVector<Constant *, 8> Indices;
2468
2469 if (STy->isIntegerTy()) {
2470 // Create a vector of consecutive numbers from zero to VF.
2471 for (int i = 0; i < VLen; ++i)
2472 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2473
2474 // Add the consecutive indices to the vector value.
2475 Constant *Cv = ConstantVector::get(Indices);
2476 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2477 Step = Builder.CreateVectorSplat(VLen, Step);
2478 assert(Step->getType() == Val->getType() && "Invalid step vec");
2479 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2480 // which can be found from the original scalar operations.
2481 Step = Builder.CreateMul(Cv, Step);
2482 return Builder.CreateAdd(Val, Step, "induction");
2483 }
2484
2485 // Floating point induction.
2486 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2487        "Binary Opcode should be specified for FP induction");
2488 // Create a vector of consecutive numbers from zero to VF.
2489 for (int i = 0; i < VLen; ++i)
2490 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2491
2492 // Add the consecutive indices to the vector value.
2493 // Floating-point operations inherit FMF via the builder's flags.
2494 Constant *Cv = ConstantVector::get(Indices);
2495 Step = Builder.CreateVectorSplat(VLen, Step);
2496 Value *MulOp = Builder.CreateFMul(Cv, Step);
2497 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2498}
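// --- Illustrative sketch, not part of LoopVectorize.cpp ---------------------
// Scalar model of getStepVector() for the integer case, assuming a fixed
// vector width: lane i of the result is Val[i] + (StartIdx + i) * Step.
#include <cstdio>

int main() {
  const int VF = 4, StartIdx = 4, Step = 2;
  long Val[VF] = {100, 100, 100, 100};                 // a broadcast (splat) input
  long Induction[VF];
  for (int i = 0; i < VF; ++i)
    Induction[i] = Val[i] + (long)(StartIdx + i) * Step; // Cv[i]*Step added to Val
  for (int i = 0; i < VF; ++i)
    printf("%ld ", Induction[i]);                      // prints: 108 110 112 114
  printf("\n");
  return 0;
}
// -----------------------------------------------------------------------------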
2499
2500void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2501 Instruction *EntryVal,
2502 const InductionDescriptor &ID,
2503 VPValue *Def, VPValue *CastDef,
2504 VPTransformState &State) {
2505 // We shouldn't have to build scalar steps if we aren't vectorizing.
2506 assert(VF.isVector() && "VF should be greater than one");
2507 // Get the value type and ensure it and the step have the same integer type.
2508 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2509 assert(ScalarIVTy == Step->getType() &&
2510        "Val and Step should have the same type");
2511
2512 // We build scalar steps for both integer and floating-point induction
2513 // variables. Here, we determine the kind of arithmetic we will perform.
2514 Instruction::BinaryOps AddOp;
2515 Instruction::BinaryOps MulOp;
2516 if (ScalarIVTy->isIntegerTy()) {
2517 AddOp = Instruction::Add;
2518 MulOp = Instruction::Mul;
2519 } else {
2520 AddOp = ID.getInductionOpcode();
2521 MulOp = Instruction::FMul;
2522 }
2523
2524 // Determine the number of scalars we need to generate for each unroll
2525 // iteration. If EntryVal is uniform, we only need to generate the first
2526 // lane. Otherwise, we generate all VF values.
2527 unsigned Lanes =
2528 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2529 ? 1
2530 : VF.getKnownMinValue();
2531 assert((!VF.isScalable() || Lanes == 1) &&
2532        "Should never scalarize a scalable vector");
2533 // Compute the scalar steps and save the results in State.
2534 for (unsigned Part = 0; Part < UF; ++Part) {
2535 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2536 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2537 ScalarIVTy->getScalarSizeInBits());
2538 Value *StartIdx =
2539 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2540 if (ScalarIVTy->isFloatingPointTy())
2541 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2542 StartIdx = Builder.CreateBinOp(
2543 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2544 // The step returned by `createStepForVF` is a runtime-evaluated value
2545 // when VF is scalable. Otherwise, it should be folded into a Constant.
2546 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2547        "Expected StartIdx to be folded to a constant when VF is not "
2548        "scalable");
2549 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2550 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2551 State.set(Def, Add, VPIteration(Part, Lane));
2552 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2553 Part, Lane);
2554 }
2555 }
2556}
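// --- Illustrative sketch, not part of LoopVectorize.cpp ---------------------
// Scalar model of buildScalarSteps() for a fixed VF and an integer IV: the
// value produced for (Part, Lane) is ScalarIV + (Part*VF + Lane) * Step.
// (For a uniform EntryVal only Lane 0 of each part would be emitted.)
#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4;
  const long ScalarIV = 16, Step = 3;
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      long StartIdx = (long)(Part * VF) + Lane; // createStepForVF(Part) plus the lane
      long Val = ScalarIV + StartIdx * Step;    // the scalar step value stored in State
      printf("part %u lane %u -> %ld\n", Part, Lane, Val);
    }
  return 0;
}
// -----------------------------------------------------------------------------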
2557
2558void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2559 const VPIteration &Instance,
2560 VPTransformState &State) {
2561 Value *ScalarInst = State.get(Def, Instance);
2562 Value *VectorValue = State.get(Def, Instance.Part);
2563 VectorValue = Builder.CreateInsertElement(
2564 VectorValue, ScalarInst,
2565 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2566 State.set(Def, VectorValue, Instance.Part);
2567}
2568
2569Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2570 assert(Vec->getType()->isVectorTy() && "Invalid type");
2571 assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2572 SmallVector<int, 8> ShuffleMask;
2573 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2574 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2575
2576 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2577}
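// --- Illustrative sketch, not part of LoopVectorize.cpp ---------------------
// The reversal above is a single shufflevector whose mask is simply
// VF-1, VF-2, ..., 0. A small model of that mask, assuming a fixed VF:
#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 8;
  std::vector<int> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back((int)(VF - i - 1)); // same formula as reverseVector()
  for (int M : ShuffleMask)
    printf("%d ", M);                         // prints: 7 6 5 4 3 2 1 0
  printf("\n");
  return 0;
}
// -----------------------------------------------------------------------------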
2578
2579// Return whether we allow using masked interleave-groups (for dealing with
2580// strided loads/stores that reside in predicated blocks, or for dealing
2581// with gaps).
2582static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2583 // If an override option has been passed in for interleaved accesses, use it.
2584 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2585 return EnableMaskedInterleavedMemAccesses;
2586
2587 return TTI.enableMaskedInterleavedAccessVectorization();
2588}
2589
2590// Try to vectorize the interleave group that \p Instr belongs to.
2591//
2592// E.g. Translate following interleaved load group (factor = 3):
2593// for (i = 0; i < N; i+=3) {
2594// R = Pic[i]; // Member of index 0
2595// G = Pic[i+1]; // Member of index 1
2596// B = Pic[i+2]; // Member of index 2
2597// ... // do something to R, G, B
2598// }
2599// To:
2600// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2601// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2602// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2603// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2604//
2605// Or translate following interleaved store group (factor = 3):
2606// for (i = 0; i < N; i+=3) {
2607// ... do something to R, G, B
2608// Pic[i] = R; // Member of index 0
2609// Pic[i+1] = G; // Member of index 1
2610// Pic[i+2] = B; // Member of index 2
2611// }
2612// To:
2613// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2614// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2615// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2616// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2617// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2618void InnerLoopVectorizer::vectorizeInterleaveGroup(
2619 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2620 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2621 VPValue *BlockInMask) {
2622 Instruction *Instr = Group->getInsertPos();
2623 const DataLayout &DL = Instr->getModule()->getDataLayout();
2624
2625 // Prepare for the vector type of the interleaved load/store.
2626 Type *ScalarTy = getMemInstValueType(Instr);
2627 unsigned InterleaveFactor = Group->getFactor();
2628 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2629 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2630
2631 // Prepare for the new pointers.
2632 SmallVector<Value *, 2> AddrParts;
2633 unsigned Index = Group->getIndex(Instr);
2634
2635 // TODO: extend the masked interleaved-group support to reversed access.
2636 assert((!BlockInMask || !Group->isReverse()) &&
2637        "Reversed masked interleave-group not supported.");
2638
2639 // If the group is reverse, adjust the index to refer to the last vector lane
2640 // instead of the first. We adjust the index from the first vector lane,
2641 // rather than directly getting the pointer for lane VF - 1, because the
2642 // pointer operand of the interleaved access is supposed to be uniform. For
2643 // uniform instructions, we're only required to generate a value for the
2644 // first vector lane in each unroll iteration.
2645 assert(!VF.isScalable() &&
2646        "scalable vector reverse operation is not implemented");
2647 if (Group->isReverse())
2648 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2649
2650 for (unsigned Part = 0; Part < UF; Part++) {
2651 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2652 setDebugLocFromInst(Builder, AddrPart);
2653
2654 // Note that the current instruction could be at any member index. We need to
2655 // adjust the address to the member of index 0.
2656 //
2657 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2658 // b = A[i]; // Member of index 0
2659 // The current pointer points to A[i+1]; adjust it to A[i].
2660 //
2661 // E.g. A[i+1] = a; // Member of index 1
2662 // A[i] = b; // Member of index 0
2663 // A[i+2] = c; // Member of index 2 (Current instruction)
2664 // The current pointer points to A[i+2]; adjust it to A[i].
2665
2666 bool InBounds = false;
2667 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2668 InBounds = gep->isInBounds();
2669 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2670 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2671
2672 // Cast to the vector pointer type.
2673 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2674 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2675 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2676 }
2677
2678 setDebugLocFromInst(Builder, Instr);
2679 Value *PoisonVec = PoisonValue::get(VecTy);
2680
2681 Value *MaskForGaps = nullptr;
2682 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2683 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2684 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2685 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2686 }
2687
2688 // Vectorize the interleaved load group.
2689 if (isa<LoadInst>(Instr)) {
2690 // For each unroll part, create a wide load for the group.
2691 SmallVector<Value *, 2> NewLoads;
2692 for (unsigned Part = 0; Part < UF; Part++) {
2693 Instruction *NewLoad;
2694 if (BlockInMask || MaskForGaps) {
2695 assert(useMaskedInterleavedAccesses(*TTI) &&
2696        "masked interleaved groups are not allowed.");
2697 Value *GroupMask = MaskForGaps;
2698 if (BlockInMask) {
2699 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2700 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2701 Value *ShuffledMask = Builder.CreateShuffleVector(
2702 BlockInMaskPart,
2703 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2704 "interleaved.mask");
2705 GroupMask = MaskForGaps
2706 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2707 MaskForGaps)
2708 : ShuffledMask;
2709 }
2710 NewLoad =
2711 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2712 GroupMask, PoisonVec, "wide.masked.vec");
2713 }
2714 else
2715 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2716 Group->getAlign(), "wide.vec");
2717 Group->addMetadata(NewLoad);
2718 NewLoads.push_back(NewLoad);
2719 }
2720
2721 // For each member in the group, shuffle out the appropriate data from the
2722 // wide loads.
2723 unsigned J = 0;
2724 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2725 Instruction *Member = Group->getMember(I);
2726
2727 // Skip the gaps in the group.
2728 if (!Member)
2729 continue;
2730
2731 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2732 auto StrideMask =
2733 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2734 for (unsigned Part = 0; Part < UF; Part++) {
2735 Value *StridedVec = Builder.CreateShuffleVector(
2736 NewLoads[Part], StrideMask, "strided.vec");
2737
2738 // If this member has a different type, cast the result to that type.
2739 if (Member->getType() != ScalarTy) {
2740 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2741 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2742 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2743 }
2744
2745 if (Group->isReverse())
2746 StridedVec = reverseVector(StridedVec);
2747
2748 State.set(VPDefs[J], StridedVec, Part);
2749 }
2750 ++J;
2751 }
2752 return;
2753 }
2754
2755 // The sub vector type for current instruction.
2756 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2757 auto *SubVT = VectorType::get(ScalarTy, VF);
2758
2759 // Vectorize the interleaved store group.
2760 for (unsigned Part = 0; Part < UF; Part++) {
2761 // Collect the stored vector from each member.
2762 SmallVector<Value *, 4> StoredVecs;
2763 for (unsigned i = 0; i < InterleaveFactor; i++) {
2764 // Interleaved store group doesn't allow a gap, so each index has a member
2765 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2766
2767 Value *StoredVec = State.get(StoredValues[i], Part);
2768
2769 if (Group->isReverse())
2770 StoredVec = reverseVector(StoredVec);
2771
2772 // If this member has a different type, cast it to a unified type.
2773
2774 if (StoredVec->getType() != SubVT)
2775 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2776
2777 StoredVecs.push_back(StoredVec);
2778 }
2779
2780 // Concatenate all vectors into a wide vector.
2781 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2782
2783 // Interleave the elements in the wide vector.
2784 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2785 Value *IVec = Builder.CreateShuffleVector(
2786 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2787 "interleaved.vec");
2788
2789 Instruction *NewStoreInstr;
2790 if (BlockInMask) {
2791 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2792 Value *ShuffledMask = Builder.CreateShuffleVector(
2793 BlockInMaskPart,
2794 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2795 "interleaved.mask");
2796 NewStoreInstr = Builder.CreateMaskedStore(
2797 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2798 }
2799 else
2800 NewStoreInstr =
2801 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2802
2803 Group->addMetadata(NewStoreInstr);
2804 }
2805}
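// --- Illustrative sketch, not part of LoopVectorize.cpp ---------------------
// Shapes of the shuffle masks used above, computed with plain loops rather
// than the LLVM helpers, assuming interleave factor 3 and a fixed VF of 4:
//   strided mask (member I):  I, I+Factor, I+2*Factor, ...      (load path)
//   replicated mask:          0,0,0, 1,1,1, 2,2,2, ...          (per-lane mask)
//   interleave mask:          0, VF, 2*VF, 1, VF+1, 2*VF+1, ... (store path)
#include <cstdio>
#include <vector>

static void print(const char *Name, const std::vector<int> &M) {
  printf("%-12s", Name);
  for (int V : M)
    printf("%d ", V);
  printf("\n");
}

int main() {
  const unsigned Factor = 3, VF = 4;
  std::vector<int> Stride, Replicated, Interleave;
  for (unsigned i = 0; i < VF; ++i)
    Stride.push_back((int)(1 + i * Factor));   // member index 1 (the "G" lane above)
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Replicated.push_back((int)i);            // replicate each mask lane Factor times
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Interleave.push_back((int)(j * VF + i)); // gather R[i], G[i], B[i] in turn
  print("stride:", Stride);                    // 1 4 7 10
  print("replicated:", Replicated);            // 0 0 0 1 1 1 2 2 2 3 3 3
  print("interleave:", Interleave);            // 0 4 8 1 5 9 2 6 10 3 7 11
  return 0;
}
// -----------------------------------------------------------------------------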
2806
2807void InnerLoopVectorizer::vectorizeMemoryInstruction(
2808 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2809 VPValue *StoredValue, VPValue *BlockInMask) {
2810 // Attempt to issue a wide load.
2811 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2812 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2813
2814 assert((LI || SI) && "Invalid Load/Store instruction");
2815 assert((!SI || StoredValue) && "No stored value provided for widened store");
2816 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2817
2818 LoopVectorizationCostModel::InstWidening Decision =
2819 Cost->getWideningDecision(Instr, VF);
2820 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2821         Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2822         Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2823        "CM decision is not to widen the memory instruction");
2824
2825 Type *ScalarDataTy = getMemInstValueType(Instr);
2826
2827 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2828 const Align Alignment = getLoadStoreAlignment(Instr);
2829
2830 // Determine if the pointer operand of the access is either consecutive or
2831 // reverse consecutive.
2832 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2833 bool ConsecutiveStride =
2834 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2835 bool CreateGatherScatter =
2836 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2837
2838 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2839 // gather/scatter. Otherwise Decision should have been to Scalarize.
2840 assert((ConsecutiveStride || CreateGatherScatter) &&
2841        "The instruction should be scalarized");
2842 (void)ConsecutiveStride;
2843
2844 VectorParts BlockInMaskParts(UF);
2845 bool isMaskRequired = BlockInMask;
2846 if (isMaskRequired)
2847 for (unsigned Part = 0; Part < UF; ++Part)
2848 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2849
2850 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2851 // Calculate the pointer for the specific unroll-part.
2852 GetElementPtrInst *PartPtr = nullptr;
2853
2854 bool InBounds = false;
2855 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2856 InBounds = gep->isInBounds();
2857
2858 if (Reverse) {
2859 assert(!VF.isScalable() &&
2860        "Reversing vectors is not yet supported for scalable vectors.");
2861
2862 // If the address is consecutive but reversed, then the
2863 // wide store needs to start at the last vector element.
2864 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2865 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2866 PartPtr->setIsInBounds(InBounds);
2867 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2868 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2869 PartPtr->setIsInBounds(InBounds);
2870 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2871 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2872 } else {
2873 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2874 PartPtr = cast<GetElementPtrInst>(
2875 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2876 PartPtr->setIsInBounds(InBounds);
2877 }
2878
2879 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2880 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2881 };
2882
2883 // Handle Stores:
2884 if (SI) {
2885 setDebugLocFromInst(Builder, SI);
2886
2887 for (unsigned Part = 0; Part < UF; ++Part) {
2888 Instruction *NewSI = nullptr;
2889 Value *StoredVal = State.get(StoredValue, Part);
2890 if (CreateGatherScatter) {
2891 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2892 Value *VectorGep = State.get(Addr, Part);
2893 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2894 MaskPart);
2895 } else {
2896 if (Reverse) {
2897 // If we store to reverse consecutive memory locations, then we need
2898 // to reverse the order of elements in the stored value.
2899 StoredVal = reverseVector(StoredVal);
2900 // We don't want to update the value in the map as it might be used in
2901 // another expression. So don't call resetVectorValue(StoredVal).
2902 }
2903 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2904 if (isMaskRequired)
2905 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2906 BlockInMaskParts[Part]);
2907 else
2908 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2909 }
2910 addMetadata(NewSI, SI);
2911 }
2912 return;
2913 }
2914
2915 // Handle loads.
2916 assert(LI && "Must have a load instruction");
2917 setDebugLocFromInst(Builder, LI);
2918 for (unsigned Part = 0; Part < UF; ++Part) {
2919 Value *NewLI;
2920 if (CreateGatherScatter) {
2921 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2922 Value *VectorGep = State.get(Addr, Part);
2923 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2924 nullptr, "wide.masked.gather");
2925 addMetadata(NewLI, LI);
2926 } else {
2927 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2928 if (isMaskRequired)
2929 NewLI = Builder.CreateMaskedLoad(
2930 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2931 "wide.masked.load");
2932 else
2933 NewLI =
2934 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2935
2936 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2937 addMetadata(NewLI, LI);
2938 if (Reverse)
2939 NewLI = reverseVector(NewLI);
2940 }
2941
2942 State.set(Def, NewLI, Part);
2943 }
2944}
2945
2946void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2947 VPUser &User,
2948 const VPIteration &Instance,
2949 bool IfPredicateInstr,
2950 VPTransformState &State) {
2951 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2952
2953 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2954 // the first lane and part.
2955 if (isa<NoAliasScopeDeclInst>(Instr))
2956 if (!Instance.isFirstIteration())
2957 return;
2958
2959 setDebugLocFromInst(Builder, Instr);
2960
2961 // Does this instruction return a value ?
2962 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2963
2964 Instruction *Cloned = Instr->clone();
2965 if (!IsVoidRetTy)
2966 Cloned->setName(Instr->getName() + ".cloned");
2967
2968 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2969 Builder.GetInsertPoint());
2970 // Replace the operands of the cloned instructions with their scalar
2971 // equivalents in the new loop.
2972 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2973 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2974 auto InputInstance = Instance;
2975 if (!Operand || !OrigLoop->contains(Operand) ||
2976 (Cost->isUniformAfterVectorization(Operand, State.VF)))
2977 InputInstance.Lane = VPLane::getFirstLane();
2978 auto *NewOp = State.get(User.getOperand(op), InputInstance);
2979 Cloned->setOperand(op, NewOp);
2980 }
2981 addNewMetadata(Cloned, Instr);
2982
2983 // Place the cloned scalar in the new loop.
2984 Builder.Insert(Cloned);
2985
2986 State.set(Def, Cloned, Instance);
2987
2988 // If we just cloned a new assumption, add it the assumption cache.
2989 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2990 if (II->getIntrinsicID() == Intrinsic::assume)
2991 AC->registerAssumption(II);
2992
2993 // End if-block.
2994 if (IfPredicateInstr)
2995 PredicatedInstructions.push_back(Cloned);
2996}
2997
2998PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2999 Value *End, Value *Step,
3000 Instruction *DL) {
3001 BasicBlock *Header = L->getHeader();
3002 BasicBlock *Latch = L->getLoopLatch();
3003 // As we're just creating this loop, it's possible no latch exists
3004 // yet. If so, use the header as this will be a single block loop.
3005 if (!Latch)
3006 Latch = Header;
3007
3008 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3009 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3010 setDebugLocFromInst(Builder, OldInst);
3011 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3012
3013 Builder.SetInsertPoint(Latch->getTerminator());
3014 setDebugLocFromInst(Builder, OldInst);
3015
3016 // Create i+1 and fill the PHINode.
3017 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3018 Induction->addIncoming(Start, L->getLoopPreheader());
3019 Induction->addIncoming(Next, Latch);
3020 // Create the compare.
3021 Value *ICmp = Builder.CreateICmpEQ(Next, End);
3022 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3023
3024 // Now we have two terminators. Remove the old one from the block.
3025 Latch->getTerminator()->eraseFromParent();
3026
3027 return Induction;
3028}
3029
3030Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3031 if (TripCount)
3032 return TripCount;
3033
3034 assert(L && "Create Trip Count for null loop.");
3035 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3036 // Find the loop boundaries.
3037 ScalarEvolution *SE = PSE.getSE();
3038 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3039 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3040        "Invalid loop count");
3041
3042 Type *IdxTy = Legal->getWidestInductionType();
3043 assert(IdxTy && "No type for induction");
3044
3045 // The exit count might have type i64 while the phi is i32. This can happen
3046 // if we have an induction variable that is sign-extended before the compare.
3047 // The only way we get a backedge-taken count in that situation is if the
3048 // induction variable was signed and therefore will not overflow, in which
3049 // case the truncation is legal.
3050 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3051 IdxTy->getPrimitiveSizeInBits())
3052 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3053 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3054
3055 // Get the total trip count from the count by adding 1.
3056 const SCEV *ExitCount = SE->getAddExpr(
3057 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3058
3059 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3060
3061 // Expand the trip count and place the new instructions in the preheader.
3062 // Notice that the pre-header does not change, only the loop body.
3063 SCEVExpander Exp(*SE, DL, "induction");
3064
3065 // Count holds the overall loop count (N).
3066 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3067 L->getLoopPreheader()->getTerminator());
3068
3069 if (TripCount->getType()->isPointerTy())
3070 TripCount =
3071 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3072 L->getLoopPreheader()->getTerminator());
3073
3074 return TripCount;
3075}
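// Worked example for the expansion above: with a backedge-taken count of 99
// the expanded trip count is 99 + 1 = 100. If the widest induction type is
// i32 but the count was computed as i64, the count is first truncated to i32
// (or zero-extended in the opposite direction) before the +1 is added.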
3076
3077Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3078 if (VectorTripCount)
3079 return VectorTripCount;
3080
3081 Value *TC = getOrCreateTripCount(L);
3082 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3083
3084 Type *Ty = TC->getType();
3085 // This is where we can make the step a runtime constant.
3086 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3087
3088 // If the tail is to be folded by masking, round the number of iterations N
3089 // up to a multiple of Step instead of rounding down. This is done by first
3090 // adding Step-1 and then rounding down. Note that it's ok if this addition
3091 // overflows: the vector induction variable will eventually wrap to zero given
3092 // that it starts at zero and its Step is a power of two; the loop will then
3093 // exit, with the last early-exit vector comparison also producing all-true.
3094 if (Cost->foldTailByMasking()) {
3095 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3096 "VF*UF must be a power of 2 when folding tail by masking");
3097 assert(!VF.isScalable() &&
3098 "Tail folding not yet supported for scalable vectors");
3099 TC = Builder.CreateAdd(
3100 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3101 }
3102
3103 // Now we need to generate the expression for the part of the loop that the
3104 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3105 // iterations are not required for correctness, or N - Step, otherwise. Step
3106 // is equal to the vectorization factor (number of SIMD elements) times the
3107 // unroll factor (number of SIMD instructions).
3108 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3109
3110 // There are two cases where we need to ensure (at least) the last iteration
3111 // runs in the scalar remainder loop. Thus, if the step evenly divides
3112 // the trip count, we set the remainder to be equal to the step. If the step
3113 // does not evenly divide the trip count, no adjustment is necessary since
3114 // there will already be scalar iterations. Note that the minimum iterations
3115 // check ensures that N >= Step. The cases are:
3116 // 1) If there is a non-reversed interleaved group that may speculatively
3117 // access memory out-of-bounds.
3118 // 2) If any instruction may follow a conditionally taken exit. That is, if
3119 // the loop contains multiple exiting blocks, or a single exiting block
3120 // which is not the latch.
3121 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3122 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3123 R = Builder.CreateSelect(IsZero, Step, R);
3124 }
3125
3126 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3127
3128 return VectorTripCount;
3129}
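// Illustrative arithmetic for the computation above, assuming Step = VF*UF = 8:
//   - N = 20, no tail folding: n.mod.vf = 20 % 8 = 4, n.vec = 16; the last 4
//     iterations run in the scalar remainder loop.
//   - N = 16 with a required scalar epilogue: the remainder would be 0, so it
//     is bumped to 8 and n.vec = 8, keeping the final iterations scalar.
//   - N = 20 with tail folding: N is first rounded up to 24, so n.vec = 24 and
//     the masked vector loop covers every iteration.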
3130
3131Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3132 const DataLayout &DL) {
3133 // Verify that V is a vector type with same number of elements as DstVTy.
3134 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3135 unsigned VF = DstFVTy->getNumElements();
3136 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3137 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3138 Type *SrcElemTy = SrcVecTy->getElementType();
3139 Type *DstElemTy = DstFVTy->getElementType();
3140 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3141 "Vector elements must have same size");
3142
3143 // Do a direct cast if element types are castable.
3144 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3145 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3146 }
3147 // V cannot be directly cast to the desired vector type.
3148 // May happen when V is a floating point vector but DstVTy is a vector of
3149 // pointers or vice-versa. Handle this using a two-step bitcast using an
3150 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3151 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3152 "Only one type should be a pointer type");
3153 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3154 "Only one type should be a floating point type");
3155 Type *IntTy =
3156 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3157 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3158 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3159 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3160}
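// Sketch of the two-step cast above for a hypothetical <4 x double> source
// and <4 x i8*> destination (element sizes match, assuming 64-bit pointers):
//   step 1: bitcast  <4 x double> to <4 x i64>
//   step 2: inttoptr <4 x i64>    to <4 x i8*>
// CreateBitOrPointerCast picks bitcast, inttoptr or ptrtoint as appropriate.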
3161
3162void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3163 BasicBlock *Bypass) {
3164 Value *Count = getOrCreateTripCount(L);
3165 // Reuse existing vector loop preheader for TC checks.
3166 // Note that a new preheader block is generated for the vector loop.
3167 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3168 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3169
3170 // Generate code to check if the loop's trip count is less than VF * UF, or
3171 // equal to it in case a scalar epilogue is required; this implies that the
3172 // vector trip count is zero. This check also covers the case where adding one
3173 // to the backedge-taken count overflowed leading to an incorrect trip count
3174 // of zero. In this case we will also jump to the scalar loop.
3175 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3176 : ICmpInst::ICMP_ULT;
3177
3178 // If tail is to be folded, vector loop takes care of all iterations.
3179 Value *CheckMinIters = Builder.getFalse();
3180 if (!Cost->foldTailByMasking()) {
3181 Value *Step =
3182 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3183 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3184 }
3185 // Create new preheader for vector loop.
3186 LoopVectorPreHeader =
3187 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3188 "vector.ph");
3189
3190 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3191 DT->getNode(Bypass)->getIDom()) &&
3192 "TC check is expected to dominate Bypass");
3193
3194 // Update dominator for Bypass & LoopExit.
3195 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3196 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3197
3198 ReplaceInstWithInst(
3199 TCCheckBlock->getTerminator(),
3200 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3201 LoopBypassBlocks.push_back(TCCheckBlock);
3202}
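// Illustrative control flow produced above (names as created in this
// function; the bypass target is the scalar preheader in practice):
//
//   old preheader (TCCheckBlock):
//     min.iters.check = icmp ult/ule count, VF * UF
//     br min.iters.check, Bypass, vector.ph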
3203
3204BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3205
3206 BasicBlock *const SCEVCheckBlock =
3207 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3208 if (!SCEVCheckBlock)
3209 return nullptr;
3210
3211 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3212 (OptForSizeBasedOnProfile &&
3213 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3214 "Cannot SCEV check stride or overflow when optimizing for size");
3215
3216
3217 // Update dominator only if this is the first RT check.
3218 if (LoopBypassBlocks.empty()) {
3219 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3220 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3221 }
3222
3223 LoopBypassBlocks.push_back(SCEVCheckBlock);
3224 AddedSafetyChecks = true;
3225 return SCEVCheckBlock;
3226}
3227
3228BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3229 BasicBlock *Bypass) {
3230 // VPlan-native path does not do any analysis for runtime checks currently.
3231 if (EnableVPlanNativePath)
3232 return nullptr;
3233
3234 BasicBlock *const MemCheckBlock =
3235 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3236
3237 // Check if we generated code that checks at runtime whether arrays overlap. We put
3238 // the checks into a separate block to make the more common case of few
3239 // elements faster.
3240 if (!MemCheckBlock)
3241 return nullptr;
3242
3243 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3244 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3245 "Cannot emit memory checks when optimizing for size, unless forced "
3246 "to vectorize.");
3247 ORE->emit([&]() {
3248 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3249 L->getStartLoc(), L->getHeader())
3250 << "Code-size may be reduced by not forcing "
3251 "vectorization, or by source-code modifications "
3252 "eliminating the need for runtime checks "
3253 "(e.g., adding 'restrict').";
3254 });
3255 }
3256
3257 LoopBypassBlocks.push_back(MemCheckBlock);
3258
3259 AddedSafetyChecks = true;
3260
3261 // We currently don't use LoopVersioning for the actual loop cloning but we
3262 // still use it to add the noalias metadata.
3263 LVer = std::make_unique<LoopVersioning>(
3264 *Legal->getLAI(),
3265 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3266 DT, PSE.getSE());
3267 LVer->prepareNoAliasMetadata();
3268 return MemCheckBlock;
3269}
3270
3271Value *InnerLoopVectorizer::emitTransformedIndex(
3272 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3273 const InductionDescriptor &ID) const {
3274
3275 SCEVExpander Exp(*SE, DL, "induction");
3276 auto Step = ID.getStep();
3277 auto StartValue = ID.getStartValue();
3278 assert(Index->getType() == Step->getType() &&
3279 "Index type does not match StepValue type");
3280
3281 // Note: the IR at this point is broken. We cannot use SE to create any new
3282 // SCEV and then expand it, hoping that SCEV's simplification will give us
3283 // better code. Unfortunately, attempting to do so on invalid IR may
3284 // lead to various SCEV crashes. So all we can do is use the builder and rely
3285 // on InstCombine for future simplifications. Here we handle some trivial
3286 // cases only.
3287 auto CreateAdd = [&B](Value *X, Value *Y) {
3288 assert(X->getType() == Y->getType() && "Types don't match!");
3289 if (auto *CX = dyn_cast<ConstantInt>(X))
3290 if (CX->isZero())
3291 return Y;
3292 if (auto *CY = dyn_cast<ConstantInt>(Y))
3293 if (CY->isZero())
3294 return X;
3295 return B.CreateAdd(X, Y);
3296 };
3297
3298 auto CreateMul = [&B](Value *X, Value *Y) {
3299 assert(X->getType() == Y->getType() && "Types don't match!");
3300 if (auto *CX = dyn_cast<ConstantInt>(X))
3301 if (CX->isOne())
3302 return Y;
3303 if (auto *CY = dyn_cast<ConstantInt>(Y))
3304 if (CY->isOne())
3305 return X;
3306 return B.CreateMul(X, Y);
3307 };
3308
3309 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3310 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3311 // the DomTree is not kept up-to-date for additional blocks generated in the
3312 // vector loop. By using the header as insertion point, we guarantee that the
3313 // expanded instructions dominate all their uses.
3314 auto GetInsertPoint = [this, &B]() {
3315 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3316 if (InsertBB != LoopVectorBody &&
3317 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3318 return LoopVectorBody->getTerminator();
3319 return &*B.GetInsertPoint();
3320 };
3321
3322 switch (ID.getKind()) {
3323 case InductionDescriptor::IK_IntInduction: {
3324 assert(Index->getType() == StartValue->getType() &&
3325 "Index type does not match StartValue type");
3326 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3327 return B.CreateSub(StartValue, Index);
3328 auto *Offset = CreateMul(
3329 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3330 return CreateAdd(StartValue, Offset);
3331 }
3332 case InductionDescriptor::IK_PtrInduction: {
3333 assert(isa<SCEVConstant>(Step) &&
3334 "Expected constant step for pointer induction");
3335 return B.CreateGEP(
3336 StartValue->getType()->getPointerElementType(), StartValue,
3337 CreateMul(Index,
3338 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3339 }
3340 case InductionDescriptor::IK_FpInduction: {
3341 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3342 auto InductionBinOp = ID.getInductionBinOp();
3343 assert(InductionBinOp &&
3344 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3345 InductionBinOp->getOpcode() == Instruction::FSub) &&
3346 "Original bin op should be defined for FP induction");
3347
3348 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3349 Value *MulExp = B.CreateFMul(StepValue, Index);
3350 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3351 "induction");
3352 }
3353 case InductionDescriptor::IK_NoInduction:
3354 return nullptr;
3355 }
3356 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3356)
;
3357}
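// In summary, the transformed index computed above is, per induction kind:
//   integer: Start + Index * Step          (Start - Index when the step is -1)
//   pointer: gep ElemTy, Start, Index * Step
//   float:   Start fadd/fsub (Step * Index), using the original binary op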
3358
3359Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3360 LoopScalarBody = OrigLoop->getHeader();
3361 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3362 LoopExitBlock = OrigLoop->getUniqueExitBlock();
3363 assert(LoopExitBlock && "Must have an exit block");
3364 assert(LoopVectorPreHeader && "Invalid loop structure");
3365
3366 LoopMiddleBlock =
3367 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3368 LI, nullptr, Twine(Prefix) + "middle.block");
3369 LoopScalarPreHeader =
3370 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3371 nullptr, Twine(Prefix) + "scalar.ph");
3372
3373 // Set up branch from middle block to the exit and scalar preheader blocks.
3374 // completeLoopSkeleton will update the condition to use an iteration check,
3375 // if required to decide whether to execute the remainder.
3376 BranchInst *BrInst =
3377 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3378 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3379 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3380 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3381
3382 // We intentionally don't let SplitBlock update LoopInfo, since
3383 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3384 // LoopVectorBody is explicitly added to the correct place a few lines later.
3385 LoopVectorBody =
3386 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3387 nullptr, nullptr, Twine(Prefix) + "vector.body");
3388
3389 // Update dominator for loop exit.
3390 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3391
3392 // Create and register the new vector loop.
3393 Loop *Lp = LI->AllocateLoop();
3394 Loop *ParentLoop = OrigLoop->getParentLoop();
3395
3396 // Insert the new loop into the loop nest and register the new basic blocks
3397 // before calling any utilities such as SCEV that require valid LoopInfo.
3398 if (ParentLoop) {
3399 ParentLoop->addChildLoop(Lp);
3400 } else {
3401 LI->addTopLevelLoop(Lp);
3402 }
3403 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3404 return Lp;
3405}
3406
3407void InnerLoopVectorizer::createInductionResumeValues(
3408 Loop *L, Value *VectorTripCount,
3409 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3410 assert(VectorTripCount && L && "Expected valid arguments");
3411 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3412 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3413 "Inconsistent information about additional bypass.");
3414 // We are going to resume the execution of the scalar loop.
3415 // Go over all of the induction variables that we found and fix the
3416 // PHIs that are left in the scalar version of the loop.
3417 // The starting values of PHI nodes depend on the counter of the last
3418 // iteration in the vectorized loop.
3419 // If we come from a bypass edge then we need to start from the original
3420 // start value.
3421 for (auto &InductionEntry : Legal->getInductionVars()) {
3422 PHINode *OrigPhi = InductionEntry.first;
3423 InductionDescriptor II = InductionEntry.second;
3424
3425 // Create phi nodes to merge from the backedge-taken check block.
3426 PHINode *BCResumeVal =
3427 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3428 LoopScalarPreHeader->getTerminator());
3429 // Copy original phi DL over to the new one.
3430 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3431 Value *&EndValue = IVEndValues[OrigPhi];
3432 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3433 if (OrigPhi == OldInduction) {
3434 // We know what the end value is.
3435 EndValue = VectorTripCount;
3436 } else {
3437 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3438
3439 // Fast-math-flags propagate from the original induction instruction.
3440 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3441 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3442
3443 Type *StepType = II.getStep()->getType();
3444 Instruction::CastOps CastOp =
3445 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3446 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3447 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3448 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3449 EndValue->setName("ind.end");
3450
3451 // Compute the end value for the additional bypass (if applicable).
3452 if (AdditionalBypass.first) {
3453 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3454 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3455 StepType, true);
3456 CRD =
3457 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3458 EndValueFromAdditionalBypass =
3459 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3460 EndValueFromAdditionalBypass->setName("ind.end");
3461 }
3462 }
3463 // The new PHI merges the original incoming value, in case of a bypass,
3464 // or the value at the end of the vectorized loop.
3465 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3466
3467 // Fix the scalar body counter (PHI node).
3468 // The old induction's phi node in the scalar body needs the truncated
3469 // value.
3470 for (BasicBlock *BB : LoopBypassBlocks)
3471 BCResumeVal->addIncoming(II.getStartValue(), BB);
3472
3473 if (AdditionalBypass.first)
3474 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3475 EndValueFromAdditionalBypass);
3476
3477 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3478 }
3479}
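// Illustrative shape of a resume phi created above for a non-primary
// induction (names follow the ones used in this function):
//
//   scalar.ph:
//     bc.resume.val = phi [ ind.end, middle.block ],
//                         [ start value, bypass block ]   ; one per bypass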
3480
3481BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3482 MDNode *OrigLoopID) {
3483 assert(L && "Expected valid loop.");
3484
3485 // The trip counts should be cached by now.
3486 Value *Count = getOrCreateTripCount(L);
3487 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3488
3489 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3490
3491 // Add a check in the middle block to see if we have completed
3492 // all of the iterations in the first vector loop.
3493 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3494 // If tail is to be folded, we know we don't need to run the remainder.
3495 if (!Cost->foldTailByMasking()) {
3496 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3497 Count, VectorTripCount, "cmp.n",
3498 LoopMiddleBlock->getTerminator());
3499
3500 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3501 // of the corresponding compare because they may have ended up with
3502 // different line numbers and we want to avoid awkward line stepping while
3503 // debugging, e.g. if the compare has a line number inside the loop.
3504 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3505 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3506 }
3507
3508 // Get ready to start creating new instructions into the vectorized body.
3509 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3510 "Inconsistent vector loop preheader");
3511 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3512
3513 Optional<MDNode *> VectorizedLoopID =
3514 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3515 LLVMLoopVectorizeFollowupVectorized});
3516 if (VectorizedLoopID.hasValue()) {
3517 L->setLoopID(VectorizedLoopID.getValue());
3518
3519 // Do not setAlreadyVectorized if loop attributes have been defined
3520 // explicitly.
3521 return LoopVectorPreHeader;
3522 }
3523
3524 // Keep all loop hints from the original loop on the vector loop (we'll
3525 // replace the vectorizer-specific hints below).
3526 if (MDNode *LID = OrigLoop->getLoopID())
3527 L->setLoopID(LID);
3528
3529 LoopVectorizeHints Hints(L, true, *ORE);
3530 Hints.setAlreadyVectorized();
3531
3532#ifdef EXPENSIVE_CHECKS
3533 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3534 LI->verify(*DT);
3535#endif
3536
3537 return LoopVectorPreHeader;
3538}
3539
3540BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3541 /*
3542 In this function we generate a new loop. The new loop will contain
3543 the vectorized instructions while the old loop will continue to run the
3544 scalar remainder.
3545
3546 [ ] <-- loop iteration number check.
3547 / |
3548 / v
3549 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3550 | / |
3551 | / v
3552 || [ ] <-- vector pre header.
3553 |/ |
3554 | v
3555 | [ ] \
3556 | [ ]_| <-- vector loop.
3557 | |
3558 | v
3559 | -[ ] <--- middle-block.
3560 | / |
3561 | / v
3562 -|- >[ ] <--- new preheader.
3563 | |
3564 | v
3565 | [ ] \
3566 | [ ]_| <-- old scalar loop to handle remainder.
3567 \ |
3568 \ v
3569 >[ ] <-- exit block.
3570 ...
3571 */
3572
3573 // Get the metadata of the original loop before it gets modified.
3574 MDNode *OrigLoopID = OrigLoop->getLoopID();
3575
3576 // Create an empty vector loop, and prepare basic blocks for the runtime
3577 // checks.
3578 Loop *Lp = createVectorLoopSkeleton("");
3579
3580 // Now, compare the new count to zero. If it is zero skip the vector loop and
3581 // jump to the scalar loop. This check also covers the case where the
3582 // backedge-taken count is uint##_max: adding one to it will overflow leading
3583 // to an incorrect trip count of zero. In this (rare) case we will also jump
3584 // to the scalar loop.
3585 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3586
3587 // Generate the code to check any assumptions that we've made for SCEV
3588 // expressions.
3589 emitSCEVChecks(Lp, LoopScalarPreHeader);
3590
3591 // Generate the code that checks in runtime if arrays overlap. We put the
3592 // checks into a separate block to make the more common case of few elements
3593 // faster.
3594 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3595
3596 // Some loops have a single integer induction variable, while other loops
3597 // don't. One example is C++ iterators, which often have multiple pointer
3598 // induction variables. In the code below we also support a case where we
3599 // don't have a single induction variable.
3600 //
3601 // We try to obtain an induction variable from the original loop as hard
3602 // as possible. However if we don't find one that:
3603 // - is an integer
3604 // - counts from zero, stepping by one
3605 // - is the size of the widest induction variable type
3606 // then we create a new one.
3607 OldInduction = Legal->getPrimaryInduction();
3608 Type *IdxTy = Legal->getWidestInductionType();
3609 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3610 // The loop step is equal to the vectorization factor (num of SIMD elements)
3611 // times the unroll factor (num of SIMD instructions).
3612 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3613 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3614 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3615 Induction =
3616 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3617 getDebugLocFromInstOrOperands(OldInduction));
3618
3619 // Emit phis for the new starting index of the scalar loop.
3620 createInductionResumeValues(Lp, CountRoundDown);
3621
3622 return completeLoopSkeleton(Lp, OrigLoopID);
3623}
3624
3625// Fix up external users of the induction variable. At this point, we are
3626// in LCSSA form, with all external PHIs that use the IV having one input value,
3627// coming from the remainder loop. We need those PHIs to also have a correct
3628// value for the IV when arriving directly from the middle block.
3629void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3630 const InductionDescriptor &II,
3631 Value *CountRoundDown, Value *EndValue,
3632 BasicBlock *MiddleBlock) {
3633 // There are two kinds of external IV uses: those that use the value
3634 // computed in the last iteration (the PHI) and those that use the penultimate
3635 // value (the value that feeds into the phi from the loop latch).
3636 // We allow both, but they obviously have different values.
3637
3638 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3639
3640 DenseMap<Value *, Value *> MissingVals;
3641
3642 // An external user of the last iteration's value should see the value that
3643 // the remainder loop uses to initialize its own IV.
3644 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3645 for (User *U : PostInc->users()) {
3646 Instruction *UI = cast<Instruction>(U);
3647 if (!OrigLoop->contains(UI)) {
3648 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3649 MissingVals[UI] = EndValue;
3650 }
3651 }
3652
3653 // An external user of the penultimate value needs to see EndValue - Step.
3654 // The simplest way to get this is to recompute it from the constituent SCEVs,
3655 // that is Start + (Step * (CRD - 1)).
3656 for (User *U : OrigPhi->users()) {
3657 auto *UI = cast<Instruction>(U);
3658 if (!OrigLoop->contains(UI)) {
3659 const DataLayout &DL =
3660 OrigLoop->getHeader()->getModule()->getDataLayout();
3661 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3662
3663 IRBuilder<> B(MiddleBlock->getTerminator());
3664
3665 // Fast-math-flags propagate from the original induction instruction.
3666 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3667 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3668
3669 Value *CountMinusOne = B.CreateSub(
3670 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3671 Value *CMO =
3672 !II.getStep()->getType()->isIntegerTy()
3673 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3674 II.getStep()->getType())
3675 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3676 CMO->setName("cast.cmo");
3677 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3678 Escape->setName("ind.escape");
3679 MissingVals[UI] = Escape;
3680 }
3681 }
3682
3683 for (auto &I : MissingVals) {
3684 PHINode *PHI = cast<PHINode>(I.first);
3685 // One corner case we have to handle is two IVs "chasing" each-other,
3686 // that is %IV2 = phi [...], [ %IV1, %latch ]
3687 // In this case, if IV1 has an external use, we need to avoid adding both
3688 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3689 // don't already have an incoming value for the middle block.
3690 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3691 PHI->addIncoming(I.second, MiddleBlock);
3692 }
3693}
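// Worked example for the fixup above: with start = 0, step = 1 and
// CountRoundDown = 8, an external user of the post-increment value sees 8
// (EndValue), while an external user of the phi itself sees the penultimate
// value 7, i.e. Start + Step * (CountRoundDown - 1).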
3694
3695namespace {
3696
3697struct CSEDenseMapInfo {
3698 static bool canHandle(const Instruction *I) {
3699 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3700 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3701 }
3702
3703 static inline Instruction *getEmptyKey() {
3704 return DenseMapInfo<Instruction *>::getEmptyKey();
3705 }
3706
3707 static inline Instruction *getTombstoneKey() {
3708 return DenseMapInfo<Instruction *>::getTombstoneKey();
3709 }
3710
3711 static unsigned getHashValue(const Instruction *I) {
3712 assert(canHandle(I) && "Unknown instruction!");
3713 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3714 I->value_op_end()));
3715 }
3716
3717 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3718 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3719 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3720 return LHS == RHS;
3721 return LHS->isIdenticalTo(RHS);
3722 }
3723};
3724
3725} // end anonymous namespace
3726
3727 /// Perform CSE of induction variable instructions.
3728static void cse(BasicBlock *BB) {
3729 // Perform simple cse.
3730 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3731 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3732 Instruction *In = &*I++;
3733
3734 if (!CSEDenseMapInfo::canHandle(In))
3735 continue;
3736
3737 // Check if we can replace this instruction with any of the
3738 // visited instructions.
3739 if (Instruction *V = CSEMap.lookup(In)) {
3740 In->replaceAllUsesWith(V);
3741 In->eraseFromParent();
3742 continue;
3743 }
3744
3745 CSEMap[In] = In;
3746 }
3747}
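// For illustration, two structurally identical instructions such as
//   %a = extractelement <4 x i32> %v, i32 0
//   %b = extractelement <4 x i32> %v, i32 0
// hash to the same key above, so %b is replaced by %a and erased.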
3748
3749InstructionCost
3750LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3751 bool &NeedToScalarize) {
3752 Function *F = CI->getCalledFunction();
3753 Type *ScalarRetTy = CI->getType();
3754 SmallVector<Type *, 4> Tys, ScalarTys;
3755 for (auto &ArgOp : CI->arg_operands())
3756 ScalarTys.push_back(ArgOp->getType());
3757
3758 // Estimate cost of scalarized vector call. The source operands are assumed
3759 // to be vectors, so we need to extract individual elements from them,
3760 // execute VF scalar calls, and then gather the result into the vector return
3761 // value.
3762 InstructionCost ScalarCallCost =
3763 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3764 if (VF.isScalar())
3765 return ScalarCallCost;
3766
3767 // Compute corresponding vector type for return value and arguments.
3768 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3769 for (Type *ScalarTy : ScalarTys)
3770 Tys.push_back(ToVectorTy(ScalarTy, VF));
3771
3772 // Compute costs of unpacking argument values for the scalar calls and
3773 // packing the return values to a vector.
3774 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3775
3776 InstructionCost Cost =
3777 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3778
3779 // If we can't emit a vector call for this function, then the currently found
3780 // cost is the cost we need to return.
3781 NeedToScalarize = true;
3782 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3783 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3784
3785 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3786 return Cost;
3787
3788 // If the corresponding vector cost is cheaper, return its cost.
3789 InstructionCost VectorCallCost =
3790 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3791 if (VectorCallCost < Cost) {
3792 NeedToScalarize = false;
3793 Cost = VectorCallCost;
3794 }
3795 return Cost;
3796}
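// The decision above boils down to comparing
//   VF * ScalarCallCost + ScalarizationCost   (scalarize the call)
// against
//   VectorCallCost                            (call an available vector variant)
// and returning the cheaper cost; NeedToScalarize records which was chosen.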
3797
3798static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3799 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3800 return Elt;
3801 return VectorType::get(Elt, VF);
3802}
3803
3804InstructionCost
3805LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3806 ElementCount VF) {
3807 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3808 assert(ID && "Expected intrinsic call!");
3809 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3810 FastMathFlags FMF;
3811 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3812 FMF = FPMO->getFastMathFlags();
3813
3814 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3815 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3816 SmallVector<Type *> ParamTys;
3817 std::transform(FTy->param_begin(), FTy->param_end(),
3818 std::back_inserter(ParamTys),
3819 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3820
3821 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3822 dyn_cast<IntrinsicInst>(CI));
3823 return TTI.getIntrinsicInstrCost(CostAttrs,
3824 TargetTransformInfo::TCK_RecipThroughput);
3825}
3826
3827static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3828 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3829 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3830 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3831}
3832
3833static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3834 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3835 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3836 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3837}
3838
3839void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3840 // For every instruction `I` in MinBWs, truncate the operands, create a
3841 // truncated version of `I` and reextend its result. InstCombine runs
3842 // later and will remove any ext/trunc pairs.
3843 SmallPtrSet<Value *, 4> Erased;
3844 for (const auto &KV : Cost->getMinimalBitwidths()) {
3845 // If the value wasn't vectorized, we must maintain the original scalar
3846 // type. The absence of the value from State indicates that it
3847 // wasn't vectorized.
3848 VPValue *Def = State.Plan->getVPValue(KV.first);
3849 if (!State.hasAnyVectorValue(Def))
3850 continue;
3851 for (unsigned Part = 0; Part < UF; ++Part) {
3852 Value *I = State.get(Def, Part);
3853 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3854 continue;
3855 Type *OriginalTy = I->getType();
3856 Type *ScalarTruncatedTy =
3857 IntegerType::get(OriginalTy->getContext(), KV.second);
3858 auto *TruncatedTy = FixedVectorType::get(
3859 ScalarTruncatedTy,
3860 cast<FixedVectorType>(OriginalTy)->getNumElements());
3861 if (TruncatedTy == OriginalTy)
3862 continue;
3863
3864 IRBuilder<> B(cast<Instruction>(I));
3865 auto ShrinkOperand = [&](Value *V) -> Value * {
3866 if (auto *ZI = dyn_cast<ZExtInst>(V))
3867 if (ZI->getSrcTy() == TruncatedTy)
3868 return ZI->getOperand(0);
3869 return B.CreateZExtOrTrunc(V, TruncatedTy);
3870 };
3871
3872 // The actual instruction modification depends on the instruction type,
3873 // unfortunately.
3874 Value *NewI = nullptr;
3875 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3876 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3877 ShrinkOperand(BO->getOperand(1)));
3878
3879 // Any wrapping introduced by shrinking this operation shouldn't be
3880 // considered undefined behavior. So, we can't unconditionally copy
3881 // arithmetic wrapping flags to NewI.
3882 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3883 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3884 NewI =
3885 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3886 ShrinkOperand(CI->getOperand(1)));
3887 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3888 NewI = B.CreateSelect(SI->getCondition(),
3889 ShrinkOperand(SI->getTrueValue()),
3890 ShrinkOperand(SI->getFalseValue()));
3891 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3892 switch (CI->getOpcode()) {
3893 default:
3894 llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3894)
;
3895 case Instruction::Trunc:
3896 NewI = ShrinkOperand(CI->getOperand(0));
3897 break;
3898 case Instruction::SExt:
3899 NewI = B.CreateSExtOrTrunc(
3900 CI->getOperand(0),
3901 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3902 break;
3903 case Instruction::ZExt:
3904 NewI = B.CreateZExtOrTrunc(
3905 CI->getOperand(0),
3906 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3907 break;
3908 }
3909 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3910 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3911 ->getNumElements();
3912 auto *O0 = B.CreateZExtOrTrunc(
3913 SI->getOperand(0),
3914 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3915 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3916 ->getNumElements();
3917 auto *O1 = B.CreateZExtOrTrunc(
3918 SI->getOperand(1),
3919 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3920
3921 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3922 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3923 // Don't do anything with the operands, just extend the result.
3924 continue;
3925 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3926 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3927 ->getNumElements();
3928 auto *O0 = B.CreateZExtOrTrunc(
3929 IE->getOperand(0),
3930 FixedVectorType::get(ScalarTruncatedTy, Elements));
3931 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3932 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3933 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3934 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3935 ->getNumElements();
3936 auto *O0 = B.CreateZExtOrTrunc(
3937 EE->getOperand(0),
3938 FixedVectorType::get(ScalarTruncatedTy, Elements));
3939 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3940 } else {
3941 // If we don't know what to do, be conservative and don't do anything.
3942 continue;
3943 }
3944
3945 // Lastly, extend the result.
3946 NewI->takeName(cast<Instruction>(I));
3947 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3948 I->replaceAllUsesWith(Res);
3949 cast<Instruction>(I)->eraseFromParent();
3950 Erased.insert(I);
3951 State.reset(Def, Res, Part);
3952 }
3953 }
3954
3955 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3956 for (const auto &KV : Cost->getMinimalBitwidths()) {
3957 // If the value wasn't vectorized, we must maintain the original scalar
3958 // type. The absence of the value from State indicates that it
3959 // wasn't vectorized.
3960 VPValue *Def = State.Plan->getVPValue(KV.first);
3961 if (!State.hasAnyVectorValue(Def))
3962 continue;
3963 for (unsigned Part = 0; Part < UF; ++Part) {
3964 Value *I = State.get(Def, Part);
3965 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3966 if (Inst && Inst->use_empty()) {
3967 Value *NewI = Inst->getOperand(0);
3968 Inst->eraseFromParent();
3969 State.reset(Def, NewI, Part);
3970 }
3971 }
3972 }
3973}
3974
3975void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3976 // Insert truncates and extends for any truncated instructions as hints to
3977 // InstCombine.
3978 if (VF.isVector())
3979 truncateToMinimalBitwidths(State);
3980
3981 // Fix widened non-induction PHIs by setting up the PHI operands.
3982 if (OrigPHIsToFix.size()) {
3983 assert(EnableVPlanNativePath &&
3984 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3985 fixNonInductionPHIs(State);
3986 }
3987
3988 // At this point every instruction in the original loop is widened to a
3989 // vector form. Now we need to fix the recurrences in the loop. These PHI
3990 // nodes are currently empty because we did not want to introduce cycles.
3991 // This is the second stage of vectorizing recurrences.
3992 fixCrossIterationPHIs(State);
3993
3994 // Forget the original basic block.
3995 PSE.getSE()->forgetLoop(OrigLoop);
3996
3997 // Fix-up external users of the induction variables.
3998 for (auto &Entry : Legal->getInductionVars())
3999 fixupIVUsers(Entry.first, Entry.second,
4000 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4001 IVEndValues[Entry.first], LoopMiddleBlock);
4002
4003 fixLCSSAPHIs(State);
4004 for (Instruction *PI : PredicatedInstructions)
4005 sinkScalarOperands(&*PI);
4006
4007 // Remove redundant induction instructions.
4008 cse(LoopVectorBody);
4009
4010 // Set/update profile weights for the vector and remainder loops as original
4011 // loop iterations are now distributed among them. Note that original loop
4012 // represented by LoopScalarBody becomes remainder loop after vectorization.
4013 //
4014 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4015 // end up getting a slightly roughened result but that should be OK since
4016 // profile is not inherently precise anyway. Note also possible bypass of
4017 // vector code caused by legality checks is ignored, assigning all the weight
4018 // to the vector loop, optimistically.
4019 //
4020 // For scalable vectorization we can't know at compile time how many iterations
4021 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4022 // vscale of '1'.
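// For example, with VF = 4 and UF = 2 the call below redistributes the
// weights as if the original loop had been unrolled by a factor of 8.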
4023 setProfileInfoAfterUnrolling(
4024 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4025 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4026}
4027
4028void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4029 // In order to support recurrences we need to be able to vectorize Phi nodes.
4030 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4031 // stage #2: We now need to fix the recurrences by adding incoming edges to
4032 // the currently empty PHI nodes. At this point every instruction in the
4033 // original loop is widened to a vector form so we can use them to construct
4034 // the incoming edges.
4035 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4036 // Handle first-order recurrences and reductions that need to be fixed.
4037 if (Legal->isFirstOrderRecurrence(&Phi))
4038 fixFirstOrderRecurrence(&Phi, State);
4039 else if (Legal->isReductionVariable(&Phi))
4040 fixReduction(&Phi, State);
4041 }
4042}
4043
4044void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4045 VPTransformState &State) {
4046 // This is the second phase of vectorizing first-order recurrences. An
4047 // overview of the transformation is described below. Suppose we have the
4048 // following loop.
4049 //
4050 // for (int i = 0; i < n; ++i)
4051 // b[i] = a[i] - a[i - 1];
4052 //
4053 // There is a first-order recurrence on "a". For this loop, the shorthand
4054 // scalar IR looks like:
4055 //
4056 // scalar.ph:
4057 // s_init = a[-1]
4058 // br scalar.body
4059 //
4060 // scalar.body:
4061 // i = phi [0, scalar.ph], [i+1, scalar.body]
4062 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4063 // s2 = a[i]
4064 // b[i] = s2 - s1
4065 // br cond, scalar.body, ...
4066 //
4067 // In this example, s1 is a recurrence because its value depends on the
4068 // previous iteration. In the first phase of vectorization, we created a
4069 // temporary value for s1. We now complete the vectorization and produce the
4070 // shorthand vector IR shown below (for VF = 4, UF = 1).
4071 //
4072 // vector.ph:
4073 // v_init = vector(..., ..., ..., a[-1])
4074 // br vector.body
4075 //
4076 // vector.body
4077 // i = phi [0, vector.ph], [i+4, vector.body]
4078 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4079 // v2 = a[i, i+1, i+2, i+3];
4080 // v3 = vector(v1(3), v2(0, 1, 2))
4081 // b[i, i+1, i+2, i+3] = v2 - v3
4082 // br cond, vector.body, middle.block
4083 //
4084 // middle.block:
4085 // x = v2(3)
4086 // br scalar.ph
4087 //
4088 // scalar.ph:
4089 // s_init = phi [x, middle.block], [a[-1], otherwise]
4090 // br scalar.body
4091 //
4092 // After the vector loop completes execution, we extract the next value of
4093 // the recurrence (x) to use as the initial value in the scalar loop.
4094
4095 // Get the original loop preheader and single loop latch.
4096 auto *Preheader = OrigLoop->getLoopPreheader();
4097 auto *Latch = OrigLoop->getLoopLatch();
4098
4099 // Get the initial and previous values of the scalar recurrence.
4100 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4101 auto *Previous = Phi->getIncomingValueForBlock(Latch);
4102
4103 // Create a vector from the initial value.
4104 auto *VectorInit = ScalarInit;
4105 if (VF.isVector()) {
4106 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4107 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4108 VectorInit = Builder.CreateInsertElement(
4109 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4110 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4111 }
4112
4113 VPValue *PhiDef = State.Plan->getVPValue(Phi);
4114 VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4115 // We constructed a temporary phi node in the first phase of vectorization.
4116 // This phi node will eventually be deleted.
4117 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4118
4119 // Create a phi node for the new recurrence. The current value will either be
4120 // the initial value inserted into a vector or loop-varying vector value.
4121 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4122 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4123
4124 // Get the vectorized previous value of the last part UF - 1. It appears last
4125 // among all unrolled iterations, due to the order of their construction.
4126 Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4127
4128 // Find and set the insertion point after the previous value if it is an
4129 // instruction.
4130 BasicBlock::iterator InsertPt;
4131 // Note that the previous value may have been constant-folded so it is not
4132 // guaranteed to be an instruction in the vector loop.
4133 // FIXME: Loop invariant values do not form recurrences. We should deal with
4134 // them earlier.
4135 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4136 InsertPt = LoopVectorBody->getFirstInsertionPt();
4137 else {
4138 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4139 if (isa<PHINode>(PreviousLastPart))
4140 // If the previous value is a phi node, we should insert after all the phi
4141 // nodes in the block containing the PHI to avoid breaking basic block
4142 // verification. Note that the basic block may be different to
4143 // LoopVectorBody, in case we predicate the loop.
4144 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4145 else
4146 InsertPt = ++PreviousInst->getIterator();
4147 }
4148 Builder.SetInsertPoint(&*InsertPt);
4149
4150 // We will construct a vector for the recurrence by combining the values for
4151 // the current and previous iterations. This is the required shuffle mask.
4152 assert(!VF.isScalable());
4153 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4154 ShuffleMask[0] = VF.getKnownMinValue() - 1;
4155 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4156 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
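// For instance, with VF = 4 the mask built above is <3, 4, 5, 6>: lane 0
// takes the last element of the previous iteration's vector and lanes 1-3
// take the first three elements of the current one.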
4157
4158 // The vector from which to take the initial value for the current iteration
4159 // (actual or unrolled). Initially, this is the vector phi node.
4160 Value *Incoming = VecPhi;
4161
4162 // Shuffle the current and previous vector and update the vector parts.
4163 for (unsigned Part = 0; Part < UF; ++Part) {
4164 Value *PreviousPart = State.get(PreviousDef, Part);
4165 Value *PhiPart = State.get(PhiDef, Part);
4166 auto *Shuffle =
4167 VF.isVector()
4168 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4169 : Incoming;
4170 PhiPart->replaceAllUsesWith(Shuffle);
4171 cast<Instruction>(PhiPart)->eraseFromParent();
4172 State.reset(PhiDef, Shuffle, Part);
4173 Incoming = PreviousPart;
4174 }
4175
4176 // Fix the latch value of the new recurrence in the vector loop.
4177 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4178
4179 // Extract the last vector element in the middle block. This will be the
4180 // initial value for the recurrence when jumping to the scalar loop.
4181 auto *ExtractForScalar = Incoming;
4182 if (VF.isVector()) {
4183 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4184 ExtractForScalar = Builder.CreateExtractElement(
4185 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4186 "vector.recur.extract");
4187 }
4188 // Extract the second last element in the middle block if the
4189 // Phi is used outside the loop. We need to extract the phi itself
4190 // and not the last element (the phi update in the current iteration). This
4191 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4192 // when the scalar loop is not run at all.
4193 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4194 if (VF.isVector())
4195 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4196 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4197 "vector.recur.extract.for.phi");
4198 // When the loop is unrolled without vectorizing, initialize
4199 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
4200 // `Incoming`. This is analogous to the vectorized case above: extracting the
4201 // second last element when VF > 1.
4202 else if (UF > 1)
4203 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4204
4205 // Fix the initial value of the original recurrence in the scalar loop.
4206 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4207 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4208 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4209 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4210 Start->addIncoming(Incoming, BB);
4211 }
4212
4213 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4214 Phi->setName("scalar.recur");
4215
4216 // Finally, fix users of the recurrence outside the loop. The users will need
4217 // either the last value of the scalar recurrence or the last value of the
4218 // vector recurrence we extracted in the middle block. Since the loop is in
4219 // LCSSA form, we just need to find all the phi nodes for the original scalar
4220 // recurrence in the exit block, and then add an edge for the middle block.
4221 // Note that LCSSA does not imply single entry when the original scalar loop
4222 // had multiple exiting edges (as we always run the last iteration in the
4223 // scalar epilogue); in that case, the exiting path through middle will be
4224 // dynamically dead and the value picked for the phi doesn't matter.
4225 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4226 if (any_of(LCSSAPhi.incoming_values(),
4227 [Phi](Value *V) { return V == Phi; }))
4228 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4229}
4230
4231void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4232 // Get its reduction variable descriptor.
4233 assert(Legal->isReductionVariable(Phi) &&
4234 "Unable to find the reduction variable");
4235 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4236
4237 RecurKind RK = RdxDesc.getRecurrenceKind();
4238 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4239 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4240 setDebugLocFromInst(Builder, ReductionStartValue);
4241 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4242
4243 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4244 // This is the vector-clone of the value that leaves the loop.
4245 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4246
4247 // Wrap flags are in general invalid after vectorization, clear them.
4248 clearReductionWrapFlags(RdxDesc, State);
4249
4250 // Fix the vector-loop phi.
4251
4252 // Reductions do not have to start at zero. They can start with
4253 // any loop invariant values.
4254 BasicBlock *Latch = OrigLoop->getLoopLatch();
4255 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4256
4257 for (unsigned Part = 0; Part < UF; ++Part) {
4258 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4259 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4260 cast<PHINode>(VecRdxPhi)
4261 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4262 }
4263
4264 // Before each round, move the insertion point right between
4265 // the PHIs and the values we are going to write.
4266 // This allows us to write both PHINodes and the extractelement
4267 // instructions.
4268 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4269
4270 setDebugLocFromInst(Builder, LoopExitInst);
4271
4272 // If tail is folded by masking, the vector value to leave the loop should be
4273 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4274 // instead of the former. For an inloop reduction the reduction will already
4275 // be predicated, and does not need to be handled here.
4276 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4277 for (unsigned Part = 0; Part < UF; ++Part) {
4278 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4279 Value *Sel = nullptr;
4280 for (User *U : VecLoopExitInst->users()) {
4281 if (isa<SelectInst>(U)) {
4282 assert(!Sel && "Reduction exit feeding two selects");
4283 Sel = U;
4284 } else
4285 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4286 }
4287 assert(Sel && "Reduction exit feeds no select");
4288 State.reset(LoopExitInstDef, Sel, Part);
4289
4290 // If the target can create a predicated operator for the reduction at no
4291 // extra cost in the loop (for example a predicated vadd), it can be
4292 // cheaper for the select to remain in the loop than be sunk out of it,
4293 // and so use the select value for the phi instead of the old
4294 // LoopExitValue.
4295 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4296 if (PreferPredicatedReductionSelect ||
4297 TTI->preferPredicatedReductionSelect(
4298 RdxDesc.getOpcode(), Phi->getType(),
4299 TargetTransformInfo::ReductionFlags())) {
4300 auto *VecRdxPhi =
4301 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4302 VecRdxPhi->setIncomingValueForBlock(
4303 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4304 }
4305 }
4306 }
4307
4308 // If the vector reduction can be performed in a smaller type, we truncate
4309 // then extend the loop exit value to enable InstCombine to evaluate the
4310 // entire expression in the smaller type.
4311 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4312 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4313 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4314 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4315 Builder.SetInsertPoint(
4316 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4317 VectorParts RdxParts(UF);
4318 for (unsigned Part = 0; Part < UF; ++Part) {
4319 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4320 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4321 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4322 : Builder.CreateZExt(Trunc, VecTy);
4323 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4324 UI != RdxParts[Part]->user_end();)
4325 if (*UI != Trunc) {
4326 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4327 RdxParts[Part] = Extnd;
4328 } else {
4329 ++UI;
4330 }
4331 }
4332 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4333 for (unsigned Part = 0; Part < UF; ++Part) {
4334 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4335 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4336 }
4337 }
4338
4339 // Reduce all of the unrolled parts into a single vector.
4340 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4341 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4342
4343 // The middle block terminator has already been assigned a DebugLoc here (the
4344 // OrigLoop's single latch terminator). We want the whole middle block to
4345 // appear to execute on this line because: (a) it is all compiler generated,
4346 // (b) these instructions are always executed after evaluating the latch
4347 // conditional branch, and (c) other passes may add new predecessors which
4348 // terminate on this line. This is the easiest way to ensure we don't
4349 // accidentally cause an extra step back into the loop while debugging.
4350 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4351 {
4352 // Floating-point operations should have some FMF to enable the reduction.
4353 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4354 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4355 for (unsigned Part = 1; Part < UF; ++Part) {
4356 Value *RdxPart = State.get(LoopExitInstDef, Part);
4357 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4358 ReducedPartRdx = Builder.CreateBinOp(
4359 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4360 } else {
4361 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4362 }
4363 }
4364 }
4365
4366 // Create the reduction after the loop. Note that inloop reductions create the
4367 // target reduction in the loop using a Reduction recipe.
4368 if (VF.isVector() && !IsInLoopReductionPhi) {
4369 ReducedPartRdx =
4370 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4371 // If the reduction can be performed in a smaller type, we need to extend
4372 // the reduction to the wider type before we branch to the original loop.
4373 if (Phi->getType() != RdxDesc.getRecurrenceType())
4374 ReducedPartRdx =
4375 RdxDesc.isSigned()
4376 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4377 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4378 }
4379
4380 // Create a phi node that merges control-flow from the backedge-taken check
4381 // block and the middle block.
4382 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4383 LoopScalarPreHeader->getTerminator());
4384 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4385 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4386 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4387
4388 // Now, we need to fix the users of the reduction variable
4389 // inside and outside of the scalar remainder loop.
4390
4391 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4392 // in the exit blocks. See comment on analogous loop in
4393 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4394 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4395 if (any_of(LCSSAPhi.incoming_values(),
4396 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4397 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4398
4399 // Fix the scalar loop reduction variable with the incoming reduction sum
4400 // from the vector body and from the backedge value.
4401 int IncomingEdgeBlockIdx =
4402 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4403 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4404 // Pick the other block.
4405 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4406 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4407 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4408}
4409
4410void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4411 VPTransformState &State) {
4412 RecurKind RK = RdxDesc.getRecurrenceKind();
4413 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4414 return;
4415
4416 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4417 assert(LoopExitInstr && "null loop exit instruction");
4418 SmallVector<Instruction *, 8> Worklist;
4419 SmallPtrSet<Instruction *, 8> Visited;
4420 Worklist.push_back(LoopExitInstr);
4421 Visited.insert(LoopExitInstr);
4422
4423 while (!Worklist.empty()) {
4424 Instruction *Cur = Worklist.pop_back_val();
4425 if (isa<OverflowingBinaryOperator>(Cur))
4426 for (unsigned Part = 0; Part < UF; ++Part) {
4427 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4428 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4429 }
4430
4431 for (User *U : Cur->users()) {
4432 Instruction *UI = cast<Instruction>(U);
4433 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4434 Visited.insert(UI).second)
4435 Worklist.push_back(UI);
4436 }
4437 }
4438}
4439
4440void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4441 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4442 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4443 // Some phis were already hand updated by the reduction and recurrence
4444 // code above, leave them alone.
4445 continue;
4446
4447 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4448 // Non-instruction incoming values will have only one value.
4449
4450 VPLane Lane = VPLane::getFirstLane();
4451 if (isa<Instruction>(IncomingValue) &&
4452 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4453 VF))
4454 Lane = VPLane::getLastLaneForVF(VF);
4455
4456 // Can be a loop invariant incoming value or the last scalar value to be
4457 // extracted from the vectorized loop.
4458 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4459 Value *lastIncomingValue =
4460 OrigLoop->isLoopInvariant(IncomingValue)
4461 ? IncomingValue
4462 : State.get(State.Plan->getVPValue(IncomingValue),
4463 VPIteration(UF - 1, Lane));
4464 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4465 }
4466}
4467
4468void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4469 // The basic block and loop containing the predicated instruction.
4470 auto *PredBB = PredInst->getParent();
4471 auto *VectorLoop = LI->getLoopFor(PredBB);
4472
4473 // Initialize a worklist with the operands of the predicated instruction.
4474 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4475
4476 // Holds instructions that we need to analyze again. An instruction may be
4477 // reanalyzed if we don't yet know if we can sink it or not.
4478 SmallVector<Instruction *, 8> InstsToReanalyze;
4479
4480 // Returns true if a given use occurs in the predicated block. Phi nodes use
4481 // their operands in their corresponding predecessor blocks.
4482 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4483 auto *I = cast<Instruction>(U.getUser());
4484 BasicBlock *BB = I->getParent();
4485 if (auto *Phi = dyn_cast<PHINode>(I))
4486 BB = Phi->getIncomingBlock(
4487 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4488 return BB == PredBB;
4489 };
4490
4491 // Iteratively sink the scalarized operands of the predicated instruction
4492 // into the block we created for it. When an instruction is sunk, its
4493 // operands are then added to the worklist. The algorithm ends after one pass
4494 // through the worklist doesn't sink a single instruction.
4495 bool Changed;
4496 do {
4497 // Add the instructions that need to be reanalyzed to the worklist, and
4498 // reset the changed indicator.
4499 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4500 InstsToReanalyze.clear();
4501 Changed = false;
4502
4503 while (!Worklist.empty()) {
4504 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4505
4506 // We can't sink an instruction if it is a phi node, is already in the
4507 // predicated block, is not in the loop, or may have side effects.
4508 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4509 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4510 continue;
4511
4512 // It's legal to sink the instruction if all its uses occur in the
4513 // predicated block. Otherwise, there's nothing to do yet, and we may
4514 // need to reanalyze the instruction.
4515 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4516 InstsToReanalyze.push_back(I);
4517 continue;
4518 }
4519
4520 // Move the instruction to the beginning of the predicated block, and add
4521 // its operands to the worklist.
4522 I->moveBefore(&*PredBB->getFirstInsertionPt());
4523 Worklist.insert(I->op_begin(), I->op_end());
4524
4525 // The sinking may have enabled other instructions to be sunk, so we will
4526 // need to iterate.
4527 Changed = true;
4528 }
4529 } while (Changed);
4530}
4531
4532void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4533 for (PHINode *OrigPhi : OrigPHIsToFix) {
4534 VPWidenPHIRecipe *VPPhi =
4535 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4536 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4537 // Make sure the builder has a valid insert point.
4538 Builder.SetInsertPoint(NewPhi);
4539 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4540 VPValue *Inc = VPPhi->getIncomingValue(i);
4541 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4542 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4543 }
4544 }
4545}
4546
4547void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4548 VPUser &Operands, unsigned UF,
4549 ElementCount VF, bool IsPtrLoopInvariant,
4550 SmallBitVector &IsIndexLoopInvariant,
4551 VPTransformState &State) {
4552 // Construct a vector GEP by widening the operands of the scalar GEP as
4553 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4554 // results in a vector of pointers when at least one operand of the GEP
4555 // is vector-typed. Thus, to keep the representation compact, we only use
4556 // vector-typed operands for loop-varying values.
4557
4558 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4559 // If we are vectorizing, but the GEP has only loop-invariant operands,
4560 // the GEP we build (by only using vector-typed operands for
4561 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4562 // produce a vector of pointers, we need to either arbitrarily pick an
4563 // operand to broadcast, or broadcast a clone of the original GEP.
4564 // Here, we broadcast a clone of the original.
4565 //
4566 // TODO: If at some point we decide to scalarize instructions having
4567 // loop-invariant operands, this special case will no longer be
4568 // required. We would add the scalarization decision to
4569 // collectLoopScalars() and teach getVectorValue() to broadcast
4570 // the lane-zero scalar value.
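// For example (illustrative), a GEP whose pointer and indices are all
// loop-invariant is cloned once as a scalar and then broadcast with
// CreateVectorSplat below, so every unroll part receives the same vector of
// identical pointers.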
4571 auto *Clone = Builder.Insert(GEP->clone());
4572 for (unsigned Part = 0; Part < UF; ++Part) {
4573 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4574 State.set(VPDef, EntryPart, Part);
4575 addMetadata(EntryPart, GEP);
4576 }
4577 } else {
4578 // If the GEP has at least one loop-varying operand, we are sure to
4579 // produce a vector of pointers. But if we are only unrolling, we want
4580 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4581 // produce with the code below will be scalar (if VF == 1) or vector
4582 // (otherwise). Note that for the unroll-only case, we still maintain
4583 // values in the vector mapping with initVector, as we do for other
4584 // instructions.
4585 for (unsigned Part = 0; Part < UF; ++Part) {
4586 // The pointer operand of the new GEP. If it's loop-invariant, we
4587 // won't broadcast it.
4588 auto *Ptr = IsPtrLoopInvariant
4589 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4590 : State.get(Operands.getOperand(0), Part);
4591
4592 // Collect all the indices for the new GEP. If any index is
4593 // loop-invariant, we won't broadcast it.
4594 SmallVector<Value *, 4> Indices;
4595 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4596 VPValue *Operand = Operands.getOperand(I);
4597 if (IsIndexLoopInvariant[I - 1])
4598 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4599 else
4600 Indices.push_back(State.get(Operand, Part));
4601 }
4602
4603 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4604 // but it should be a vector, otherwise.
4605 auto *NewGEP =
4606 GEP->isInBounds()
4607 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4608 Indices)
4609 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4610 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4611 "NewGEP is not a pointer vector");
4612 State.set(VPDef, NewGEP, Part);
4613 addMetadata(NewGEP, GEP);
4614 }
4615 }
4616}
4617
4618void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4619 RecurrenceDescriptor *RdxDesc,
4620 VPValue *StartVPV, VPValue *Def,
4621 VPTransformState &State) {
4622 PHINode *P = cast<PHINode>(PN);
4623 if (EnableVPlanNativePath) {
4624 // Currently we enter here in the VPlan-native path for non-induction
4625 // PHIs where all control flow is uniform. We simply widen these PHIs.
4626 // Create a vector phi with no operands - the vector phi operands will be
4627 // set at the end of vector code generation.
4628 Type *VecTy = (State.VF.isScalar())
4629 ? PN->getType()
4630 : VectorType::get(PN->getType(), State.VF);
4631 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4632 State.set(Def, VecPhi, 0);
4633 OrigPHIsToFix.push_back(P);
4634
4635 return;
4636 }
4637
4638 assert(PN->getParent() == OrigLoop->getHeader() &&
4639 "Non-header phis should have been handled elsewhere");
4640
4641 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4642 // In order to support recurrences we need to be able to vectorize Phi nodes.
4643 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4644 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4645 // this value when we vectorize all of the instructions that use the PHI.
4646 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4647 Value *Iden = nullptr;
4648 bool ScalarPHI =
4649 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4650 Type *VecTy =
4651 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4652
4653 if (RdxDesc) {
4654 assert(Legal->isReductionVariable(P) && StartV &&
4655 "RdxDesc should only be set for reduction variables; in that case "
4656 "a StartV is also required");
4657 RecurKind RK = RdxDesc->getRecurrenceKind();
4658 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4659 // MinMax reductions have the start value as their identity.
4660 if (ScalarPHI) {
4661 Iden = StartV;
4662 } else {
4663 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4664 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4665 StartV = Iden =
4666 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4667 }
4668 } else {
4669 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4670 RK, VecTy->getScalarType());
4671 Iden = IdenC;
4672
4673 if (!ScalarPHI) {
4674 Iden = ConstantVector::getSplat(State.VF, IdenC);
4675 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4676 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4677 Constant *Zero = Builder.getInt32(0);
4678 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4679 }
4680 }
4681 }
4682
4683 for (unsigned Part = 0; Part < State.UF; ++Part) {
4684 // This is phase one of vectorizing PHIs.
4685 Value *EntryPart = PHINode::Create(
4686 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4687 State.set(Def, EntryPart, Part);
4688 if (StartV) {
4689 // Make sure to add the reduction start value only to the
4690 // first unroll part.
4691 Value *StartVal = (Part == 0) ? StartV : Iden;
4692 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4693 }
4694 }
4695 return;
4696 }
4697
4698 assert(!Legal->isReductionVariable(P) &&
4699 "reductions should be handled above");
4700
4701 setDebugLocFromInst(Builder, P);
4702
4703 // This PHINode must be an induction variable.
4704 // Make sure that we know about it.
4705 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4706
4707 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4708 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4709
4710 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4711 // which can be found from the original scalar operations.
4712 switch (II.getKind()) {
4713 case InductionDescriptor::IK_NoInduction:
4714 llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4714)
;
4715 case InductionDescriptor::IK_IntInduction:
4716 case InductionDescriptor::IK_FpInduction:
4717 llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4717)
;
4718 case InductionDescriptor::IK_PtrInduction: {
4719 // Handle the pointer induction variable case.
4720 assert(P->getType()->isPointerTy() && "Unexpected type.");
4721
4722 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4723 // This is the normalized GEP that starts counting at zero.
4724 Value *PtrInd =
4725 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4726 // Determine the number of scalars we need to generate for each unroll
4727 // iteration. If the instruction is uniform, we only need to generate the
4728 // first lane. Otherwise, we generate all VF values.
4729 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4730 ? 1
4731 : State.VF.getKnownMinValue();
4732 for (unsigned Part = 0; Part < UF; ++Part) {
4733 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4734 Constant *Idx = ConstantInt::get(
4735 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4736 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4737 Value *SclrGep =
4738 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4739 SclrGep->setName("next.gep");
4740 State.set(Def, SclrGep, VPIteration(Part, Lane));
4741 }
4742 }
4743 return;
4744 }
4745 assert(isa<SCEVConstant>(II.getStep()) &&
4746 "Induction step not a SCEV constant!");
4747 Type *PhiType = II.getStep()->getType();
4748
4749 // Build a pointer phi
4750 Value *ScalarStartValue = II.getStartValue();
4751 Type *ScStValueType = ScalarStartValue->getType();
4752 PHINode *NewPointerPhi =
4753 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4754 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4755
4756 // A pointer induction, performed by using a gep
4757 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4758 Instruction *InductionLoc = LoopLatch->getTerminator();
4759 const SCEV *ScalarStep = II.getStep();
4760 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4761 Value *ScalarStepValue =
4762 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4763 Value *InductionGEP = GetElementPtrInst::Create(
4764 ScStValueType->getPointerElementType(), NewPointerPhi,
4765 Builder.CreateMul(
4766 ScalarStepValue,
4767 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4768 "ptr.ind", InductionLoc);
4769 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4770
4771 // Create UF many actual address geps that use the pointer
4772 // phi as base and a vectorized version of the step value
4773 // (<step*0, ..., step*N>) as offset.
4774 for (unsigned Part = 0; Part < State.UF; ++Part) {
4775 SmallVector<Constant *, 8> Indices;
4776 // Create a vector of consecutive numbers from zero to VF.
4777 for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i)
4778 Indices.push_back(
4779 ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue()));
4780 Constant *StartOffset = ConstantVector::get(Indices);
4781
4782 Value *GEP = Builder.CreateGEP(
4783 ScStValueType->getPointerElementType(), NewPointerPhi,
4784 Builder.CreateMul(StartOffset,
4785 Builder.CreateVectorSplat(
4786 State.VF.getKnownMinValue(), ScalarStepValue),
4787 "vector.gep"));
4788 State.set(Def, GEP, Part);
4789 }
4790 }
4791 }
4792}
4793
4794/// A helper function for checking whether an integer division-related
4795/// instruction may divide by zero (in which case it must be predicated if
4796/// executed conditionally in the scalar code).
4797/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4798/// Non-zero divisors that are non compile-time constants will not be
4799/// converted into multiplication, so we will still end up scalarizing
4800/// the division, but can do so w/o predication.
4801static bool mayDivideByZero(Instruction &I) {
4802 assert((I.getOpcode() == Instruction::UDiv ||
4803 I.getOpcode() == Instruction::SDiv ||
4804 I.getOpcode() == Instruction::URem ||
4805 I.getOpcode() == Instruction::SRem) &&
4806 "Unexpected instruction");
4807 Value *Divisor = I.getOperand(1);
4808 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4809 return !CInt || CInt->isZero();
4810}
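// For example, 'udiv i32 %x, 7' returns false here (constant non-zero
// divisor), while 'udiv i32 %x, %y' or 'urem i32 %x, 0' return true and must
// be predicated if executed conditionally.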
4811
4812void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4813 VPUser &User,
4814 VPTransformState &State) {
4815 switch (I.getOpcode()) {
4816 case Instruction::Call:
4817 case Instruction::Br:
4818 case Instruction::PHI:
4819 case Instruction::GetElementPtr:
4820 case Instruction::Select:
4821 llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4821)
;
4822 case Instruction::UDiv:
4823 case Instruction::SDiv:
4824 case Instruction::SRem:
4825 case Instruction::URem:
4826 case Instruction::Add:
4827 case Instruction::FAdd:
4828 case Instruction::Sub:
4829 case Instruction::FSub:
4830 case Instruction::FNeg:
4831 case Instruction::Mul:
4832 case Instruction::FMul:
4833 case Instruction::FDiv:
4834 case Instruction::FRem:
4835 case Instruction::Shl:
4836 case Instruction::LShr:
4837 case Instruction::AShr:
4838 case Instruction::And:
4839 case Instruction::Or:
4840 case Instruction::Xor: {
4841 // Just widen unops and binops.
4842 setDebugLocFromInst(Builder, &I);
4843
4844 for (unsigned Part = 0; Part < UF; ++Part) {
4845 SmallVector<Value *, 2> Ops;
4846 for (VPValue *VPOp : User.operands())
4847 Ops.push_back(State.get(VPOp, Part));
4848
4849 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4850
4851 if (auto *VecOp = dyn_cast<Instruction>(V))
4852 VecOp->copyIRFlags(&I);
4853
4854 // Use this vector value for all users of the original instruction.
4855 State.set(Def, V, Part);
4856 addMetadata(V, &I);
4857 }
4858
4859 break;
4860 }
4861 case Instruction::ICmp:
4862 case Instruction::FCmp: {
4863 // Widen compares. Generate vector compares.
4864 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4865 auto *Cmp = cast<CmpInst>(&I);
4866 setDebugLocFromInst(Builder, Cmp);
4867 for (unsigned Part = 0; Part < UF; ++Part) {
4868 Value *A = State.get(User.getOperand(0), Part);
4869 Value *B = State.get(User.getOperand(1), Part);
4870 Value *C = nullptr;
4871 if (FCmp) {
4872 // Propagate fast math flags.
4873 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4874 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4875 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4876 } else {
4877 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4878 }
4879 State.set(Def, C, Part);
4880 addMetadata(C, &I);
4881 }
4882
4883 break;
4884 }
4885
4886 case Instruction::ZExt:
4887 case Instruction::SExt:
4888 case Instruction::FPToUI:
4889 case Instruction::FPToSI:
4890 case Instruction::FPExt:
4891 case Instruction::PtrToInt:
4892 case Instruction::IntToPtr:
4893 case Instruction::SIToFP:
4894 case Instruction::UIToFP:
4895 case Instruction::Trunc:
4896 case Instruction::FPTrunc:
4897 case Instruction::BitCast: {
4898 auto *CI = cast<CastInst>(&I);
4899 setDebugLocFromInst(Builder, CI);
4900
4901 /// Vectorize casts.
4902 Type *DestTy =
4903 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4904
4905 for (unsigned Part = 0; Part < UF; ++Part) {
4906 Value *A = State.get(User.getOperand(0), Part);
4907 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4908 State.set(Def, Cast, Part);
4909 addMetadata(Cast, &I);
4910 }
4911 break;
4912 }
4913 default:
4914 // This instruction is not vectorized by simple widening.
4915 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4916 llvm_unreachable("Unhandled instruction!")::llvm::llvm_unreachable_internal("Unhandled instruction!", "/build/llvm-toolchain-snapshot-13~++20210307111131+ab67fd39fc14/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4916)
;
4917 } // end of switch.
4918}
4919
4920void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4921 VPUser &ArgOperands,
4922 VPTransformState &State) {
4923 assert(!isa<DbgInfoIntrinsic>(I) &&
4924 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4925 setDebugLocFromInst(Builder, &I);
4926
4927 Module *M = I.getParent()->getParent()->getParent();
4928 auto *CI = cast<CallInst>(&I);
4929
4930 SmallVector<Type *, 4> Tys;
4931 for (Value *ArgOperand : CI->arg_operands())
4932 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4933
4934 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4935
4936  // The flag indicates whether we use an intrinsic or a plain call for the
4937  // vectorized version of the instruction, i.e. whether an intrinsic call is
4938  // cheaper than a library call.
4939 bool NeedToScalarize = false;
4940 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4941 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4942 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4943  assert((UseVectorIntrinsic || !NeedToScalarize) &&
4944         "Instruction should be scalarized elsewhere.");
4945  assert(IntrinsicCost.isValid() && CallCost.isValid() &&
4946         "Cannot have invalid costs while widening");
4947
4948 for (unsigned Part = 0; Part < UF; ++Part) {
4949 SmallVector<Value *, 4> Args;
4950 for (auto &I : enumerate(ArgOperands.operands())) {
4951 // Some intrinsics have a scalar argument - don't replace it with a
4952 // vector.
4953 Value *Arg;
4954 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4955 Arg = State.get(I.value(), Part);
4956 else
4957 Arg = State.get(I.value(), VPIteration(0, 0));
4958 Args.push_back(Arg);
4959 }
4960
4961 Function *VectorF;
4962 if (UseVectorIntrinsic) {
4963 // Use vector version of the intrinsic.
4964 Type *TysForDecl[] = {CI->getType()};
4965 if (VF.isVector())
4966 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4967 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4968      assert(VectorF && "Can't retrieve vector intrinsic.");
4969 } else {
4970 // Use vector version of the function call.
4971 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4972#ifndef NDEBUG
4973      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4974             "Can't create vector function.");
4975#endif
4976 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4977 }
4978 SmallVector<OperandBundleDef, 1> OpBundles;
4979 CI->getOperandBundlesAsDefs(OpBundles);
4980 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4981
4982 if (isa<FPMathOperator>(V))
4983 V->copyFastMathFlags(CI);
4984
4985 State.set(Def, V, Part);
4986 addMetadata(V, &I);
4987 }
4988}
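
As a side note on the cost comparison above: the intrinsic-versus-library-call decision reduces to choosing whichever form is known and no more expensive. A minimal sketch of that decision logic, with a hypothetical helper name and plain unsigned costs standing in for InstructionCost (not the analyzed source):

// Hypothetical illustration of the widenCallInstruction decision above:
// prefer the vector intrinsic when its ID is known and it is no more
// expensive than the best vectorized library call; ties favour the
// intrinsic, matching "IntrinsicCost <= CallCost".
static bool shouldUseVectorIntrinsic(bool HasIntrinsicID,
                                     unsigned IntrinsicCost,
                                     unsigned LibCallCost) {
  if (!HasIntrinsicID)
    return false; // No intrinsic mapping: only the library-call path remains.
  return IntrinsicCost <= LibCallCost;
}
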
4989
4990void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4991 VPUser &Operands,
4992 bool InvariantCond,
4993 VPTransformState &State) {
4994 setDebugLocFromInst(Builder, &I);
4995
4996 // The condition can be loop invariant but still defined inside the
4997 // loop. This means that we can't just use the original 'cond' value.
4998 // We have to take the 'vectorized' value and pick the first lane.
4999 // Instcombine will make this a no-op.
5000 auto *InvarCond = InvariantCond
5001 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5002 : nullptr;
5003
5004 for (unsigned Part = 0; Part < UF; ++Part) {
5005 Value *Cond =
5006 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5007 Value *Op0 = State.get(Operands.getOperand(1), Part);
5008 Value *Op1 = State.get(Operands.getOperand(2), Part);
5009 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5010 State.set(VPDef, Sel, Part);
5011 addMetadata(Sel, &I);
5012 }
5013}
5014
5015void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5016 // We should not collect Scalars more than once per VF. Right now, this
5017 // function is called from collectUniformsAndScalars(), which already does
5018 // this check. Collecting Scalars for VF=1 does not make any sense.
5019  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5020         "This function should not be visited twice for the same VF");
5021
5022 SmallSetVector<Instruction *, 8> Worklist;
5023
5024 // These sets are used to seed the analysis with pointers used by memory
5025 // accesses that will remain scalar.
5026 SmallSetVector<Instruction *, 8> ScalarPtrs;
5027 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5028 auto *Latch = TheLoop->getLoopLatch();
5029
5030 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5031 // The pointer operands of loads and stores will be scalar as long as the
5032 // memory access is not a gather or scatter operation. The value operand of a
5033 // store will remain scalar if the store is scalarized.
5034 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5035 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5036    assert(WideningDecision != CM_Unknown &&
5037           "Widening decision should be ready at this moment");
5038 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5039 if (Ptr == Store->getValueOperand())
5040 return WideningDecision == CM_Scalarize;
5041    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5042           "Ptr is neither a value or pointer operand");
5043 return WideningDecision != CM_GatherScatter;
5044 };
5045
5046 // A helper that returns true if the given value is a bitcast or
5047 // getelementptr instruction contained in the loop.
5048 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5049 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5050 isa<GetElementPtrInst>(V)) &&
5051 !TheLoop->isLoopInvariant(V);
5052 };
5053
5054 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5055 if (!isa<PHINode>(Ptr) ||
5056 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5057 return false;
5058 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5059 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5060 return false;
5061 return isScalarUse(MemAccess, Ptr);
5062 };
5063
5064 // A helper that evaluates a memory access's use of a pointer. If the
5065  // pointer is actually the pointer induction of a loop, it is inserted
5066  // into the Worklist. If the use will be a scalar use, and the
5067 // pointer is only used by memory accesses, we place the pointer in
5068 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5069 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5070 if (isScalarPtrInduction(MemAccess, Ptr)) {
5071 Worklist.insert(cast<Instruction>(Ptr));
5072 Instruction *Update = cast<Instruction>(
5073 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5074 Worklist.insert(Update);
5075      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5076                        << "\n");
5077      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5078                        << "\n");
5079 return;
5080 }
5081 // We only care about bitcast and getelementptr instructions contained in
5082 // the loop.
5083 if (!isLoopVaryingBitCastOrGEP(Ptr))
5084 return;
5085
5086 // If the pointer has already been identified as scalar (e.g., if it was
5087 // also identified as uniform), there's nothing to do.
5088 auto *I = cast<Instruction>(Ptr);
5089 if (Worklist.count(I))
5090 return;
5091
5092 // If the use of the pointer will be a scalar use, and all users of the
5093 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5094 // place the pointer in PossibleNonScalarPtrs.
5095 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5096 return isa<LoadInst>(U) || isa<StoreInst>(U);
5097 }))
5098 ScalarPtrs.insert(I);
5099 else
5100 PossibleNonScalarPtrs.insert(I);
5101 };
5102
5103  // We seed the scalars analysis with two classes of instructions: (1)
5104 // instructions marked uniform-after-vectorization and (2) bitcast,
5105 // getelementptr and (pointer) phi instructions used by memory accesses
5106 // requiring a scalar use.
5107 //
5108 // (1) Add to the worklist all instructions that have been identified as
5109 // uniform-after-vectorization.
5110 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5111
5112 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5113 // memory accesses requiring a scalar use. The pointer operands of loads and
5114  // stores will be scalar as long as the memory access is not a gather or
5115 // scatter operation. The value operand of a store will remain scalar if the
5116 // store is scalarized.
5117 for (auto *BB : TheLoop->blocks())
5118 for (auto &I : *BB) {
5119 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5120 evaluatePtrUse(Load, Load->getPointerOperand());
5121 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5122 evaluatePtrUse(Store, Store->getPointerOperand());
5123 evaluatePtrUse(Store, Store->getValueOperand());
5124 }
5125 }
5126 for (auto *I : ScalarPtrs)
5127 if (!PossibleNonScalarPtrs.count(I)) {
5128      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5129 Worklist.insert(I);
5130 }
5131
5132 // Insert the forced scalars.
5133 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5134 // induction variable when the PHI user is scalarized.
5135 auto ForcedScalar = ForcedScalars.find(VF);
5136 if (ForcedScalar != ForcedScalars.end())
5137 for (auto *I : ForcedScalar->second)
5138 Worklist.insert(I);
5139
5140 // Expand the worklist by looking through any bitcasts and getelementptr
5141 // instructions we've already identified as scalar. This is similar to the
5142 // expansion step in collectLoopUniforms(); however, here we're only
5143 // expanding to include additional bitcasts and getelementptr instructions.
5144 unsigned Idx = 0;
5145 while (Idx != Worklist.size()) {
5146 Instruction *Dst = Worklist[Idx++];
5147 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5148 continue;
5149 auto *Src = cast<Instruction>(Dst->getOperand(0));
5150 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5151 auto *J = cast<Instruction>(U);
5152 return !TheLoop->contains(J) || Worklist.count(J) ||
5153 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5154 isScalarUse(J, Src));
5155 })) {
5156 Worklist.insert(Src);
5157      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5158 }
5159 }
5160
5161 // An induction variable will remain scalar if all users of the induction
5162 // variable and induction variable update remain scalar.
5163 for (auto &Induction : Legal->getInductionVars()) {
5164 auto *Ind = Induction.first;
5165 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5166
5167 // If tail-folding is applied, the primary induction variable will be used
5168 // to feed a vector compare.
5169 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5170 continue;
5171
5172 // Determine if all users of the induction variable are scalar after
5173 // vectorization.
5174 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5175 auto *I = cast<Instruction>(U);
5176 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5177 });
5178 if (!ScalarInd)
5179 continue;
5180
5181 // Determine if all users of the induction variable update instruction are
5182 // scalar after vectorization.
5183 auto ScalarIndUpdate =
5184 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5185 auto *I = cast<Instruction>(U);
5186 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5187 });
5188 if (!ScalarIndUpdate)
5189 continue;
5190
5191 // The induction variable and its update instruction will remain scalar.
5192 Worklist.insert(Ind);
5193 Worklist.insert(IndUpdate);
5194    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5195    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5196                      << "\n");
5197 }
5198
5199 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5200}
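
collectLoopScalars above (and collectLoopUniforms further down) grow a SetVector worklist to a fixed point: entries appended while scanning are themselves visited later, duplicates are rejected, and the visit order stays deterministic. A stripped-down, generic sketch of that pattern (illustrative only; ExpandFn and expandToFixedPoint are hypothetical names, not part of the analyzed source):

#include "llvm/ADT/SetVector.h"

// Generic fixed-point worklist expansion in the style used by
// collectLoopScalars/collectLoopUniforms: Worklist[Idx] may insert new
// members, which are then visited in turn; insert() ignores duplicates.
template <typename T, typename ExpandFn>
void expandToFixedPoint(llvm::SetVector<T *> &Worklist, ExpandFn Expand) {
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    T *Item = Worklist[Idx++];
    for (T *Next : Expand(Item)) // candidates suggested by this item
      Worklist.insert(Next);     // no-op if Next is already present
  }
}
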
5201
5202bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5203 ElementCount VF) {
5204 if (!blockNeedsPredication(I->getParent()))
5205 return false;
5206 switch(I->getOpcode()) {
5207 default:
5208 break;
5209 case Instruction::Load:
5210 case Instruction::Store: {
5211 if (!Legal->isMaskRequired(I))
5212 return false;
5213 auto *Ptr = getLoadStorePointerOperand(I);
5214 auto *Ty = getMemInstValueType(I);
5215 // We have already decided how to vectorize this instruction, get that
5216 // result.
5217 if (VF.isVector()) {
5218 InstWidening WideningDecision = getWideningDecision(I, VF);
5219      assert(WideningDecision != CM_Unknown &&
5220             "Widening decision should be ready at this moment");
5221 return WideningDecision == CM_Scalarize;
5222 }
5223 const Align Alignment = getLoadStoreAlignment(I);
5224 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5225 isLegalMaskedGather(Ty, Alignment))
5226 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5227 isLegalMaskedScatter(Ty, Alignment));
5228 }
5229 case Instruction::UDiv:
5230 case Instruction::SDiv:
5231 case Instruction::SRem:
5232 case Instruction::URem:
5233 return mayDivideByZero(*I);
5234 }
5235 return false;
5236}
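
The UDiv/SDiv/URem/SRem cases above are flagged as scalar-with-predication because a divide cannot be executed speculatively for masked-off lanes: an inactive lane may hold a zero divisor. A scalar illustration of the per-lane guard the vectorizer must preserve (plain C++ sketch, not the analyzed source):

#include <cstdint>
#include <vector>

// Only active lanes execute the division, so an inactive lane with a
// zero divisor can never trap. This is the behaviour the vectorizer
// has to preserve by predicating (scalarizing) the divide.
static void predicatedDiv(const std::vector<int32_t> &A,
                          const std::vector<int32_t> &B,
                          const std::vector<bool> &Mask,
                          std::vector<int32_t> &Out) {
  for (size_t Lane = 0; Lane < A.size(); ++Lane)
    if (Mask[Lane])
      Out[Lane] = A[Lane] / B[Lane];
}
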
5237
5238bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5239 Instruction *I, ElementCount VF) {
5240  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5241  assert(getWideningDecision(I, VF) == CM_Unknown &&
5242         "Decision should not be set yet.");
5243 auto *Group = getInterleavedAccessGroup(I);
5244  assert(Group && "Must have a group.");
5245
5246  // If the instruction's allocated size doesn't equal its type size, it
5247 // requires padding and will be scalarized.
5248 auto &DL = I->getModule()->getDataLayout();
5249 auto *ScalarTy = getMemInstValueType(I);
5250 if (hasIrregularType(ScalarTy, DL, VF))
5251 return false;
5252
5253 // Check if masking is required.
5254 // A Group may need masking for one of two reasons: it resides in a block that
5255 // needs predication, or it was decided to use masking to deal with gaps.
5256 bool PredicatedAccessRequiresMasking =
5257 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5258 bool AccessWithGapsRequiresMasking =
5259 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5260 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5261 return true;
5262
5263 // If masked interleaving is required, we expect that the user/target had
5264 // enabled it, because otherwise it either wouldn't have been created or
5265 // it should have been invalidated by the CostModel.
5266  assert(useMaskedInterleavedAccesses(TTI) &&
5267         "Masked interleave-groups for predicated accesses are not enabled.");
5268
5269 auto *Ty = getMemInstValueType(I);
5270 const Align Alignment = getLoadStoreAlignment(I);
5271 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5272 : TTI.isLegalMaskedStore(Ty, Alignment);
5273}
5274
5275bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5276 Instruction *I, ElementCount VF) {
5277 // Get and ensure we have a valid memory instruction.
5278 LoadInst *LI = dyn_cast<LoadInst>(I);
5279 StoreInst *SI = dyn_cast<StoreInst>(I);
5280  assert((LI || SI) && "Invalid memory instruction");
5281
5282 auto *Ptr = getLoadStorePointerOperand(I);
5283
5284 // In order to be widened, the pointer should be consecutive, first of all.
5285 if (!Legal->isConsecutivePtr(Ptr))
5286 return false;
5287
5288 // If the instruction is a store located in a predicated block, it will be
5289 // scalarized.
5290 if (isScalarWithPredication(I))
5291 return false;
5292
5293  // If the instruction's allocated size doesn't equal its type size, it
5294 // requires padding and will be scalarized.
5295 auto &DL = I->getModule()->getDataLayout();
5296 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5297 if (hasIrregularType(ScalarTy, DL, VF))
5298 return false;
5299
5300 return true;
5301}
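
The "allocated size doesn't equal its type size" checks above reject types that carry padding in memory, since consecutive vector lanes would then disagree with the scalar layout (i1 is the classic example: 1 bit of value, 8 bits of allocation). A rough stand-in for that comparison, assuming the usual DataLayout queries (hypothetical helper, not the analyzed source):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// A type is "irregular" for vectorization purposes if its in-memory
// allocation size (including padding) differs from its value size;
// such loads/stores are scalarized rather than widened.
static bool isIrregularForVectorization(llvm::Type *ScalarTy,
                                        const llvm::DataLayout &DL) {
  return DL.getTypeAllocSizeInBits(ScalarTy) != DL.getTypeSizeInBits(ScalarTy);
}
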
5302
5303void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5304 // We should not collect Uniforms more than once per VF. Right now,
5305 // this function is called from collectUniformsAndScalars(), which
5306 // already does this check. Collecting Uniforms for VF=1 does not make any
5307 // sense.
5308
5309  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5310         "This function should not be visited twice for the same VF");
5311
5312  // Visit the list of Uniforms. Even if we find no uniform value, we will
5313  // not analyze this VF again: Uniforms.count(VF) will return 1.
5314 Uniforms[VF].clear();
5315
5316 // We now know that the loop is vectorizable!
5317 // Collect instructions inside the loop that will remain uniform after
5318 // vectorization.
5319
5320 // Global values, params and instructions outside of current loop are out of
5321 // scope.
5322 auto isOutOfScope = [&](Value *V) -> bool {
5323 Instruction *I = dyn_cast<Instruction>(V);
5324 return (!I || !TheLoop->contains(I));
5325 };
5326
5327 SetVector<Instruction *> Worklist;
5328 BasicBlock *Latch = TheLoop->getLoopLatch();
5329
5330 // Instructions that are scalar with predication must not be considered
5331 // uniform after vectorization, because that would create an erroneous
5332 // replicating region where only a single instance out of VF should be formed.
5333 // TODO: optimize such seldom cases if found important, see PR40816.
5334 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5335 if (isOutOfScope(I)) {
5336      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5337                        << *I << "\n");
5338 return;
5339 }
5340 if (isScalarWithPredication(I, VF)) {
5341      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5342                        << *I << "\n");
5343 return;
5344 }
5345    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5346 Worklist.insert(I);
5347 };
5348
5349 // Start with the conditional branch. If the branch condition is an
5350 // instruction contained in the loop that is only used by the branch, it is
5351 // uniform.
5352 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5353 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5354 addToWorklistIfAllowed(Cmp);
5355
5356 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5357 InstWidening WideningDecision = getWideningDecision(I, VF);
5358    assert(WideningDecision != CM_Unknown &&
5359           "Widening decision should be ready at this moment");
5360
5361 // A uniform memory op is itself uniform. We exclude uniform stores
5362 // here as they demand the last lane, not the first one.
5363 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5364      assert(WideningDecision == CM_Scalarize);
5365 return true;
5366 }
5367
5368 return (WideningDecision == CM_Widen ||
5369 WideningDecision == CM_Widen_Reverse ||
5370 WideningDecision == CM_Interleave);
5371 };
5372
5373
5374 // Returns true if Ptr is the pointer operand of a memory access instruction
5375 // I, and I is known to not require scalarization.
5376 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5377 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5378 };
5379
5380 // Holds a list of values which are known to have at least one uniform use.
5381 // Note that there may be other uses which aren't uniform. A "uniform use"
5382 // here is something which only demands lane 0 of the unrolled iterations;
5383 // it does not imply that all lanes produce the same value (e.g. this is not
5384  // the usual meaning of uniform).
5385 SmallPtrSet<Value *, 8> HasUniformUse;
5386
5387 // Scan the loop for instructions which are either a) known to have only
5388 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5389 for (auto *BB : TheLoop->blocks())
5390 for (auto &I : *BB) {
5391 // If there's no pointer operand, there's nothing to do.
5392 auto *Ptr = getLoadStorePointerOperand(&I);
5393 if (!Ptr)
5394 continue;
5395
5396 // A uniform memory op is itself uniform. We exclude uniform stores
5397 // here as they demand the last lane, not the first one.
5398 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5399 addToWorklistIfAllowed(&I);
5400
5401 if (isUniformDecision(&I, VF)) {
5402      assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5403 HasUniformUse.insert(Ptr);
5404 }
5405 }
5406
5407 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5408 // demanding) users. Since loops are assumed to be in LCSSA form, this
5409 // disallows uses outside the loop as well.
5410 for (auto *V : HasUniformUse) {
5411 if (isOutOfScope(V))
5412 continue;
5413 auto *I = cast<Instruction>(V);
5414 auto UsersAreMemAccesses =
5415 llvm::all_of(I->users(), [&](User *U) -> bool {
5416 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5417 });
5418 if (UsersAreMemAccesses)
5419 addToWorklistIfAllowed(I);
5420 }
5421
5422  // Expand Worklist in topological order: whenever a new instruction
5423  // is added, its users should already be inside Worklist. This ensures
5424  // a uniform instruction will only be used by uniform instructions.
5425 unsigned idx = 0;
5426 while (idx != Worklist.size()) {
5427 Instruction *I = Worklist[idx++];
5428
5429 for (auto OV : I->operand_values()) {
5430 // isOutOfScope operands cannot be uniform instructions.
5431 if (isOutOfScope(OV))
5432 continue;
5433 // First order recurrence Phi's should typically be considered
5434 // non-uniform.
5435 auto *OP = dyn_cast<PHINode>(OV);
5436 if (OP && Legal->isFirstOrderRecurrence(OP))
5437 continue;
5438 // If all the users of the operand are uniform, then add the
5439 // operand into the uniform worklist.
5440 auto *OI = cast<Instruction>(OV);
5441 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5442 auto *J = cast<Instruction>(U);
5443 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5444 }))