Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8887, column 5
Called C++ object pointer is null
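
The analyzer emits "Called C++ object pointer is null" when a member function is invoked through a pointer that is (or may be) null on the path being analyzed. Below is a minimal, self-contained sketch of this defect class; the types and functions are hypothetical and not taken from LoopVectorize.cpp:

struct Widget {
  void frobnicate() {}
};

void example(Widget *W, bool Cond) {
  if (Cond)
    W = nullptr;     // one feasible path leaves W null
  W->frobnicate();   // the analyzer reports the null object-pointer call here
}

The report below points at an analogous call at line 8887, column 5 of LoopVectorize.cpp.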

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Transforms/Vectorize -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-06-13-111025-38230-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210613111130+5be314f79ba7/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
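// Illustrative sketch, not part of the original LoopVectorize.cpp: what the
// "wide iteration" described above means in source terms. With a vectorization
// factor of 4, the induction variable advances by the SIMD width and a scalar
// epilogue loop handles the remainder when the trip count is not a multiple of
// the vector width.
static void widenedByFourSketch(float *A, const float *B, const float *C,
                                int N) {
  int I = 0;
  for (; I + 4 <= N; I += 4)      // vector body: four elements per iteration
    for (int L = 0; L < 4; ++L)   // stands in for a single wide SIMD add
      A[I + L] = B[I + L] + C[I + L];
  for (; I < N; ++I)              // scalar epilogue (remainder) loop
    A[I] = B[I] + C[I];
}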
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpressions.h"
95#include "llvm/Analysis/TargetLibraryInfo.h"
96#include "llvm/Analysis/TargetTransformInfo.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/LLVMContext.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/InitializePasses.h"
128#include "llvm/Pass.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/LoopSimplify.h"
140#include "llvm/Transforms/Utils/LoopUtils.h"
141#include "llvm/Transforms/Utils/LoopVersioning.h"
142#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143#include "llvm/Transforms/Utils/SizeOpts.h"
144#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145#include <algorithm>
146#include <cassert>
147#include <cstdint>
148#include <cstdlib>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169const char LLVMLoopVectorizeFollowupVectorized[] =
170 "llvm.loop.vectorize.followup_vectorized";
171const char LLVMLoopVectorizeFollowupEpilogue[] =
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
179static cl::opt<bool> EnableEpilogueVectorization(
180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
183static cl::opt<unsigned> EpilogueVectorizationForceVF(
184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
189static cl::opt<unsigned> EpilogueVectorizationMinVF(
190 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
196static cl::opt<unsigned> TinyTripCountVectorThreshold(
197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
202static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
203 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks with a "
205 "vectorize(enable) pragma."));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
220static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221 "prefer-predicate-over-epilogue",
222 cl::init(PreferPredicateTy::ScalarEpilogue),
223 cl::Hidden,
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
226 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
229 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
233 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
238static cl::opt<bool> MaximizeBandwidth(
239 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
240 cl::desc("Maximize bandwidth when selecting vectorization factor which "
241 "will be determined by the smallest type in loop."));
242
243static cl::opt<bool> EnableInterleavedMemAccesses(
244 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
246
247/// An interleave-group may need masking if it resides in a block that needs
248/// predication, or in order to mask away gaps.
249static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
250 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
251 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
252
253static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
254 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
255 cl::desc("We don't interleave loops with a estimated constant trip count "
256 "below this number"));
257
258static cl::opt<unsigned> ForceTargetNumScalarRegs(
259 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of scalar registers."));
261
262static cl::opt<unsigned> ForceTargetNumVectorRegs(
263 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's number of vector registers."));
265
266static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
267 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
268 cl::desc("A flag that overrides the target's max interleave factor for "
269 "scalar loops."));
270
271static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
272 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
273 cl::desc("A flag that overrides the target's max interleave factor for "
274 "vectorized loops."));
275
276static cl::opt<unsigned> ForceTargetInstructionCost(
277 "force-target-instruction-cost", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's expected cost for "
279 "an instruction to a single constant value. Mostly "
280 "useful for getting consistent testing."));
281
282static cl::opt<bool> ForceTargetSupportsScalableVectors(
283 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
284 cl::desc(
285 "Pretend that scalable vectors are supported, even if the target does "
286 "not support them. This flag should only be used for testing."));
287
288static cl::opt<unsigned> SmallLoopCost(
289 "small-loop-cost", cl::init(20), cl::Hidden,
290 cl::desc(
291 "The cost of a loop that is considered 'small' by the interleaver."));
292
293static cl::opt<bool> LoopVectorizeWithBlockFrequency(
294 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
295 cl::desc("Enable the use of the block frequency analysis to access PGO "
296 "heuristics minimizing code growth in cold regions and being more "
297 "aggressive in hot regions."));
298
299// Runtime interleave loops for load/store throughput.
300static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
301 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
302 cl::desc(
303 "Enable runtime interleaving until load/store ports are saturated"));
304
305/// Interleave small loops with scalar reductions.
306static cl::opt<bool> InterleaveSmallLoopScalarReduction(
307 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
308 cl::desc("Enable interleaving for loops with small iteration counts that "
309 "contain scalar reductions to expose ILP."));
310
311/// The number of stores in a loop that are allowed to need predication.
312static cl::opt<unsigned> NumberOfStoresToPredicate(
313 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
314 cl::desc("Max number of stores to be predicated behind an if."));
315
316static cl::opt<bool> EnableIndVarRegisterHeur(
317 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
318 cl::desc("Count the induction variable only once when interleaving"));
319
320static cl::opt<bool> EnableCondStoresVectorization(
321 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
322 cl::desc("Enable if predication of stores during vectorization."));
323
324static cl::opt<unsigned> MaxNestedScalarReductionIC(
325 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
326 cl::desc("The maximum interleave count to use when interleaving a scalar "
327 "reduction in a nested loop."));
328
329static cl::opt<bool>
330 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
331 cl::Hidden,
332 cl::desc("Prefer in-loop vector reductions, "
333 "overriding the targets preference."));
334
335cl::opt<bool> EnableStrictReductions(
336 "enable-strict-reductions", cl::init(false), cl::Hidden,
337 cl::desc("Enable the vectorisation of loops with in-order (strict) "
338 "FP reductions"));
339
340static cl::opt<bool> PreferPredicatedReductionSelect(
341 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
342 cl::desc(
343 "Prefer predicating a reduction operation over an after loop select."));
344
345cl::opt<bool> EnableVPlanNativePath(
346 "enable-vplan-native-path", cl::init(false), cl::Hidden,
347 cl::desc("Enable VPlan-native vectorization path with "
348 "support for outer loop vectorization."));
349
350// FIXME: Remove this switch once we have divergence analysis. Currently we
351// assume divergent non-backedge branches when this switch is true.
352cl::opt<bool> EnableVPlanPredication(
353 "enable-vplan-predication", cl::init(false), cl::Hidden,
354 cl::desc("Enable VPlan-native vectorization path predicator with "
355 "support for outer loop vectorization."));
356
357// This flag enables the stress testing of the VPlan H-CFG construction in the
358// VPlan-native vectorization path. It must be used in conjuction with
359// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
360// verification of the H-CFGs built.
361static cl::opt<bool> VPlanBuildStressTest(
362 "vplan-build-stress-test", cl::init(false), cl::Hidden,
363 cl::desc(
364 "Build VPlan for every supported loop nest in the function and bail "
365 "out right after the build (stress test the VPlan H-CFG construction "
366 "in the VPlan-native vectorization path)."));
367
368cl::opt<bool> llvm::EnableLoopInterleaving(
369 "interleave-loops", cl::init(true), cl::Hidden,
370 cl::desc("Enable loop interleaving in Loop vectorization passes"));
371cl::opt<bool> llvm::EnableLoopVectorization(
372 "vectorize-loops", cl::init(true), cl::Hidden,
373 cl::desc("Run the Loop vectorization passes"));
374
375cl::opt<bool> PrintVPlansInDotFormat(
376 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
377 cl::desc("Use dot format instead of plain text when dumping VPlans"));
378
379/// A helper function that returns true if the given type is irregular. The
380/// type is irregular if its allocated size doesn't equal the store size of an
381/// element of the corresponding vector type.
382static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
383 // Determine if an array of N elements of type Ty is "bitcast compatible"
384 // with a <N x Ty> vector.
385 // This is only true if there is no padding between the array elements.
386 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
387}
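// Illustrative sketch, not part of the original file: under a typical x86-64
// data layout (the layout string below is an assumption), i32 is regular
// because its allocated and stored sizes are both 32 bits, while x86_fp80 is
// stored as 80 bits but padded to a 128-bit allocation, so an array of
// x86_fp80 is not bitcast-compatible with a <N x x86_fp80> vector.
static void hasIrregularTypeExamples() {
  LLVMContext Ctx;
  DataLayout ExampleDL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  assert(!hasIrregularType(Type::getInt32Ty(Ctx), ExampleDL) &&
         "i32: allocation size equals store size");
  assert(hasIrregularType(Type::getX86_FP80Ty(Ctx), ExampleDL) &&
         "x86_fp80: 80-bit value in a 128-bit allocation");
}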
388
389/// A helper function that returns the reciprocal of the block probability of
390/// predicated blocks. If we return X, we are assuming the predicated block
391/// will execute once for every X iterations of the loop header.
392///
393/// TODO: We should use actual block probability here, if available. Currently,
394/// we always assume predicated blocks have a 50% chance of executing.
395static unsigned getReciprocalPredBlockProb() { return 2; }
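// Worked example, not part of the original file: with the value 2 returned
// here, a predicated block is assumed to run on roughly every other iteration,
// so an instruction whose scalarized cost is C inside such a block contributes
// about C / 2 to the per-iteration cost of the loop.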
396
397/// A helper function that returns an integer or floating-point constant with
398/// value C.
399static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
400 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
401 : ConstantFP::get(Ty, C);
402}
403
404/// Returns "best known" trip count for the specified loop \p L as defined by
405/// the following procedure:
406/// 1) Returns exact trip count if it is known.
407/// 2) Returns expected trip count according to profile data if any.
408/// 3) Returns upper bound estimate if it is known.
409/// 4) Returns None if all of the above failed.
410static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
411 // Check if exact trip count is known.
412 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
413 return ExpectedTC;
414
415 // Check if there is an expected trip count available from profile data.
416 if (LoopVectorizeWithBlockFrequency)
417 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
418 return EstimatedTC;
419
420 // Check if upper bound estimate is known.
421 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
422 return ExpectedTC;
423
424 return None;
425}
426
427// Forward declare GeneratedRTChecks.
428class GeneratedRTChecks;
429
430namespace llvm {
431
432/// InnerLoopVectorizer vectorizes loops which contain only one basic
433/// block to a specified vectorization factor (VF).
434/// This class performs the widening of scalars into vectors, or multiple
435/// scalars. This class also implements the following features:
436/// * It inserts an epilogue loop for handling loops that don't have iteration
437/// counts that are known to be a multiple of the vectorization factor.
438/// * It handles the code generation for reduction variables.
439/// * Scalarization (implementation using scalars) of un-vectorizable
440/// instructions.
441/// InnerLoopVectorizer does not perform any vectorization-legality
442/// checks, and relies on the caller to check for the different legality
443/// aspects. The InnerLoopVectorizer relies on the
444/// LoopVectorizationLegality class to provide information about the induction
445/// and reduction variables that were found to a given vectorization factor.
446class InnerLoopVectorizer {
447public:
448 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
449 LoopInfo *LI, DominatorTree *DT,
450 const TargetLibraryInfo *TLI,
451 const TargetTransformInfo *TTI, AssumptionCache *AC,
452 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
453 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459 PSI(PSI), RTChecks(RTChecks) {
460 // Query this against the original loop and save it here because the profile
461 // of the original loop header may change as the transformation happens.
462 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464 }
465
466 virtual ~InnerLoopVectorizer() = default;
467
468 /// Create a new empty loop that will contain vectorized instructions later
469 /// on, while the old loop will be used as the scalar remainder. Control flow
470 /// is generated around the vectorized (and scalar epilogue) loops consisting
471 /// of various checks and bypasses. Return the pre-header block of the new
472 /// loop.
473 /// In the case of epilogue vectorization, this function is overriden to
474 /// handle the more complex control flow around the loops.
475 virtual BasicBlock *createVectorizedLoopSkeleton();
476
477 /// Widen a single instruction within the innermost loop.
478 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
479 VPTransformState &State);
480
481 /// Widen a single call instruction within the innermost loop.
482 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
483 VPTransformState &State);
484
485 /// Widen a single select instruction within the innermost loop.
486 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
487 bool InvariantCond, VPTransformState &State);
488
489 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
490 void fixVectorizedLoop(VPTransformState &State);
491
492 // Return true if any runtime check is added.
493 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
494
495 /// A type for vectorized values in the new loop. Each value from the
496 /// original loop, when vectorized, is represented by UF vector values in the
497 /// new unrolled loop, where UF is the unroll factor.
498 using VectorParts = SmallVector<Value *, 2>;
499
500 /// Vectorize a single GetElementPtrInst based on information gathered and
501 /// decisions taken during planning.
502 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
503 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
504 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
505
506 /// Vectorize a single PHINode in a block. This method handles the induction
507 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
508 /// arbitrary length vectors.
509 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
510 VPWidenPHIRecipe *PhiR, VPTransformState &State);
511
512 /// A helper function to scalarize a single Instruction in the innermost loop.
513 /// Generates a sequence of scalar instances for each lane between \p MinLane
514 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
515 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
516 /// Instr's operands.
517 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
518 const VPIteration &Instance, bool IfPredicateInstr,
519 VPTransformState &State);
520
521 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
522 /// is provided, the integer induction variable will first be truncated to
523 /// the corresponding type.
524 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
525 VPValue *Def, VPValue *CastDef,
526 VPTransformState &State);
527
528 /// Construct the vector value of a scalarized value \p V one lane at a time.
529 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
530 VPTransformState &State);
531
532 /// Try to vectorize interleaved access group \p Group with the base address
533 /// given in \p Addr, optionally masking the vector operations if \p
534 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
535 /// values in the vectorized loop.
536 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
537 ArrayRef<VPValue *> VPDefs,
538 VPTransformState &State, VPValue *Addr,
539 ArrayRef<VPValue *> StoredValues,
540 VPValue *BlockInMask = nullptr);
541
542 /// Vectorize Load and Store instructions with the base address given in \p
543 /// Addr, optionally masking the vector operations if \p BlockInMask is
544 /// non-null. Use \p State to translate given VPValues to IR values in the
545 /// vectorized loop.
546 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
547 VPValue *Def, VPValue *Addr,
548 VPValue *StoredValue, VPValue *BlockInMask);
549
550 /// Set the debug location in the builder using the debug location in
551 /// the instruction.
552 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
553
554 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
555 void fixNonInductionPHIs(VPTransformState &State);
556
557 /// Returns true if the reordering of FP operations is not allowed, but we are
558 /// able to vectorize with strict in-order reductions for the given RdxDesc.
559 bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
560
561 /// Create a broadcast instruction. This method generates a broadcast
562 /// instruction (shuffle) for loop invariant values and for the induction
563 /// value. If this is the induction variable then we extend it to N, N+1, ...
564 /// this is needed because each iteration in the loop corresponds to a SIMD
565 /// element.
566 virtual Value *getBroadcastInstrs(Value *V);
567
568protected:
569 friend class LoopVectorizationPlanner;
570
571 /// A small list of PHINodes.
572 using PhiVector = SmallVector<PHINode *, 4>;
573
574 /// A type for scalarized values in the new loop. Each value from the
575 /// original loop, when scalarized, is represented by UF x VF scalar values
576 /// in the new unrolled loop, where UF is the unroll factor and VF is the
577 /// vectorization factor.
578 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
579
580 /// Set up the values of the IVs correctly when exiting the vector loop.
581 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
582 Value *CountRoundDown, Value *EndValue,
583 BasicBlock *MiddleBlock);
584
585 /// Create a new induction variable inside L.
586 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
587 Value *Step, Instruction *DL);
588
589 /// Handle all cross-iteration phis in the header.
590 void fixCrossIterationPHIs(VPTransformState &State);
591
592 /// Fix a first-order recurrence. This is the second phase of vectorizing
593 /// this phi node.
594 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
595
596 /// Fix a reduction cross-iteration phi. This is the second phase of
597 /// vectorizing this phi node.
598 void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);
599
600 /// Clear NSW/NUW flags from reduction instructions if necessary.
601 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
602 VPTransformState &State);
603
604 /// Fixup the LCSSA phi nodes in the unique exit block. This simply
605 /// means we need to add the appropriate incoming value from the middle
606 /// block as exiting edges from the scalar epilogue loop (if present) are
607 /// already in place, and we exit the vector loop exclusively to the middle
608 /// block.
609 void fixLCSSAPHIs(VPTransformState &State);
610
611 /// Iteratively sink the scalarized operands of a predicated instruction into
612 /// the block that was created for it.
613 void sinkScalarOperands(Instruction *PredInst);
614
615 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
616 /// represented as.
617 void truncateToMinimalBitwidths(VPTransformState &State);
618
619 /// This function adds
620 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
621 /// to each vector element of Val. The sequence starts at StartIndex.
622 /// \p Opcode is relevant for FP induction variable.
623 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624 Instruction::BinaryOps Opcode =
625 Instruction::BinaryOpsEnd);
626
627 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628 /// variable on which to base the steps, \p Step is the size of the step, and
629 /// \p EntryVal is the value from the original loop that maps to the steps.
630 /// Note that \p EntryVal doesn't have to be an induction variable - it
631 /// can also be a truncate instruction.
632 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633 const InductionDescriptor &ID, VPValue *Def,
634 VPValue *CastDef, VPTransformState &State);
635
636 /// Create a vector induction phi node based on an existing scalar one. \p
637 /// EntryVal is the value from the original loop that maps to the vector phi
638 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
639 /// truncate instruction, instead of widening the original IV, we widen a
640 /// version of the IV truncated to \p EntryVal's type.
641 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
642 Value *Step, Value *Start,
643 Instruction *EntryVal, VPValue *Def,
644 VPValue *CastDef,
645 VPTransformState &State);
646
647 /// Returns true if an instruction \p I should be scalarized instead of
648 /// vectorized for the chosen vectorization factor.
649 bool shouldScalarizeInstruction(Instruction *I) const;
650
651 /// Returns true if we should generate a scalar version of \p IV.
652 bool needsScalarInduction(Instruction *IV) const;
653
654 /// If there is a cast involved in the induction variable \p ID, which should
655 /// be ignored in the vectorized loop body, this function records the
656 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
657 /// cast. We had already proved that the casted Phi is equal to the uncasted
658 /// Phi in the vectorized loop (under a runtime guard), and therefore
659 /// there is no need to vectorize the cast - the same value can be used in the
660 /// vector loop for both the Phi and the cast.
661 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
662 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
663 ///
664 /// \p EntryVal is the value from the original loop that maps to the vector
665 /// phi node and is used to distinguish what is the IV currently being
666 /// processed - original one (if \p EntryVal is a phi corresponding to the
667 /// original IV) or the "newly-created" one based on the proof mentioned above
668 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
669 /// latter case \p EntryVal is a TruncInst and we must not record anything for
670 /// that IV, but it's error-prone to expect callers of this routine to care
671 /// about that, hence this explicit parameter.
672 void recordVectorLoopValueForInductionCast(
673 const InductionDescriptor &ID, const Instruction *EntryVal,
674 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
675 unsigned Part, unsigned Lane = UINT_MAX);
676
677 /// Generate a shuffle sequence that will reverse the vector Vec.
678 virtual Value *reverseVector(Value *Vec);
679
680 /// Returns (and creates if needed) the original loop trip count.
681 Value *getOrCreateTripCount(Loop *NewLoop);
682
683 /// Returns (and creates if needed) the trip count of the widened loop.
684 Value *getOrCreateVectorTripCount(Loop *NewLoop);
685
686 /// Returns a bitcasted value to the requested vector type.
687 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
688 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
689 const DataLayout &DL);
690
691 /// Emit a bypass check to see if the vector trip count is zero, including if
692 /// it overflows.
693 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
694
695 /// Emit a bypass check to see if all of the SCEV assumptions we've
696 /// had to make are correct. Returns the block containing the checks or
697 /// nullptr if no checks have been added.
698 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
699
700 /// Emit bypass checks to check any memory assumptions we may have made.
701 /// Returns the block containing the checks or nullptr if no checks have been
702 /// added.
703 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
704
705 /// Compute the transformed value of Index at offset StartValue using step
706 /// StepValue.
707 /// For integer induction, returns StartValue + Index * StepValue.
708 /// For pointer induction, returns StartValue[Index * StepValue].
709 /// FIXME: The newly created binary instructions should contain nsw/nuw
710 /// flags, which can be found from the original scalar operations.
711 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
712 const DataLayout &DL,
713 const InductionDescriptor &ID) const;
714
715 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
716 /// vector loop preheader, middle block and scalar preheader. Also
717 /// allocate a loop object for the new vector loop and return it.
718 Loop *createVectorLoopSkeleton(StringRef Prefix);
719
720 /// Create new phi nodes for the induction variables to resume iteration count
721 /// in the scalar epilogue, from where the vectorized loop left off (given by
722 /// \p VectorTripCount).
723 /// In cases where the loop skeleton is more complicated (eg. epilogue
724 /// vectorization) and the resume values can come from an additional bypass
725 /// block, the \p AdditionalBypass pair provides information about the bypass
726 /// block and the end value on the edge from bypass to this loop.
727 void createInductionResumeValues(
728 Loop *L, Value *VectorTripCount,
729 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
730
731 /// Complete the loop skeleton by adding debug MDs, creating appropriate
732 /// conditional branches in the middle block, preparing the builder and
733 /// running the verifier. Take in the vector loop \p L as argument, and return
734 /// the preheader of the completed vector loop.
735 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
736
737 /// Add additional metadata to \p To that was not present on \p Orig.
738 ///
739 /// Currently this is used to add the noalias annotations based on the
740 /// inserted memchecks. Use this for instructions that are *cloned* into the
741 /// vector loop.
742 void addNewMetadata(Instruction *To, const Instruction *Orig);
743
744 /// Add metadata from one instruction to another.
745 ///
746 /// This includes both the original MDs from \p From and additional ones (\see
747 /// addNewMetadata). Use this for *newly created* instructions in the vector
748 /// loop.
749 void addMetadata(Instruction *To, Instruction *From);
750
751 /// Similar to the previous function but it adds the metadata to a
752 /// vector of instructions.
753 void addMetadata(ArrayRef<Value *> To, Instruction *From);
754
755 /// Allow subclasses to override and print debug traces before/after vplan
756 /// execution, when trace information is requested.
757 virtual void printDebugTracesAtStart(){};
758 virtual void printDebugTracesAtEnd(){};
759
760 /// The original loop.
761 Loop *OrigLoop;
762
763 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
764 /// dynamic knowledge to simplify SCEV expressions and converts them to a
765 /// more usable form.
766 PredicatedScalarEvolution &PSE;
767
768 /// Loop Info.
769 LoopInfo *LI;
770
771 /// Dominator Tree.
772 DominatorTree *DT;
773
774 /// Alias Analysis.
775 AAResults *AA;
776
777 /// Target Library Info.
778 const TargetLibraryInfo *TLI;
779
780 /// Target Transform Info.
781 const TargetTransformInfo *TTI;
782
783 /// Assumption Cache.
784 AssumptionCache *AC;
785
786 /// Interface to emit optimization remarks.
787 OptimizationRemarkEmitter *ORE;
788
789 /// LoopVersioning. It's only set up (non-null) if memchecks were
790 /// used.
791 ///
792 /// This is currently only used to add no-alias metadata based on the
793 /// memchecks. The actually versioning is performed manually.
794 std::unique_ptr<LoopVersioning> LVer;
795
796 /// The vectorization SIMD factor to use. Each vector will have this many
797 /// vector elements.
798 ElementCount VF;
799
800 /// The vectorization unroll factor to use. Each scalar is vectorized to this
801 /// many different vector instructions.
802 unsigned UF;
803
804 /// The builder that we use
805 IRBuilder<> Builder;
806
807 // --- Vectorization state ---
808
809 /// The vector-loop preheader.
810 BasicBlock *LoopVectorPreHeader;
811
812 /// The scalar-loop preheader.
813 BasicBlock *LoopScalarPreHeader;
814
815 /// Middle Block between the vector and the scalar.
816 BasicBlock *LoopMiddleBlock;
817
818 /// The (unique) ExitBlock of the scalar loop. Note that
819 /// there can be multiple exiting edges reaching this block.
820 BasicBlock *LoopExitBlock;
821
822 /// The vector loop body.
823 BasicBlock *LoopVectorBody;
824
825 /// The scalar loop body.
826 BasicBlock *LoopScalarBody;
827
828 /// A list of all bypass blocks. The first block is the entry of the loop.
829 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
830
831 /// The new Induction variable which was added to the new block.
832 PHINode *Induction = nullptr;
833
834 /// The induction variable of the old basic block.
835 PHINode *OldInduction = nullptr;
836
837 /// Store instructions that were predicated.
838 SmallVector<Instruction *, 4> PredicatedInstructions;
839
840 /// Trip count of the original loop.
841 Value *TripCount = nullptr;
842
843 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
844 Value *VectorTripCount = nullptr;
845
846 /// The legality analysis.
847 LoopVectorizationLegality *Legal;
848
849 /// The profitablity analysis.
850 LoopVectorizationCostModel *Cost;
851
852 // Record whether runtime checks are added.
853 bool AddedSafetyChecks = false;
854
855 // Holds the end values for each induction variable. We save the end values
856 // so we can later fix-up the external users of the induction variables.
857 DenseMap<PHINode *, Value *> IVEndValues;
858
859 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
860 // fixed up at the end of vector code generation.
861 SmallVector<PHINode *, 8> OrigPHIsToFix;
862
863 /// BFI and PSI are used to check for profile guided size optimizations.
864 BlockFrequencyInfo *BFI;
865 ProfileSummaryInfo *PSI;
866
867 // Whether this loop should be optimized for size based on profile guided size
868 // optimizatios.
869 bool OptForSizeBasedOnProfile;
870
871 /// Structure to hold information about generated runtime checks, responsible
872 /// for cleaning the checks, if vectorization turns out unprofitable.
873 GeneratedRTChecks &RTChecks;
874};
875
876class InnerLoopUnroller : public InnerLoopVectorizer {
877public:
878 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
879 LoopInfo *LI, DominatorTree *DT,
880 const TargetLibraryInfo *TLI,
881 const TargetTransformInfo *TTI, AssumptionCache *AC,
882 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
883 LoopVectorizationLegality *LVL,
884 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
885 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
886 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
887 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
888 BFI, PSI, Check) {}
889
890private:
891 Value *getBroadcastInstrs(Value *V) override;
892 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
893 Instruction::BinaryOps Opcode =
894 Instruction::BinaryOpsEnd) override;
895 Value *reverseVector(Value *Vec) override;
896};
897
898/// Encapsulate information regarding vectorization of a loop and its epilogue.
899/// This information is meant to be updated and used across two stages of
900/// epilogue vectorization.
901struct EpilogueLoopVectorizationInfo {
902 ElementCount MainLoopVF = ElementCount::getFixed(0);
903 unsigned MainLoopUF = 0;
904 ElementCount EpilogueVF = ElementCount::getFixed(0);
905 unsigned EpilogueUF = 0;
906 BasicBlock *MainLoopIterationCountCheck = nullptr;
907 BasicBlock *EpilogueIterationCountCheck = nullptr;
908 BasicBlock *SCEVSafetyCheck = nullptr;
909 BasicBlock *MemSafetyCheck = nullptr;
910 Value *TripCount = nullptr;
911 Value *VectorTripCount = nullptr;
912
913 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
914 unsigned EUF)
915 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
916 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
917 assert(EUF == 1 &&
918 "A high UF for the epilogue loop is likely not beneficial.");
919 }
920};
921
922/// An extension of the inner loop vectorizer that creates a skeleton for a
923/// vectorized loop that has its epilogue (residual) also vectorized.
924/// The idea is to run the vplan on a given loop twice, firstly to setup the
925/// skeleton and vectorize the main loop, and secondly to complete the skeleton
926/// from the first step and vectorize the epilogue. This is achieved by
927/// deriving two concrete strategy classes from this base class and invoking
928/// them in succession from the loop vectorizer planner.
929class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
930public:
931 InnerLoopAndEpilogueVectorizer(
932 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
933 DominatorTree *DT, const TargetLibraryInfo *TLI,
934 const TargetTransformInfo *TTI, AssumptionCache *AC,
935 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
936 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
937 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
938 GeneratedRTChecks &Checks)
939 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
940 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
941 Checks),
942 EPI(EPI) {}
943
944 // Override this function to handle the more complex control flow around the
945 // three loops.
946 BasicBlock *createVectorizedLoopSkeleton() final override {
947 return createEpilogueVectorizedLoopSkeleton();
948 }
949
950 /// The interface for creating a vectorized skeleton using one of two
951 /// different strategies, each corresponding to one execution of the vplan
952 /// as described above.
953 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
954
955 /// Holds and updates state information required to vectorize the main loop
956 /// and its epilogue in two separate passes. This setup helps us avoid
957 /// regenerating and recomputing runtime safety checks. It also helps us to
958 /// shorten the iteration-count-check path length for the cases where the
959 /// iteration count of the loop is so small that the main vector loop is
960 /// completely skipped.
961 EpilogueLoopVectorizationInfo &EPI;
962};
963
964/// A specialized derived class of inner loop vectorizer that performs
965/// vectorization of *main* loops in the process of vectorizing loops and their
966/// epilogues.
967class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
968public:
969 EpilogueVectorizerMainLoop(
970 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
971 DominatorTree *DT, const TargetLibraryInfo *TLI,
972 const TargetTransformInfo *TTI, AssumptionCache *AC,
973 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
974 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
975 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
976 GeneratedRTChecks &Check)
977 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
978 EPI, LVL, CM, BFI, PSI, Check) {}
979 /// Implements the interface for creating a vectorized skeleton using the
980 /// *main loop* strategy (ie the first pass of vplan execution).
981 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
982
983protected:
984 /// Emits an iteration count bypass check once for the main loop (when \p
985 /// ForEpilogue is false) and once for the epilogue loop (when \p
986 /// ForEpilogue is true).
987 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
988 bool ForEpilogue);
989 void printDebugTracesAtStart() override;
990 void printDebugTracesAtEnd() override;
991};
992
993// A specialized derived class of inner loop vectorizer that performs
994// vectorization of *epilogue* loops in the process of vectorizing loops and
995// their epilogues.
996class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
997public:
998 EpilogueVectorizerEpilogueLoop(
999 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1000 DominatorTree *DT, const TargetLibraryInfo *TLI,
1001 const TargetTransformInfo *TTI, AssumptionCache *AC,
1002 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1003 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1004 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1005 GeneratedRTChecks &Checks)
1006 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1007 EPI, LVL, CM, BFI, PSI, Checks) {}
1008 /// Implements the interface for creating a vectorized skeleton using the
1009 /// *epilogue loop* strategy (ie the second pass of vplan execution).
1010 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1011
1012protected:
1013 /// Emits an iteration count bypass check after the main vector loop has
1014 /// finished to see if there are any iterations left to execute by either
1015 /// the vector epilogue or the scalar epilogue.
1016 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1017 BasicBlock *Bypass,
1018 BasicBlock *Insert);
1019 void printDebugTracesAtStart() override;
1020 void printDebugTracesAtEnd() override;
1021};
1022} // end namespace llvm
1023
1024/// Look for a meaningful debug location on the instruction or it's
1025/// operands.
1026static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1027 if (!I)
1028 return I;
1029
1030 DebugLoc Empty;
1031 if (I->getDebugLoc() != Empty)
1032 return I;
1033
1034 for (Use &Op : I->operands()) {
1035 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1036 if (OpInst->getDebugLoc() != Empty)
1037 return OpInst;
1038 }
1039
1040 return I;
1041}
1042
1043void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1044 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1045 const DILocation *DIL = Inst->getDebugLoc();
1046
1047 // When a FSDiscriminator is enabled, we don't need to add the multiply
1048 // factors to the discriminators.
1049 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1050 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1051 // FIXME: For scalable vectors, assume vscale=1.
1052 auto NewDIL =
1053 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1054 if (NewDIL)
1055 B.SetCurrentDebugLocation(NewDIL.getValue());
1056 else
1057 LLVM_DEBUG(dbgs()
1058 << "Failed to create new discriminator: "
1059 << DIL->getFilename() << " Line: " << DIL->getLine());
1060 } else
1061 B.SetCurrentDebugLocation(DIL);
1062 } else
1063 B.SetCurrentDebugLocation(DebugLoc());
1064}
1065
1066/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1067/// is passed, the message relates to that particular instruction.
1068#ifndef NDEBUG
1069static void debugVectorizationMessage(const StringRef Prefix,
1070 const StringRef DebugMsg,
1071 Instruction *I) {
1072 dbgs() << "LV: " << Prefix << DebugMsg;
1073 if (I != nullptr)
1074 dbgs() << " " << *I;
1075 else
1076 dbgs() << '.';
1077 dbgs() << '\n';
1078}
1079#endif
1080
1081/// Create an analysis remark that explains why vectorization failed
1082///
1083/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
1084/// RemarkName is the identifier for the remark. If \p I is passed it is an
1085/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
1086/// the location of the remark. \return the remark object that can be
1087/// streamed to.
1088static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1089 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1090 Value *CodeRegion = TheLoop->getHeader();
1091 DebugLoc DL = TheLoop->getStartLoc();
1092
1093 if (I) {
1094 CodeRegion = I->getParent();
1095 // If there is no debug location attached to the instruction, revert back to
1096 // using the loop's.
1097 if (I->getDebugLoc())
1098 DL = I->getDebugLoc();
1099 }
1100
1101 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1102}
1103
1104/// Return a value for Step multiplied by VF.
1105static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1106 assert(isa<ConstantInt>(Step) && "Expected an integer step");
1107 Constant *StepVal = ConstantInt::get(
1108 Step->getType(),
1109 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1110 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1111}
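// Illustrative aside (not part of the original source): the Step * VF arithmetic
// above, restated over plain integers. For a fixed VF the result is simply
// Step * KnownMinLanes; for a scalable VF the same product is further scaled by
// the runtime vscale, modelled here as an explicit parameter. All names are
// assumptions for this sketch.
static long long sketchStepForVF(long long Step, long long KnownMinLanes,
                                 bool Scalable, long long VScale) {
  long long StepVal = Step * KnownMinLanes; // compile-time part of the step
  return Scalable ? StepVal * VScale        // scaled by vscale at runtime
                  : StepVal;                // fixed VF: the product is final
}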
1112
1113namespace llvm {
1114
1115/// Return the runtime value for VF.
1116Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1117 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1118 return VF.isScalable() ? B.CreateVScale(EC) : EC;
1119}
1120
1121void reportVectorizationFailure(const StringRef DebugMsg,
1122 const StringRef OREMsg, const StringRef ORETag,
1123 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1124 Instruction *I) {
1125 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1126 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1127 ORE->emit(
1128 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1129 << "loop not vectorized: " << OREMsg);
1130}
1131
1132void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1133 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1134 Instruction *I) {
1135 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1136 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1137 ORE->emit(
1138 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1139 << Msg);
1140}
1141
1142} // end namespace llvm
1143
1144#ifndef NDEBUG
1145/// \return string containing a file name and a line # for the given loop.
1146static std::string getDebugLocString(const Loop *L) {
1147 std::string Result;
1148 if (L) {
1149 raw_string_ostream OS(Result);
1150 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1151 LoopDbgLoc.print(OS);
1152 else
1153 // Just print the module name.
1154 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1155 OS.flush();
1156 }
1157 return Result;
1158}
1159#endif
1160
1161void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1162 const Instruction *Orig) {
1163 // If the loop was versioned with memchecks, add the corresponding no-alias
1164 // metadata.
1165 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1166 LVer->annotateInstWithNoAlias(To, Orig);
1167}
1168
1169void InnerLoopVectorizer::addMetadata(Instruction *To,
1170 Instruction *From) {
1171 propagateMetadata(To, From);
1172 addNewMetadata(To, From);
1173}
1174
1175void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1176 Instruction *From) {
1177 for (Value *V : To) {
1178 if (Instruction *I = dyn_cast<Instruction>(V))
1179 addMetadata(I, From);
1180 }
1181}
1182
1183namespace llvm {
1184
1185// Loop vectorization cost-model hints how the scalar epilogue loop should be
1186// lowered.
1187enum ScalarEpilogueLowering {
1188
1189 // The default: allowing scalar epilogues.
1190 CM_ScalarEpilogueAllowed,
1191
1192 // Vectorization with OptForSize: don't allow epilogues.
1193 CM_ScalarEpilogueNotAllowedOptSize,
1194
1195 // A special case of vectorization with OptForSize: loops with a very small
1196 // trip count are considered for vectorization under OptForSize, thereby
1197 // making sure the cost of their loop body is dominant, free of runtime
1198 // guards and scalar iteration overheads.
1199 CM_ScalarEpilogueNotAllowedLowTripLoop,
1200
1201 // Loop hint predicate indicating an epilogue is undesired.
1202 CM_ScalarEpilogueNotNeededUsePredicate,
1203
1204 // Directive indicating we must either tail fold or not vectorize
1205 CM_ScalarEpilogueNotAllowedUsePredicate
1206};
1207
1208/// ElementCountComparator creates a total ordering for ElementCount
1209/// for the purposes of using it in a set structure.
1210struct ElementCountComparator {
1211 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1212 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1213 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1214 }
1215};
1216using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
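// Illustrative aside (not part of the original source): the total order that
// ElementCountComparator induces, restated over (Scalable, MinLanes) pairs.
// Fixed VFs sort before scalable ones, and ties break on the known minimum lane
// count, e.g. 4 < 8 < (vscale x 2) < (vscale x 4). The helper is hypothetical.
static bool sketchElementCountLess(bool LHSScalable, unsigned LHSMinLanes,
                                   bool RHSScalable, unsigned RHSMinLanes) {
  return std::make_tuple(LHSScalable, LHSMinLanes) <
         std::make_tuple(RHSScalable, RHSMinLanes); // lexicographic comparison
}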
1217
1218/// LoopVectorizationCostModel - estimates the expected speedups due to
1219/// vectorization.
1220/// In many cases vectorization is not profitable. This can happen because of
1221/// a number of reasons. In this class we mainly attempt to predict the
1222/// expected speedup/slowdowns due to the supported instruction set. We use the
1223/// TargetTransformInfo to query the different backends for the cost of
1224/// different operations.
1225class LoopVectorizationCostModel {
1226public:
1227 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1228 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1229 LoopVectorizationLegality *Legal,
1230 const TargetTransformInfo &TTI,
1231 const TargetLibraryInfo *TLI, DemandedBits *DB,
1232 AssumptionCache *AC,
1233 OptimizationRemarkEmitter *ORE, const Function *F,
1234 const LoopVectorizeHints *Hints,
1235 InterleavedAccessInfo &IAI)
1236 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1237 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1238 Hints(Hints), InterleaveInfo(IAI) {}
1239
1240 /// \return An upper bound for the vectorization factors (both fixed and
1241 /// scalable). If the factors are 0, vectorization and interleaving should be
1242 /// avoided up front.
1243 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1244
1245 /// \return True if runtime checks are required for vectorization, and false
1246 /// otherwise.
1247 bool runtimeChecksRequired();
1248
1249 /// \return The most profitable vectorization factor and the cost of that VF.
1250 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1251 /// then this vectorization factor will be selected if vectorization is
1252 /// possible.
1253 VectorizationFactor
1254 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1255
1256 VectorizationFactor
1257 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1258 const LoopVectorizationPlanner &LVP);
1259
1260 /// Setup cost-based decisions for user vectorization factor.
1261 void selectUserVectorizationFactor(ElementCount UserVF) {
1262 collectUniformsAndScalars(UserVF);
1263 collectInstsToScalarize(UserVF);
1264 }
1265
1266 /// \return The size (in bits) of the smallest and widest types in the code
1267 /// that needs to be vectorized. We ignore values that remain scalar such as
1268 /// 64 bit loop indices.
1269 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1270
1271 /// \return The desired interleave count.
1272 /// If interleave count has been specified by metadata it will be returned.
1273 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1274 /// are the selected vectorization factor and the cost of the selected VF.
1275 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1276
1277 /// A memory access instruction may be vectorized in more than one way.
1278 /// The form of the instruction after vectorization depends on cost.
1279 /// This function takes cost-based decisions for Load/Store instructions
1280 /// and collects them in a map. This decision map is used for building
1281 /// the lists of loop-uniform and loop-scalar instructions.
1282 /// The calculated cost is saved with widening decision in order to
1283 /// avoid redundant calculations.
1284 void setCostBasedWideningDecision(ElementCount VF);
1285
1286 /// A struct that represents some properties of the register usage
1287 /// of a loop.
1288 struct RegisterUsage {
1289 /// Holds the number of loop invariant values that are used in the loop.
1290 /// The key is ClassID of target-provided register class.
1291 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1292 /// Holds the maximum number of concurrent live intervals in the loop.
1293 /// The key is ClassID of target-provided register class.
1294 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1295 };
1296
1297 /// \return Returns information about the register usages of the loop for the
1298 /// given vectorization factors.
1299 SmallVector<RegisterUsage, 8>
1300 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1301
1302 /// Collect values we want to ignore in the cost model.
1303 void collectValuesToIgnore();
1304
1305 /// Split reductions into those that happen in the loop, and those that happen
1306 /// outside. In-loop reductions are collected into InLoopReductionChains.
1307 void collectInLoopReductions();
1308
1309 /// Returns true if we should use strict in-order reductions for the given
1310 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1311 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1312 /// of FP operations.
1313 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1314 return EnableStrictReductions && !Hints->allowReordering() &&
1315 RdxDesc.isOrdered();
1316 }
1317
1318 /// \returns The smallest bitwidth each instruction can be represented with.
1319 /// The vector equivalents of these instructions should be truncated to this
1320 /// type.
1321 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1322 return MinBWs;
1323 }
1324
1325 /// \returns True if it is more profitable to scalarize instruction \p I for
1326 /// vectorization factor \p VF.
1327 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1328 assert(VF.isVector() &&
1329 "Profitable to scalarize relevant only for VF > 1.");
1330
1331 // Cost model is not run in the VPlan-native path - return conservative
1332 // result until this changes.
1333 if (EnableVPlanNativePath)
1334 return false;
1335
1336 auto Scalars = InstsToScalarize.find(VF);
1337 assert(Scalars != InstsToScalarize.end() &&
1338 "VF not yet analyzed for scalarization profitability");
1339 return Scalars->second.find(I) != Scalars->second.end();
1340 }
1341
1342 /// Returns true if \p I is known to be uniform after vectorization.
1343 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1344 if (VF.isScalar())
1345 return true;
1346
1347 // Cost model is not run in the VPlan-native path - return conservative
1348 // result until this changes.
1349 if (EnableVPlanNativePath)
1350 return false;
1351
1352 auto UniformsPerVF = Uniforms.find(VF);
1353 assert(UniformsPerVF != Uniforms.end() &&
1354 "VF not yet analyzed for uniformity");
1355 return UniformsPerVF->second.count(I);
1356 }
1357
1358 /// Returns true if \p I is known to be scalar after vectorization.
1359 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1360 if (VF.isScalar())
1361 return true;
1362
1363 // Cost model is not run in the VPlan-native path - return conservative
1364 // result until this changes.
1365 if (EnableVPlanNativePath)
1366 return false;
1367
1368 auto ScalarsPerVF = Scalars.find(VF);
1369 assert(ScalarsPerVF != Scalars.end() &&
1370 "Scalar values are not calculated for VF");
1371 return ScalarsPerVF->second.count(I);
1372 }
1373
1374 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1375 /// for vectorization factor \p VF.
1376 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1377 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1378 !isProfitableToScalarize(I, VF) &&
1379 !isScalarAfterVectorization(I, VF);
1380 }
1381
1382 /// Decision that was taken during cost calculation for memory instruction.
1383 enum InstWidening {
1384 CM_Unknown,
1385 CM_Widen, // For consecutive accesses with stride +1.
1386 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1387 CM_Interleave,
1388 CM_GatherScatter,
1389 CM_Scalarize
1390 };
1391
1392 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1393 /// instruction \p I and vector width \p VF.
1394 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1395 InstructionCost Cost) {
1396 assert(VF.isVector() && "Expected VF >=2");
1397 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1398 }
1399
1400 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1401 /// interleaving group \p Grp and vector width \p VF.
1402 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1403 ElementCount VF, InstWidening W,
1404 InstructionCost Cost) {
1405 assert(VF.isVector() && "Expected VF >=2");
1406 /// Broadcast this decision to all instructions inside the group.
1407 /// But the cost will be assigned to one instruction only.
1408 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1409 if (auto *I = Grp->getMember(i)) {
1410 if (Grp->getInsertPos() == I)
1411 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1412 else
1413 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1414 }
1415 }
1416 }
1417
1418 /// Return the cost model decision for the given instruction \p I and vector
1419 /// width \p VF. Return CM_Unknown if this instruction did not pass
1420 /// through the cost modeling.
1421 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1422 assert(VF.isVector() && "Expected VF to be a vector VF");
1423 // Cost model is not run in the VPlan-native path - return conservative
1424 // result until this changes.
1425 if (EnableVPlanNativePath)
1426 return CM_GatherScatter;
1427
1428 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1429 auto Itr = WideningDecisions.find(InstOnVF);
1430 if (Itr == WideningDecisions.end())
1431 return CM_Unknown;
1432 return Itr->second.first;
1433 }
1434
1435 /// Return the vectorization cost for the given instruction \p I and vector
1436 /// width \p VF.
1437 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1438 assert(VF.isVector() && "Expected VF >=2");
1439 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1440 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1441 "The cost is not calculated");
1442 return WideningDecisions[InstOnVF].second;
1443 }
1444
1445 /// Return True if instruction \p I is an optimizable truncate whose operand
1446 /// is an induction variable. Such a truncate will be removed by adding a new
1447 /// induction variable with the destination type.
1448 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1449 // If the instruction is not a truncate, return false.
1450 auto *Trunc = dyn_cast<TruncInst>(I);
1451 if (!Trunc)
1452 return false;
1453
1454 // Get the source and destination types of the truncate.
1455 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1456 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1457
1458 // If the truncate is free for the given types, return false. Replacing a
1459 // free truncate with an induction variable would add an induction variable
1460 // update instruction to each iteration of the loop. We exclude from this
1461 // check the primary induction variable since it will need an update
1462 // instruction regardless.
1463 Value *Op = Trunc->getOperand(0);
1464 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1465 return false;
1466
1467 // If the truncated value is not an induction variable, return false.
1468 return Legal->isInductionPhi(Op);
1469 }
1470
1471 /// Collects the instructions to scalarize for each predicated instruction in
1472 /// the loop.
1473 void collectInstsToScalarize(ElementCount VF);
1474
1475 /// Collect Uniform and Scalar values for the given \p VF.
1476 /// The sets depend on CM decision for Load/Store instructions
1477 /// that may be vectorized as interleave, gather-scatter or scalarized.
1478 void collectUniformsAndScalars(ElementCount VF) {
1479 // Do the analysis once.
1480 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1481 return;
1482 setCostBasedWideningDecision(VF);
1483 collectLoopUniforms(VF);
1484 collectLoopScalars(VF);
1485 }
1486
1487 /// Returns true if the target machine supports masked store operation
1488 /// for the given \p DataType and kind of access to \p Ptr.
1489 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1490 return Legal->isConsecutivePtr(Ptr) &&
1491 TTI.isLegalMaskedStore(DataType, Alignment);
1492 }
1493
1494 /// Returns true if the target machine supports masked load operation
1495 /// for the given \p DataType and kind of access to \p Ptr.
1496 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1497 return Legal->isConsecutivePtr(Ptr) &&
1498 TTI.isLegalMaskedLoad(DataType, Alignment);
1499 }
1500
1501 /// Returns true if the target machine can represent \p V as a masked gather
1502 /// or scatter operation.
1503 bool isLegalGatherOrScatter(Value *V) {
1504 bool LI = isa<LoadInst>(V);
1505 bool SI = isa<StoreInst>(V);
1506 if (!LI && !SI)
1507 return false;
1508 auto *Ty = getLoadStoreType(V);
1509 Align Align = getLoadStoreAlignment(V);
1510 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1511 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1512 }
1513
1514 /// Returns true if the target machine supports all of the reduction
1515 /// variables found for the given VF.
1516 bool canVectorizeReductions(ElementCount VF) {
1517 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1518 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1519 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1520 }));
1521 }
1522
1523 /// Returns true if \p I is an instruction that will be scalarized with
1524 /// predication. Such instructions include conditional stores and
1525 /// instructions that may divide by zero.
1526 /// If a non-zero VF has been calculated, we check if I will be scalarized
1527 /// with predication for that VF.
1528 bool isScalarWithPredication(Instruction *I) const;
1529
1530 // Returns true if \p I is an instruction that will be predicated either
1531 // through scalar predication or masked load/store or masked gather/scatter.
1532 // Superset of instructions that return true for isScalarWithPredication.
1533 bool isPredicatedInst(Instruction *I) {
1534 if (!blockNeedsPredication(I->getParent()))
1535 return false;
1536 // Loads and stores that need some form of masked operation are predicated
1537 // instructions.
1538 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1539 return Legal->isMaskRequired(I);
1540 return isScalarWithPredication(I);
1541 }
1542
1543 /// Returns true if \p I is a memory instruction with consecutive memory
1544 /// access that can be widened.
1545 bool
1546 memoryInstructionCanBeWidened(Instruction *I,
1547 ElementCount VF = ElementCount::getFixed(1));
1548
1549 /// Returns true if \p I is a memory instruction in an interleaved-group
1550 /// of memory accesses that can be vectorized with wide vector loads/stores
1551 /// and shuffles.
1552 bool
1553 interleavedAccessCanBeWidened(Instruction *I,
1554 ElementCount VF = ElementCount::getFixed(1));
1555
1556 /// Check if \p Instr belongs to any interleaved access group.
1557 bool isAccessInterleaved(Instruction *Instr) {
1558 return InterleaveInfo.isInterleaved(Instr);
1559 }
1560
1561 /// Get the interleaved access group that \p Instr belongs to.
1562 const InterleaveGroup<Instruction> *
1563 getInterleavedAccessGroup(Instruction *Instr) {
1564 return InterleaveInfo.getInterleaveGroup(Instr);
1565 }
1566
1567 /// Returns true if we're required to use a scalar epilogue for at least
1568 /// the final iteration of the original loop.
1569 bool requiresScalarEpilogue() const {
1570 if (!isScalarEpilogueAllowed())
1571 return false;
1572 // If we might exit from anywhere but the latch, must run the exiting
1573 // iteration in scalar form.
1574 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1575 return true;
1576 return InterleaveInfo.requiresScalarEpilogue();
1577 }
1578
1579 /// Returns true if a scalar epilogue is allowed, i.e. not disabled due to
1580 /// optsize or a loop hint annotation.
1581 bool isScalarEpilogueAllowed() const {
1582 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1583 }
1584
1585 /// Returns true if all loop blocks should be masked to fold tail loop.
1586 bool foldTailByMasking() const { return FoldTailByMasking; }
1587
1588 bool blockNeedsPredication(BasicBlock *BB) const {
1589 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1590 }
1591
1592 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1593 /// nodes to the chain of instructions representing the reductions. Uses a
1594 /// MapVector to ensure deterministic iteration order.
1595 using ReductionChainMap =
1596 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1597
1598 /// Return the chain of instructions representing an inloop reduction.
1599 const ReductionChainMap &getInLoopReductionChains() const {
1600 return InLoopReductionChains;
1601 }
1602
1603 /// Returns true if the Phi is part of an inloop reduction.
1604 bool isInLoopReduction(PHINode *Phi) const {
1605 return InLoopReductionChains.count(Phi);
1606 }
1607
1608 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1609 /// with factor VF. Return the cost of the instruction, including
1610 /// scalarization overhead if it's needed.
1611 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1612
1613 /// Estimate cost of a call instruction CI if it were vectorized with factor
1614 /// VF. Return the cost of the instruction, including scalarization overhead
1615 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1616 /// scalarized -
1617 /// i.e. either a vector version isn't available, or it is too expensive.
1618 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1619 bool &NeedToScalarize) const;
1620
1621 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1622 /// that of B.
1623 bool isMoreProfitable(const VectorizationFactor &A,
1624 const VectorizationFactor &B) const;
1625
1626 /// Invalidates decisions already taken by the cost model.
1627 void invalidateCostModelingDecisions() {
1628 WideningDecisions.clear();
1629 Uniforms.clear();
1630 Scalars.clear();
1631 }
1632
1633private:
1634 unsigned NumPredStores = 0;
1635
1636 /// \return An upper bound for the vectorization factors for both
1637 /// fixed and scalable vectorization, where the minimum-known number of
1638 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1639 /// disabled or unsupported, then the scalable part will be equal to
1640 /// ElementCount::getScalable(0).
1641 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1642 ElementCount UserVF);
1643
1644 /// \return the maximized element count based on the target's vector
1645 /// registers and the loop trip-count, but limited to a maximum safe VF.
1646 /// This is a helper function of computeFeasibleMaxVF.
1647 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1648 /// issue that occurred on one of the buildbots which cannot be reproduced
1649 /// without having access to the proprietary compiler (see comments on
1650 /// D98509). The issue is currently under investigation and this workaround
1651 /// will be removed as soon as possible.
1652 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1653 unsigned SmallestType,
1654 unsigned WidestType,
1655 const ElementCount &MaxSafeVF);
1656
1657 /// \return the maximum legal scalable VF, based on the safe max number
1658 /// of elements.
1659 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1660
1661 /// The vectorization cost is a combination of the cost itself and a boolean
1662 /// indicating whether any of the contributing operations will actually
1663 /// operate on
1664 /// vector values after type legalization in the backend. If this latter value
1665 /// is
1666 /// false, then all operations will be scalarized (i.e. no vectorization has
1667 /// actually taken place).
1668 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1669
1670 /// Returns the expected execution cost. The unit of the cost does
1671 /// not matter because we use the 'cost' units to compare different
1672 /// vector widths. The cost that is returned is *not* normalized by
1673 /// the factor width.
1674 VectorizationCostTy expectedCost(ElementCount VF);
1675
1676 /// Returns the execution time cost of an instruction for a given vector
1677 /// width. Vector width of one means scalar.
1678 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1679
1680 /// The cost-computation logic from getInstructionCost which provides
1681 /// the vector type as an output parameter.
1682 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1683 Type *&VectorTy);
1684
1685 /// Return the cost of instructions in an inloop reduction pattern, if I is
1686 /// part of that pattern.
1687 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1688 Type *VectorTy,
1689 TTI::TargetCostKind CostKind);
1690
1691 /// Calculate vectorization cost of memory instruction \p I.
1692 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1693
1694 /// The cost computation for scalarized memory instruction.
1695 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1696
1697 /// The cost computation for interleaving group of memory instructions.
1698 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1699
1700 /// The cost computation for Gather/Scatter instruction.
1701 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1702
1703 /// The cost computation for widening instruction \p I with consecutive
1704 /// memory access.
1705 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1706
1707 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1708 /// Load: scalar load + broadcast.
1709 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1710 /// element)
1711 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1712
1713 /// Estimate the overhead of scalarizing an instruction. This is a
1714 /// convenience wrapper for the type-based getScalarizationOverhead API.
1715 InstructionCost getScalarizationOverhead(Instruction *I,
1716 ElementCount VF) const;
1717
1718 /// Returns whether the instruction is a load or store and will be emitted
1719 /// as a vector operation.
1720 bool isConsecutiveLoadOrStore(Instruction *I);
1721
1722 /// Returns true if an artificially high cost for emulated masked memrefs
1723 /// should be used.
1724 bool useEmulatedMaskMemRefHack(Instruction *I);
1725
1726 /// Map of scalar integer values to the smallest bitwidth they can be legally
1727 /// represented as. The vector equivalents of these values should be truncated
1728 /// to this type.
1729 MapVector<Instruction *, uint64_t> MinBWs;
1730
1731 /// A type representing the costs for instructions if they were to be
1732 /// scalarized rather than vectorized. The entries are Instruction-Cost
1733 /// pairs.
1734 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1735
1736 /// A set containing all BasicBlocks that are known to be present after
1737 /// vectorization as predicated blocks.
1738 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1739
1740 /// Records whether it is allowed to have the original scalar loop execute at
1741 /// least once. This may be needed as a fallback loop in case runtime
1742 /// aliasing/dependence checks fail, or to handle the tail/remainder
1743 /// iterations when the trip count is unknown or doesn't divide by the VF,
1744 /// or as a peel-loop to handle gaps in interleave-groups.
1745 /// Under optsize and when the trip count is very small we don't allow any
1746 /// iterations to execute in the scalar loop.
1747 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1748
1749 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1750 bool FoldTailByMasking = false;
1751
1752 /// A map holding scalar costs for different vectorization factors. The
1753 /// presence of a cost for an instruction in the mapping indicates that the
1754 /// instruction will be scalarized when vectorizing with the associated
1755 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1756 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1757
1758 /// Holds the instructions known to be uniform after vectorization.
1759 /// The data is collected per VF.
1760 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1761
1762 /// Holds the instructions known to be scalar after vectorization.
1763 /// The data is collected per VF.
1764 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1765
1766 /// Holds the instructions (address computations) that are forced to be
1767 /// scalarized.
1768 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1769
1770 /// PHINodes of the reductions that should be expanded in-loop along with
1771 /// their associated chains of reduction operations, in program order from top
1772 /// (PHI) to bottom
1773 ReductionChainMap InLoopReductionChains;
1774
1775 /// A Map of inloop reduction operations and their immediate chain operand.
1776 /// FIXME: This can be removed once reductions can be costed correctly in
1777 /// vplan. This was added to allow quick lookup to the inloop operations,
1778 /// without having to loop through InLoopReductionChains.
1779 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1780
1781 /// Returns the expected difference in cost from scalarizing the expression
1782 /// feeding a predicated instruction \p PredInst. The instructions to
1783 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1784 /// non-negative return value implies the expression will be scalarized.
1785 /// Currently, only single-use chains are considered for scalarization.
1786 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1787 ElementCount VF);
1788
1789 /// Collect the instructions that are uniform after vectorization. An
1790 /// instruction is uniform if we represent it with a single scalar value in
1791 /// the vectorized loop corresponding to each vector iteration. Examples of
1792 /// uniform instructions include pointer operands of consecutive or
1793 /// interleaved memory accesses. Note that although uniformity implies an
1794 /// instruction will be scalar, the reverse is not true. In general, a
1795 /// scalarized instruction will be represented by VF scalar values in the
1796 /// vectorized loop, each corresponding to an iteration of the original
1797 /// scalar loop.
1798 void collectLoopUniforms(ElementCount VF);
1799
1800 /// Collect the instructions that are scalar after vectorization. An
1801 /// instruction is scalar if it is known to be uniform or will be scalarized
1802 /// during vectorization. Non-uniform scalarized instructions will be
1803 /// represented by VF values in the vectorized loop, each corresponding to an
1804 /// iteration of the original scalar loop.
1805 void collectLoopScalars(ElementCount VF);
1806
1807 /// Keeps cost model vectorization decision and cost for instructions.
1808 /// Right now it is used for memory instructions only.
1809 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1810 std::pair<InstWidening, InstructionCost>>;
1811
1812 DecisionList WideningDecisions;
1813
1814 /// Returns true if \p V is expected to be vectorized and it needs to be
1815 /// extracted.
1816 bool needsExtract(Value *V, ElementCount VF) const {
1817 Instruction *I = dyn_cast<Instruction>(V);
1818 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1819 TheLoop->isLoopInvariant(I))
1820 return false;
1821
1822 // Assume we can vectorize V (and hence we need extraction) if the
1823 // scalars are not computed yet. This can happen, because it is called
1824 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1825 // the scalars are collected. That should be a safe assumption in most
1826 // cases, because we check if the operands have vectorizable types
1827 // beforehand in LoopVectorizationLegality.
1828 return Scalars.find(VF) == Scalars.end() ||
1829 !isScalarAfterVectorization(I, VF);
1830 };
1831
1832 /// Returns a range containing only operands needing to be extracted.
1833 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1834 ElementCount VF) const {
1835 return SmallVector<Value *, 4>(make_filter_range(
1836 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1837 }
1838
1839 /// Determines if we have the infrastructure to vectorize loop \p L and its
1840 /// epilogue, assuming the main loop is vectorized by \p VF.
1841 bool isCandidateForEpilogueVectorization(const Loop &L,
1842 const ElementCount VF) const;
1843
1844 /// Returns true if epilogue vectorization is considered profitable, and
1845 /// false otherwise.
1846 /// \p VF is the vectorization factor chosen for the original loop.
1847 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1848
1849public:
1850 /// The loop that we evaluate.
1851 Loop *TheLoop;
1852
1853 /// Predicated scalar evolution analysis.
1854 PredicatedScalarEvolution &PSE;
1855
1856 /// Loop Info analysis.
1857 LoopInfo *LI;
1858
1859 /// Vectorization legality.
1860 LoopVectorizationLegality *Legal;
1861
1862 /// Vector target information.
1863 const TargetTransformInfo &TTI;
1864
1865 /// Target Library Info.
1866 const TargetLibraryInfo *TLI;
1867
1868 /// Demanded bits analysis.
1869 DemandedBits *DB;
1870
1871 /// Assumption cache.
1872 AssumptionCache *AC;
1873
1874 /// Interface to emit optimization remarks.
1875 OptimizationRemarkEmitter *ORE;
1876
1877 const Function *TheFunction;
1878
1879 /// Loop Vectorize Hint.
1880 const LoopVectorizeHints *Hints;
1881
1882 /// The interleave access information contains groups of interleaved accesses
1883 /// with the same stride and close to each other.
1884 InterleavedAccessInfo &InterleaveInfo;
1885
1886 /// Values to ignore in the cost model.
1887 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1888
1889 /// Values to ignore in the cost model when VF > 1.
1890 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1891
1892 /// Profitable vector factors.
1893 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1894};
1895} // end namespace llvm
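// Illustrative aside (not part of the original source): the "per-lane cost"
// comparison documented for isMoreProfitable above, restated over plain
// numbers. Comparing CostA / WidthA < CostB / WidthB is done here as a cross
// multiplication to avoid integer division; this is a sketch under that
// assumption, not the cost model's exact implementation.
static bool sketchIsMoreProfitable(unsigned CostA, unsigned WidthA,
                                   unsigned CostB, unsigned WidthB) {
  // CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
  return (unsigned long long)CostA * WidthB < (unsigned long long)CostB * WidthA;
}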
1896
1897/// Helper struct to manage generating runtime checks for vectorization.
1898///
1899 /// The runtime checks are created up-front in temporary blocks, un-linked from
1900 /// the existing IR, to allow better cost estimation. After deciding to
1901/// vectorize, the checks are moved back. If deciding not to vectorize, the
1902/// temporary blocks are completely removed.
1903class GeneratedRTChecks {
1904 /// Basic block which contains the generated SCEV checks, if any.
1905 BasicBlock *SCEVCheckBlock = nullptr;
1906
1907 /// The value representing the result of the generated SCEV checks. If it is
1908 /// nullptr, either no SCEV checks have been generated or they have been used.
1909 Value *SCEVCheckCond = nullptr;
1910
1911 /// Basic block which contains the generated memory runtime checks, if any.
1912 BasicBlock *MemCheckBlock = nullptr;
1913
1914 /// The value representing the result of the generated memory runtime checks.
1915 /// If it is nullptr, either no memory runtime checks have been generated or
1916 /// they have been used.
1917 Instruction *MemRuntimeCheckCond = nullptr;
1918
1919 DominatorTree *DT;
1920 LoopInfo *LI;
1921
1922 SCEVExpander SCEVExp;
1923 SCEVExpander MemCheckExp;
1924
1925public:
1926 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1927 const DataLayout &DL)
1928 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1929 MemCheckExp(SE, DL, "scev.check") {}
1930
1931 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1932 /// accurately estimate the cost of the runtime checks. The blocks are
1933 /// un-linked from the IR and are added back during vector code generation. If
1934 /// there is no vector code generation, the check blocks are removed
1935 /// completely.
1936 void Create(Loop *L, const LoopAccessInfo &LAI,
1937 const SCEVUnionPredicate &UnionPred) {
1938
1939 BasicBlock *LoopHeader = L->getHeader();
1940 BasicBlock *Preheader = L->getLoopPreheader();
1941
1942 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1943 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1944 // may be used by SCEVExpander. The blocks will be un-linked from their
1945 // predecessors and removed from LI & DT at the end of the function.
1946 if (!UnionPred.isAlwaysTrue()) {
1947 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1948 nullptr, "vector.scevcheck");
1949
1950 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1951 &UnionPred, SCEVCheckBlock->getTerminator());
1952 }
1953
1954 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1955 if (RtPtrChecking.Need) {
1956 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1957 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1958 "vector.memcheck");
1959
1960 std::tie(std::ignore, MemRuntimeCheckCond) =
1961 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1962 RtPtrChecking.getChecks(), MemCheckExp);
1963 assert(MemRuntimeCheckCond &&
1964 "no RT checks generated although RtPtrChecking "
1965 "claimed checks are required");
1966 }
1967
1968 if (!MemCheckBlock && !SCEVCheckBlock)
1969 return;
1970
1971 // Unhook the temporary block with the checks, update various places
1972 // accordingly.
1973 if (SCEVCheckBlock)
1974 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975 if (MemCheckBlock)
1976 MemCheckBlock->replaceAllUsesWith(Preheader);
1977
1978 if (SCEVCheckBlock) {
1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981 Preheader->getTerminator()->eraseFromParent();
1982 }
1983 if (MemCheckBlock) {
1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986 Preheader->getTerminator()->eraseFromParent();
1987 }
1988
1989 DT->changeImmediateDominator(LoopHeader, Preheader);
1990 if (MemCheckBlock) {
1991 DT->eraseNode(MemCheckBlock);
1992 LI->removeBlock(MemCheckBlock);
1993 }
1994 if (SCEVCheckBlock) {
1995 DT->eraseNode(SCEVCheckBlock);
1996 LI->removeBlock(SCEVCheckBlock);
1997 }
1998 }
1999
2000 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2001 /// unused.
2002 ~GeneratedRTChecks() {
2003 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2004 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2005 if (!SCEVCheckCond)
2006 SCEVCleaner.markResultUsed();
2007
2008 if (!MemRuntimeCheckCond)
2009 MemCheckCleaner.markResultUsed();
2010
2011 if (MemRuntimeCheckCond) {
2012 auto &SE = *MemCheckExp.getSE();
2013 // Memory runtime check generation creates compares that use expanded
2014 // values. Remove them before running the SCEVExpanderCleaners.
2015 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2016 if (MemCheckExp.isInsertedInstruction(&I))
2017 continue;
2018 SE.forgetValue(&I);
2019 SE.eraseValueFromMap(&I);
2020 I.eraseFromParent();
2021 }
2022 }
2023 MemCheckCleaner.cleanup();
2024 SCEVCleaner.cleanup();
2025
2026 if (SCEVCheckCond)
2027 SCEVCheckBlock->eraseFromParent();
2028 if (MemRuntimeCheckCond)
2029 MemCheckBlock->eraseFromParent();
2030 }
2031
2032 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2033 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2034 /// depending on the generated condition.
2035 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2036 BasicBlock *LoopVectorPreHeader,
2037 BasicBlock *LoopExitBlock) {
2038 if (!SCEVCheckCond)
2039 return nullptr;
2040 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2041 if (C->isZero())
2042 return nullptr;
2043
2044 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2045
2046 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2047 // Create new preheader for vector loop.
2048 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2049 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2050
2051 SCEVCheckBlock->getTerminator()->eraseFromParent();
2052 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2053 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2054 SCEVCheckBlock);
2055
2056 DT->addNewBlock(SCEVCheckBlock, Pred);
2057 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2058
2059 ReplaceInstWithInst(
2060 SCEVCheckBlock->getTerminator(),
2061 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2062 // Mark the check as used, to prevent it from being removed during cleanup.
2063 SCEVCheckCond = nullptr;
2064 return SCEVCheckBlock;
2065 }
2066
2067 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2068 /// the branches to branch to the vector preheader or \p Bypass, depending on
2069 /// the generated condition.
2070 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2071 BasicBlock *LoopVectorPreHeader) {
2072 // Check if we generated code that checks in runtime if arrays overlap.
2073 if (!MemRuntimeCheckCond)
2074 return nullptr;
2075
2076 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078 MemCheckBlock);
2079
2080 DT->addNewBlock(MemCheckBlock, Pred);
2081 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2082 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2083
2084 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2085 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2086
2087 ReplaceInstWithInst(
2088 MemCheckBlock->getTerminator(),
2089 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2090 MemCheckBlock->getTerminator()->setDebugLoc(
2091 Pred->getTerminator()->getDebugLoc());
2092
2093 // Mark the check as used, to prevent it from being removed during cleanup.
2094 MemRuntimeCheckCond = nullptr;
2095 return MemCheckBlock;
2096 }
2097};
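// Illustrative aside (not part of the original source): a hedged sketch of the
// intended GeneratedRTChecks lifecycle. The checks are generated eagerly so
// their cost can inform the vectorization decision; if vectorization proceeds,
// the emit* calls link the check blocks back into the CFG, otherwise the
// destructor erases the unused blocks. Everything except the class members used
// below is an assumption of this sketch.
static void sketchUseRuntimeChecks(ScalarEvolution &SE, DominatorTree *DT,
                                   LoopInfo *LI, const DataLayout &DL, Loop *L,
                                   const LoopAccessInfo &LAI,
                                   const SCEVUnionPredicate &Pred,
                                   bool WillVectorize, BasicBlock *Bypass,
                                   BasicBlock *VecPreHeader,
                                   BasicBlock *ExitBlock) {
  GeneratedRTChecks Checks(SE, DT, LI, DL);
  Checks.Create(L, LAI, Pred);       // build the check blocks, still un-linked
  if (!WillVectorize)
    return;                          // destructor removes the unused blocks
  Checks.emitSCEVChecks(L, Bypass, VecPreHeader, ExitBlock);
  Checks.emitMemRuntimeChecks(L, Bypass, VecPreHeader);
}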
2098
2099// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2100// vectorization. The loop needs to be annotated with #pragma omp simd
2101// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2102// vector length information is not provided, vectorization is not considered
2103// explicit. Interleave hints are not allowed either. These limitations will be
2104// relaxed in the future.
2105 // Please note that we are currently forced to abuse the pragma 'clang
2106// vectorize' semantics. This pragma provides *auto-vectorization hints*
2107// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2108// provides *explicit vectorization hints* (LV can bypass legal checks and
2109// assume that vectorization is legal). However, both hints are implemented
2110// using the same metadata (llvm.loop.vectorize, processed by
2111// LoopVectorizeHints). This will be fixed in the future when the native IR
2112// representation for pragma 'omp simd' is introduced.
2113static bool isExplicitVecOuterLoop(Loop *OuterLp,
2114 OptimizationRemarkEmitter *ORE) {
2115 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2116 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2117
2118 // Only outer loops with an explicit vectorization hint are supported.
2119 // Unannotated outer loops are ignored.
2120 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2121 return false;
2122
2123 Function *Fn = OuterLp->getHeader()->getParent();
2124 if (!Hints.allowVectorization(Fn, OuterLp,
2125 true /*VectorizeOnlyWhenForced*/)) {
2126 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2127 return false;
2128 }
2129
2130 if (Hints.getInterleave() > 1) {
2131 // TODO: Interleave support is future work.
2132 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2133 "outer loops.\n");
2134 Hints.emitRemarkWithHints();
2135 return false;
2136 }
2137
2138 return true;
2139}
2140
2141static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2142 OptimizationRemarkEmitter *ORE,
2143 SmallVectorImpl<Loop *> &V) {
2144 // Collect inner loops and outer loops without irreducible control flow. For
2145 // now, only collect outer loops that have explicit vectorization hints. If we
2146 // are stress testing the VPlan H-CFG construction, we collect the outermost
2147 // loop of every loop nest.
2148 if (L.isInnermost() || VPlanBuildStressTest ||
2149 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2150 LoopBlocksRPO RPOT(&L);
2151 RPOT.perform(LI);
2152 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2153 V.push_back(&L);
2154 // TODO: Collect inner loops inside marked outer loops in case
2155 // vectorization fails for the outer loop. Do not invoke
2156 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2157 // already known to be reducible. We can use an inherited attribute for
2158 // that.
2159 return;
2160 }
2161 }
2162 for (Loop *InnerL : L)
2163 collectSupportedLoops(*InnerL, LI, ORE, V);
2164}
2165
2166namespace {
2167
2168/// The LoopVectorize Pass.
2169struct LoopVectorize : public FunctionPass {
2170 /// Pass identification, replacement for typeid
2171 static char ID;
2172
2173 LoopVectorizePass Impl;
2174
2175 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2176 bool VectorizeOnlyWhenForced = false)
2177 : FunctionPass(ID),
2178 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2179 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2180 }
2181
2182 bool runOnFunction(Function &F) override {
2183 if (skipFunction(F))
2184 return false;
2185
2186 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2187 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2188 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2189 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2190 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2191 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2192 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2193 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2194 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2195 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2196 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2197 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2198 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2199
2200 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2201 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2202
2203 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2204 GetLAA, *ORE, PSI).MadeAnyChange;
2205 }
2206
2207 void getAnalysisUsage(AnalysisUsage &AU) const override {
2208 AU.addRequired<AssumptionCacheTracker>();
2209 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2210 AU.addRequired<DominatorTreeWrapperPass>();
2211 AU.addRequired<LoopInfoWrapperPass>();
2212 AU.addRequired<ScalarEvolutionWrapperPass>();
2213 AU.addRequired<TargetTransformInfoWrapperPass>();
2214 AU.addRequired<AAResultsWrapperPass>();
2215 AU.addRequired<LoopAccessLegacyAnalysis>();
2216 AU.addRequired<DemandedBitsWrapperPass>();
2217 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2218 AU.addRequired<InjectTLIMappingsLegacy>();
2219
2220 // We currently do not preserve loopinfo/dominator analyses with outer loop
2221 // vectorization. Until this is addressed, mark these analyses as preserved
2222 // only for non-VPlan-native path.
2223 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2224 if (!EnableVPlanNativePath) {
2225 AU.addPreserved<LoopInfoWrapperPass>();
2226 AU.addPreserved<DominatorTreeWrapperPass>();
2227 }
2228
2229 AU.addPreserved<BasicAAWrapperPass>();
2230 AU.addPreserved<GlobalsAAWrapperPass>();
2231 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2232 }
2233};
2234
2235} // end anonymous namespace
2236
2237//===----------------------------------------------------------------------===//
2238// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2239// LoopVectorizationCostModel and LoopVectorizationPlanner.
2240//===----------------------------------------------------------------------===//
2241
2242Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2243 // We need to place the broadcast of invariant variables outside the loop,
2245 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2246 // inside the vector loop body.
2246 Instruction *Instr = dyn_cast<Instruction>(V);
2247 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2248 (!Instr ||
2249 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2250 // Place the code for broadcasting invariant variables in the new preheader.
2251 IRBuilder<>::InsertPointGuard Guard(Builder);
2252 if (SafeToHoist)
2253 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2254
2255 // Broadcast the scalar into all locations in the vector.
2256 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2257
2258 return Shuf;
2259}
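
For a fixed-width VF, the splat created above is just a lane-wise copy of the invariant scalar. A minimal standalone sketch of that semantics (illustrative only, not part of LoopVectorize.cpp; the helper name is hypothetical):

#include <array>
// Hypothetical helper mirroring the effect of CreateVectorSplat for a fixed VF.
template <unsigned VF, typename T>
std::array<T, VF> broadcastScalar(T Scalar) {
  std::array<T, VF> Vec;
  Vec.fill(Scalar); // every lane receives the same invariant value
  return Vec;
}
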
2260
2261void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2262 const InductionDescriptor &II, Value *Step, Value *Start,
2263 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2264 VPTransformState &State) {
2265 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2266        "Expected either an induction phi-node or a truncate of it!");
2267
2268 // Construct the initial value of the vector IV in the vector loop preheader.
2269 auto CurrIP = Builder.saveIP();
2270 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2271 if (isa<TruncInst>(EntryVal)) {
2272   assert(Start->getType()->isIntegerTy() &&
2273          "Truncation requires an integer type");
2274 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2275 Step = Builder.CreateTrunc(Step, TruncType);
2276 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2277 }
2278 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2279 Value *SteppedStart =
2280 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2281
2282 // We create vector phi nodes for both integer and floating-point induction
2283 // variables. Here, we determine the kind of arithmetic we will perform.
2284 Instruction::BinaryOps AddOp;
2285 Instruction::BinaryOps MulOp;
2286 if (Step->getType()->isIntegerTy()) {
2287 AddOp = Instruction::Add;
2288 MulOp = Instruction::Mul;
2289 } else {
2290 AddOp = II.getInductionOpcode();
2291 MulOp = Instruction::FMul;
2292 }
2293
2294 // Multiply the vectorization factor by the step using integer or
2295 // floating-point arithmetic as appropriate.
2296 Type *StepType = Step->getType();
2297 if (Step->getType()->isFloatingPointTy())
2298 StepType = IntegerType::get(StepType->getContext(),
2299 StepType->getScalarSizeInBits());
2300 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2301 if (Step->getType()->isFloatingPointTy())
2302 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2303 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2304
2305 // Create a vector splat to use in the induction update.
2306 //
2307 // FIXME: If the step is non-constant, we create the vector splat with
2308 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2309 // handle a constant vector splat.
2310 Value *SplatVF = isa<Constant>(Mul)
2311 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2312 : Builder.CreateVectorSplat(VF, Mul);
2313 Builder.restoreIP(CurrIP);
2314
2315 // We may need to add the step a number of times, depending on the unroll
2316 // factor. The last of those goes into the PHI.
2317 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2318 &*LoopVectorBody->getFirstInsertionPt());
2319 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2320 Instruction *LastInduction = VecInd;
2321 for (unsigned Part = 0; Part < UF; ++Part) {
2322 State.set(Def, LastInduction, Part);
2323
2324 if (isa<TruncInst>(EntryVal))
2325 addMetadata(LastInduction, EntryVal);
2326 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2327 State, Part);
2328
2329 LastInduction = cast<Instruction>(
2330 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2331 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2332 }
2333
2334 // Move the last step to the end of the latch block. This ensures consistent
2335 // placement of all induction updates.
2336 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2337 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2338 auto *ICmp = cast<Instruction>(Br->getCondition());
2339 LastInduction->moveBefore(ICmp);
2340 LastInduction->setName("vec.ind.next");
2341
2342 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2343 VecInd->addIncoming(LastInduction, LoopVectorLatch);
2344}
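
A rough sketch of the lane values the widened induction above produces, assuming an integer IV with a fixed-width VF and unroll factor UF (illustrative only; the function name is hypothetical):

// Part 0 of each vector iteration is the phi ("vec.ind"); each later part adds
// VF * Step ("step.add"), and the latch increment advances the phi by
// UF * VF * Step per iteration.
long widenedIVLane(long Start, long Step, unsigned VF, unsigned UF,
                   unsigned Iteration, unsigned Part, unsigned Lane) {
  return Start + (long)(Iteration * UF * VF + Part * VF + Lane) * Step;
}
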
2345
2346bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2347 return Cost->isScalarAfterVectorization(I, VF) ||
2348 Cost->isProfitableToScalarize(I, VF);
2349}
2350
2351bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2352 if (shouldScalarizeInstruction(IV))
2353 return true;
2354 auto isScalarInst = [&](User *U) -> bool {
2355 auto *I = cast<Instruction>(U);
2356 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2357 };
2358 return llvm::any_of(IV->users(), isScalarInst);
2359}
2360
2361void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2362 const InductionDescriptor &ID, const Instruction *EntryVal,
2363 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2364 unsigned Part, unsigned Lane) {
2365 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2366        "Expected either an induction phi-node or a truncate of it!");
2367
2368 // This induction variable is not the phi from the original loop but a
2369 // newly-created IV, based on the proof that the casted Phi is equal to the
2370 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2371 // re-uses the same InductionDescriptor as the original IV, but we don't
2372 // have to do any recording in this case - that is done when the original IV
2373 // is processed.
2374 if (isa<TruncInst>(EntryVal))
2375 return;
2376
2377 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2378 if (Casts.empty())
2379 return;
2380 // Only the first Cast instruction in the Casts vector is of interest.
2381 // The rest of the Casts (if they exist) have no uses outside the
2382 // induction update chain itself.
2383 if (Lane < UINT_MAX)
2384 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2385 else
2386 State.set(CastDef, VectorLoopVal, Part);
2387}
2388
2389void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2390 TruncInst *Trunc, VPValue *Def,
2391 VPValue *CastDef,
2392 VPTransformState &State) {
2393 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2394        "Primary induction variable must have an integer type");
2395
2396 auto II = Legal->getInductionVars().find(IV);
2397 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2398
2399 auto ID = II->second;
2400 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2401
2402 // The value from the original loop to which we are mapping the new induction
2403 // variable.
2404 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2405
2406 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2407
2408 // Generate code for the induction step. Note that induction steps are
2409 // required to be loop-invariant.
2410 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2411   assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2412          "Induction step should be loop invariant");
2413 if (PSE.getSE()->isSCEVable(IV->getType())) {
2414 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2415 return Exp.expandCodeFor(Step, Step->getType(),
2416 LoopVectorPreHeader->getTerminator());
2417 }
2418 return cast<SCEVUnknown>(Step)->getValue();
2419 };
2420
2421 // The scalar value to broadcast. This is derived from the canonical
2422 // induction variable. If a truncation type is given, truncate the canonical
2423 // induction variable and step. Otherwise, derive these values from the
2424 // induction descriptor.
2425 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2426 Value *ScalarIV = Induction;
2427 if (IV != OldInduction) {
2428 ScalarIV = IV->getType()->isIntegerTy()
2429 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2430 : Builder.CreateCast(Instruction::SIToFP, Induction,
2431 IV->getType());
2432 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2433 ScalarIV->setName("offset.idx");
2434 }
2435 if (Trunc) {
2436 auto *TruncType = cast<IntegerType>(Trunc->getType());
2437     assert(Step->getType()->isIntegerTy() &&
2438            "Truncation requires an integer step");
2439 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2440 Step = Builder.CreateTrunc(Step, TruncType);
2441 }
2442 return ScalarIV;
2443 };
2444
2445 // Create the vector values from the scalar IV, in the absence of creating a
2446 // vector IV.
2447 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2448 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2449 for (unsigned Part = 0; Part < UF; ++Part) {
2450     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2451 Value *EntryPart =
2452 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2453 ID.getInductionOpcode());
2454 State.set(Def, EntryPart, Part);
2455 if (Trunc)
2456 addMetadata(EntryPart, Trunc);
2457 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2458 State, Part);
2459 }
2460 };
2461
2462 // Fast-math-flags propagate from the original induction instruction.
2463 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2464 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2465 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2466
2467 // Now do the actual transformations, and start with creating the step value.
2468 Value *Step = CreateStepValue(ID.getStep());
2469 if (VF.isZero() || VF.isScalar()) {
2470 Value *ScalarIV = CreateScalarIV(Step);
2471 CreateSplatIV(ScalarIV, Step);
2472 return;
2473 }
2474
2475 // Determine if we want a scalar version of the induction variable. This is
2476 // true if the induction variable itself is not widened, or if it has at
2477 // least one user in the loop that is not widened.
2478 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2479 if (!NeedsScalarIV) {
2480 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2481 State);
2482 return;
2483 }
2484
2485 // Try to create a new independent vector induction variable. If we can't
2486 // create the phi node, we will splat the scalar induction variable in each
2487 // loop iteration.
2488 if (!shouldScalarizeInstruction(EntryVal)) {
2489 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2490 State);
2491 Value *ScalarIV = CreateScalarIV(Step);
2492 // Create scalar steps that can be used by instructions we will later
2493 // scalarize. Note that the addition of the scalar steps will not increase
2494 // the number of instructions in the loop in the common case prior to
2495 // InstCombine. We will be trading one vector extract for each scalar step.
2496 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2497 return;
2498 }
2499
2500 // All IV users are scalar instructions, so only emit a scalar IV, not a
2501 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2502 // predicate used by the masked loads/stores.
2503 Value *ScalarIV = CreateScalarIV(Step);
2504 if (!Cost->isScalarEpilogueAllowed())
2505 CreateSplatIV(ScalarIV, Step);
2506 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2507}
2508
2509Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2510 Instruction::BinaryOps BinOp) {
2511 // Create and check the types.
2512 auto *ValVTy = cast<VectorType>(Val->getType());
2513 ElementCount VLen = ValVTy->getElementCount();
2514
2515 Type *STy = Val->getType()->getScalarType();
2516 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2517        "Induction Step must be an integer or FP");
2518 assert(Step->getType() == STy && "Step has wrong type");
2519
2520 SmallVector<Constant *, 8> Indices;
2521
2522 // Create a vector of consecutive numbers from zero to VF.
2523 VectorType *InitVecValVTy = ValVTy;
2524 Type *InitVecValSTy = STy;
2525 if (STy->isFloatingPointTy()) {
2526 InitVecValSTy =
2527 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2528 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2529 }
2530 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2531
2532 // Add on StartIdx
2533 Value *StartIdxSplat = Builder.CreateVectorSplat(
2534 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2535 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2536
2537 if (STy->isIntegerTy()) {
2538 Step = Builder.CreateVectorSplat(VLen, Step);
2539   assert(Step->getType() == Val->getType() && "Invalid step vec");
2540 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2541 // which can be found from the original scalar operations.
2542 Step = Builder.CreateMul(InitVec, Step);
2543 return Builder.CreateAdd(Val, Step, "induction");
2544 }
2545
2546 // Floating point induction.
2547 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2548        "Binary Opcode should be specified for FP induction");
2549 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2550 Step = Builder.CreateVectorSplat(VLen, Step);
2551 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2552 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2553}
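
For the integer path above, lane i of the result is Val[i] + (StartIdx + i) * Step; the FP path uses the induction opcode instead of the add. A scalar sketch of the integer case (illustrative only, fixed-width vectors; the function name is hypothetical):

#include <vector>
std::vector<long> stepVector(const std::vector<long> &Val, int StartIdx,
                             long Step) {
  std::vector<long> Result(Val.size());
  for (size_t I = 0; I < Val.size(); ++I)
    Result[I] = Val[I] + (StartIdx + (long)I) * Step; // lane-wise Val + idx*Step
  return Result;
}
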
2554
2555void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2556 Instruction *EntryVal,
2557 const InductionDescriptor &ID,
2558 VPValue *Def, VPValue *CastDef,
2559 VPTransformState &State) {
2560 // We shouldn't have to build scalar steps if we aren't vectorizing.
2561 assert(VF.isVector() && "VF should be greater than one");
2562 // Get the value type and ensure it and the step have the same integer type.
2563 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2564 assert(ScalarIVTy == Step->getType() &&
2565        "Val and Step should have the same type");
2566
2567 // We build scalar steps for both integer and floating-point induction
2568 // variables. Here, we determine the kind of arithmetic we will perform.
2569 Instruction::BinaryOps AddOp;
2570 Instruction::BinaryOps MulOp;
2571 if (ScalarIVTy->isIntegerTy()) {
2572 AddOp = Instruction::Add;
2573 MulOp = Instruction::Mul;
2574 } else {
2575 AddOp = ID.getInductionOpcode();
2576 MulOp = Instruction::FMul;
2577 }
2578
2579 // Determine the number of scalars we need to generate for each unroll
2580 // iteration. If EntryVal is uniform, we only need to generate the first
2581 // lane. Otherwise, we generate all VF values.
2582 bool IsUniform =
2583 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2584 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2585 // Compute the scalar steps and save the results in State.
2586 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2587 ScalarIVTy->getScalarSizeInBits());
2588 Type *VecIVTy = nullptr;
2589 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2590 if (!IsUniform && VF.isScalable()) {
2591 VecIVTy = VectorType::get(ScalarIVTy, VF);
2592 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2593 SplatStep = Builder.CreateVectorSplat(VF, Step);
2594 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2595 }
2596
2597 for (unsigned Part = 0; Part < UF; ++Part) {
2598 Value *StartIdx0 =
2599 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2600
2601 if (!IsUniform && VF.isScalable()) {
2602 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2603 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2604 if (ScalarIVTy->isFloatingPointTy())
2605 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2606 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2607 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2608 State.set(Def, Add, Part);
2609 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2610 Part);
2611 // It's useful to record the lane values too for the known minimum number
2612 // of elements so we do those below. This improves the code quality when
2613 // trying to extract the first element, for example.
2614 }
2615
2616 if (ScalarIVTy->isFloatingPointTy())
2617 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2618
2619 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2620 Value *StartIdx = Builder.CreateBinOp(
2621 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2622 // The step returned by `createStepForVF` is a runtime-evaluated value
2623 // when VF is scalable. Otherwise, it should be folded into a Constant.
2624     assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2625            "Expected StartIdx to be folded to a constant when VF is not "
2626            "scalable");
2627 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2628 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2629 State.set(Def, Add, VPIteration(Part, Lane));
2630 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2631 Part, Lane);
2632 }
2633 }
2634}
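
For a fixed-width VF, the scalar step stored above for unroll part Part and lane Lane evaluates to ScalarIV + (Part * VF + Lane) * Step; a one-line sketch of the integer case (illustrative only; the function name is hypothetical):

long scalarStep(long ScalarIV, long Step, unsigned VF, unsigned Part,
                unsigned Lane) {
  // StartIdx0 = Part * VF for a fixed-width VF, then each lane adds Lane.
  return ScalarIV + (long)(Part * VF + Lane) * Step;
}
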
2635
2636void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2637 const VPIteration &Instance,
2638 VPTransformState &State) {
2639 Value *ScalarInst = State.get(Def, Instance);
2640 Value *VectorValue = State.get(Def, Instance.Part);
2641 VectorValue = Builder.CreateInsertElement(
2642 VectorValue, ScalarInst,
2643 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2644 State.set(Def, VectorValue, Instance.Part);
2645}
2646
2647Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2648 assert(Vec->getType()->isVectorTy() && "Invalid type");
2649 return Builder.CreateVectorReverse(Vec, "reverse");
2650}
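
For a fixed-width VF, CreateVectorReverse amounts to a shuffle with mask <VF-1, VF-2, ..., 1, 0>; a scalar sketch of that semantics (illustrative only; the function name is hypothetical):

#include <algorithm>
#include <vector>
std::vector<long> reverseVectorLanes(std::vector<long> Lanes) {
  std::reverse(Lanes.begin(), Lanes.end()); // lane i becomes lane VF-1-i
  return Lanes;
}
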
2651
2652// Return whether we allow using masked interleave-groups (for dealing with
2653// strided loads/stores that reside in predicated blocks, or for dealing
2654// with gaps).
2655static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2656 // If an override option has been passed in for interleaved accesses, use it.
2657 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2658 return EnableMaskedInterleavedMemAccesses;
2659
2660 return TTI.enableMaskedInterleavedAccessVectorization();
2661}
2662
2663// Try to vectorize the interleave group that \p Instr belongs to.
2664//
2665// E.g. Translate following interleaved load group (factor = 3):
2666// for (i = 0; i < N; i+=3) {
2667// R = Pic[i]; // Member of index 0
2668// G = Pic[i+1]; // Member of index 1
2669// B = Pic[i+2]; // Member of index 2
2670// ... // do something to R, G, B
2671// }
2672// To:
2673// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2674// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2675// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2676// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2677//
2678// Or translate following interleaved store group (factor = 3):
2679// for (i = 0; i < N; i+=3) {
2680// ... do something to R, G, B
2681// Pic[i] = R; // Member of index 0
2682// Pic[i+1] = G; // Member of index 1
2683// Pic[i+2] = B; // Member of index 2
2684// }
2685// To:
2686// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2687// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2688// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2689// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2690// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2691void InnerLoopVectorizer::vectorizeInterleaveGroup(
2692 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2693 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2694 VPValue *BlockInMask) {
2695 Instruction *Instr = Group->getInsertPos();
2696 const DataLayout &DL = Instr->getModule()->getDataLayout();
2697
2698 // Prepare for the vector type of the interleaved load/store.
2699 Type *ScalarTy = getLoadStoreType(Instr);
2700 unsigned InterleaveFactor = Group->getFactor();
2701 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2702 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2703
2704 // Prepare for the new pointers.
2705 SmallVector<Value *, 2> AddrParts;
2706 unsigned Index = Group->getIndex(Instr);
2707
2708 // TODO: extend the masked interleaved-group support to reversed access.
2709 assert((!BlockInMask || !Group->isReverse()) &&
2710        "Reversed masked interleave-group not supported.");
2711
2712 // If the group is reverse, adjust the index to refer to the last vector lane
2713 // instead of the first. We adjust the index from the first vector lane,
2714 // rather than directly getting the pointer for lane VF - 1, because the
2715 // pointer operand of the interleaved access is supposed to be uniform. For
2716 // uniform instructions, we're only required to generate a value for the
2717 // first vector lane in each unroll iteration.
2718 if (Group->isReverse())
2719 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2720
2721 for (unsigned Part = 0; Part < UF; Part++) {
2722 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2723 setDebugLocFromInst(Builder, AddrPart);
2724
2725     // Notice that the current instruction could be at any member index. We need
2726     // to adjust the address to the member of index 0.
2727 //
2728 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2729 // b = A[i]; // Member of index 0
2730     // The current pointer points to A[i+1]; adjust it to A[i].
2731 //
2732 // E.g. A[i+1] = a; // Member of index 1
2733 // A[i] = b; // Member of index 0
2734 // A[i+2] = c; // Member of index 2 (Current instruction)
2735     // The current pointer points to A[i+2]; adjust it to A[i].
2736
2737 bool InBounds = false;
2738 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2739 InBounds = gep->isInBounds();
2740 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2741 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2742
2743 // Cast to the vector pointer type.
2744 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2745 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2746 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2747 }
2748
2749 setDebugLocFromInst(Builder, Instr);
2750 Value *PoisonVec = PoisonValue::get(VecTy);
2751
2752 Value *MaskForGaps = nullptr;
2753 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2754 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2755   assert(MaskForGaps && "Mask for Gaps is required but it is null");
2756 }
2757
2758 // Vectorize the interleaved load group.
2759 if (isa<LoadInst>(Instr)) {
2760 // For each unroll part, create a wide load for the group.
2761 SmallVector<Value *, 2> NewLoads;
2762 for (unsigned Part = 0; Part < UF; Part++) {
2763 Instruction *NewLoad;
2764 if (BlockInMask || MaskForGaps) {
2765       assert(useMaskedInterleavedAccesses(*TTI) &&
2766              "masked interleaved groups are not allowed.");
2767 Value *GroupMask = MaskForGaps;
2768 if (BlockInMask) {
2769 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2770 Value *ShuffledMask = Builder.CreateShuffleVector(
2771 BlockInMaskPart,
2772 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2773 "interleaved.mask");
2774 GroupMask = MaskForGaps
2775 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2776 MaskForGaps)
2777 : ShuffledMask;
2778 }
2779 NewLoad =
2780 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2781 GroupMask, PoisonVec, "wide.masked.vec");
2782 }
2783 else
2784 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2785 Group->getAlign(), "wide.vec");
2786 Group->addMetadata(NewLoad);
2787 NewLoads.push_back(NewLoad);
2788 }
2789
2790 // For each member in the group, shuffle out the appropriate data from the
2791 // wide loads.
2792 unsigned J = 0;
2793 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2794 Instruction *Member = Group->getMember(I);
2795
2796 // Skip the gaps in the group.
2797 if (!Member)
2798 continue;
2799
2800 auto StrideMask =
2801 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2802 for (unsigned Part = 0; Part < UF; Part++) {
2803 Value *StridedVec = Builder.CreateShuffleVector(
2804 NewLoads[Part], StrideMask, "strided.vec");
2805
2806       // If this member has a different type, cast the result to that type.
2807 if (Member->getType() != ScalarTy) {
2808         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2809 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2810 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2811 }
2812
2813 if (Group->isReverse())
2814 StridedVec = reverseVector(StridedVec);
2815
2816 State.set(VPDefs[J], StridedVec, Part);
2817 }
2818 ++J;
2819 }
2820 return;
2821 }
2822
2823 // The subvector type for the current instruction.
2824 auto *SubVT = VectorType::get(ScalarTy, VF);
2825
2826 // Vectorize the interleaved store group.
2827 for (unsigned Part = 0; Part < UF; Part++) {
2828 // Collect the stored vector from each member.
2829 SmallVector<Value *, 4> StoredVecs;
2830 for (unsigned i = 0; i < InterleaveFactor; i++) {
2831 // Interleaved store group doesn't allow a gap, so each index has a member
2832     assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2833
2834 Value *StoredVec = State.get(StoredValues[i], Part);
2835
2836 if (Group->isReverse())
2837 StoredVec = reverseVector(StoredVec);
2838
2839     // If this member has a different type, cast it to the unified type.
2840
2841 if (StoredVec->getType() != SubVT)
2842 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2843
2844 StoredVecs.push_back(StoredVec);
2845 }
2846
2847 // Concatenate all vectors into a wide vector.
2848 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2849
2850 // Interleave the elements in the wide vector.
2851 Value *IVec = Builder.CreateShuffleVector(
2852 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2853 "interleaved.vec");
2854
2855 Instruction *NewStoreInstr;
2856 if (BlockInMask) {
2857 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2858 Value *ShuffledMask = Builder.CreateShuffleVector(
2859 BlockInMaskPart,
2860 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2861 "interleaved.mask");
2862 NewStoreInstr = Builder.CreateMaskedStore(
2863 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2864 }
2865 else
2866 NewStoreInstr =
2867 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2868
2869 Group->addMetadata(NewStoreInstr);
2870 }
2871}
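
The shuffle masks used above follow a simple pattern; the sketch below is intended to mirror the semantics of the createStrideMask and createInterleaveMask helpers for a fixed-width VF (illustrative only, not the LLVM implementation):

#include <vector>
// De-interleave: pick member Start out of Factor interleaved members.
std::vector<int> strideMask(unsigned Start, unsigned Factor, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(Start + I * Factor); // e.g. <0, 3, 6, 9> for factor 3, VF 4
  return Mask;
}
// Interleave: merge Factor concatenated vectors of VF lanes each.
std::vector<int> interleaveMask(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(J * VF + I); // e.g. <0,4,8, 1,5,9, 2,6,10, 3,7,11>
  return Mask;
}
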
2872
2873void InnerLoopVectorizer::vectorizeMemoryInstruction(
2874 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2875 VPValue *StoredValue, VPValue *BlockInMask) {
2876 // Attempt to issue a wide load.
2877 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2878 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2879
2880 assert((LI || SI) && "Invalid Load/Store instruction");
2881 assert((!SI || StoredValue) && "No stored value provided for widened store");
2882 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2883
2884 LoopVectorizationCostModel::InstWidening Decision =
2885 Cost->getWideningDecision(Instr, VF);
2886 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2887         Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2888         Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2889        "CM decision is not to widen the memory instruction");
2890
2891 Type *ScalarDataTy = getLoadStoreType(Instr);
2892
2893 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2894 const Align Alignment = getLoadStoreAlignment(Instr);
2895
2896 // Determine if the pointer operand of the access is either consecutive or
2897 // reverse consecutive.
2898 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2899 bool ConsecutiveStride =
2900 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2901 bool CreateGatherScatter =
2902 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2903
2904 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2905 // gather/scatter. Otherwise Decision should have been to Scalarize.
2906 assert((ConsecutiveStride || CreateGatherScatter) &&
2907        "The instruction should be scalarized");
2908 (void)ConsecutiveStride;
2909
2910 VectorParts BlockInMaskParts(UF);
2911 bool isMaskRequired = BlockInMask;
2912 if (isMaskRequired)
2913 for (unsigned Part = 0; Part < UF; ++Part)
2914 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2915
2916 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2917 // Calculate the pointer for the specific unroll-part.
2918 GetElementPtrInst *PartPtr = nullptr;
2919
2920 bool InBounds = false;
2921 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2922 InBounds = gep->isInBounds();
2923 if (Reverse) {
2924 // If the address is consecutive but reversed, then the
2925 // wide store needs to start at the last vector element.
2926 // RunTimeVF = VScale * VF.getKnownMinValue()
2927 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
2928 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2929 // NumElt = -Part * RunTimeVF
2930 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2931 // LastLane = 1 - RunTimeVF
2932 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2933 PartPtr =
2934 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2935 PartPtr->setIsInBounds(InBounds);
2936 PartPtr = cast<GetElementPtrInst>(
2937 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2938 PartPtr->setIsInBounds(InBounds);
2939 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2940 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2941 } else {
2942 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2943 PartPtr = cast<GetElementPtrInst>(
2944 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2945 PartPtr->setIsInBounds(InBounds);
2946 }
2947
2948 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2949 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2950 };
2951
2952 // Handle Stores:
2953 if (SI) {
2954 setDebugLocFromInst(Builder, SI);
2955
2956 for (unsigned Part = 0; Part < UF; ++Part) {
2957 Instruction *NewSI = nullptr;
2958 Value *StoredVal = State.get(StoredValue, Part);
2959 if (CreateGatherScatter) {
2960 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2961 Value *VectorGep = State.get(Addr, Part);
2962 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2963 MaskPart);
2964 } else {
2965 if (Reverse) {
2966 // If we store to reverse consecutive memory locations, then we need
2967 // to reverse the order of elements in the stored value.
2968 StoredVal = reverseVector(StoredVal);
2969 // We don't want to update the value in the map as it might be used in
2970 // another expression. So don't call resetVectorValue(StoredVal).
2971 }
2972 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2973 if (isMaskRequired)
2974 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2975 BlockInMaskParts[Part]);
2976 else
2977 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2978 }
2979 addMetadata(NewSI, SI);
2980 }
2981 return;
2982 }
2983
2984 // Handle loads.
2985 assert(LI && "Must have a load instruction");
2986 setDebugLocFromInst(Builder, LI);
2987 for (unsigned Part = 0; Part < UF; ++Part) {
2988 Value *NewLI;
2989 if (CreateGatherScatter) {
2990 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2991 Value *VectorGep = State.get(Addr, Part);
2992 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2993 nullptr, "wide.masked.gather");
2994 addMetadata(NewLI, LI);
2995 } else {
2996 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2997 if (isMaskRequired)
2998 NewLI = Builder.CreateMaskedLoad(
2999 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
3000 "wide.masked.load");
3001 else
3002 NewLI =
3003 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3004
3005 // Add metadata to the load, but setVectorValue to the reverse shuffle.
3006 addMetadata(NewLI, LI);
3007 if (Reverse)
3008 NewLI = reverseVector(NewLI);
3009 }
3010
3011 State.set(Def, NewLI, Part);
3012 }
3013}
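
The CreateVecPtr lambda above offsets the base pointer once per unroll part; in element units, for a fixed-width VF, those offsets reduce to the sketch below (illustrative only; the function name is hypothetical):

long partPointerOffset(bool Reverse, unsigned Part, unsigned VF) {
  if (!Reverse)
    return (long)Part * VF;                     // consecutive: parts step forward
  // Reversed: NumElt = -Part * VF, then LastLane = 1 - VF, so the wide access
  // starts at the last element covered by this part.
  return -(long)(Part * VF) + (1 - (long)VF);
}
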
3014
3015void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3016 VPUser &User,
3017 const VPIteration &Instance,
3018 bool IfPredicateInstr,
3019 VPTransformState &State) {
3020 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3021
3022 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3023 // the first lane and part.
3024 if (isa<NoAliasScopeDeclInst>(Instr))
3025 if (!Instance.isFirstIteration())
3026 return;
3027
3028 setDebugLocFromInst(Builder, Instr);
3029
3030 // Does this instruction return a value ?
3031 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3032
3033 Instruction *Cloned = Instr->clone();
3034 if (!IsVoidRetTy)
3035 Cloned->setName(Instr->getName() + ".cloned");
3036
3037 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3038 Builder.GetInsertPoint());
3039 // Replace the operands of the cloned instructions with their scalar
3040 // equivalents in the new loop.
3041 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3042 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3043 auto InputInstance = Instance;
3044 if (!Operand || !OrigLoop->contains(Operand) ||
3045 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3046 InputInstance.Lane = VPLane::getFirstLane();
3047 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3048 Cloned->setOperand(op, NewOp);
3049 }
3050 addNewMetadata(Cloned, Instr);
3051
3052 // Place the cloned scalar in the new loop.
3053 Builder.Insert(Cloned);
3054
3055 State.set(Def, Cloned, Instance);
3056
3057 // If we just cloned a new assumption, add it the assumption cache.
3058 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3059 AC->registerAssumption(II);
3060
3061 // End if-block.
3062 if (IfPredicateInstr)
3063 PredicatedInstructions.push_back(Cloned);
3064}
3065
3066PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3067 Value *End, Value *Step,
3068 Instruction *DL) {
3069 BasicBlock *Header = L->getHeader();
3070 BasicBlock *Latch = L->getLoopLatch();
3071 // As we're just creating this loop, it's possible no latch exists
3072 // yet. If so, use the header as this will be a single block loop.
3073 if (!Latch)
3074 Latch = Header;
3075
3076 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3077 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3078 setDebugLocFromInst(Builder, OldInst);
3079 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3080
3081 Builder.SetInsertPoint(Latch->getTerminator());
3082 setDebugLocFromInst(Builder, OldInst);
3083
3084 // Create i+1 and fill the PHINode.
3085 //
3086 // If the tail is not folded, we know that End - Start >= Step (either
3087 // statically or through the minimum iteration checks). We also know that both
3088 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3089 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3090 // overflows and we can mark the induction increment as NUW.
3091 Value *Next =
3092 Builder.CreateAdd(Induction, Step, "index.next",
3093 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3094 Induction->addIncoming(Start, L->getLoopPreheader());
3095 Induction->addIncoming(Next, Latch);
3096 // Create the compare.
3097 Value *ICmp = Builder.CreateICmpEQ(Next, End);
3098 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3099
3100 // Now we have two terminators. Remove the old one from the block.
3101 Latch->getTerminator()->eraseFromParent();
3102
3103 return Induction;
3104}
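
The control flow built above corresponds to a do-while loop over the vectorized iteration space; a sketch of its semantics (illustrative only; the function name is hypothetical):

void vectorLoopSkeleton(long Start, long End, long Step) {
  long Index = Start;     // the "index" phi
  do {
    // vector loop body runs here with Index as the canonical induction variable
    Index += Step;        // "index.next"; marked NUW when the tail is not folded
  } while (Index != End); // exit once index.next reaches the (rounded) trip count
}
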
3105
3106Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3107 if (TripCount)
3108 return TripCount;
3109
3110 assert(L && "Create Trip Count for null loop.");
3111 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3112 // Find the loop boundaries.
3113 ScalarEvolution *SE = PSE.getSE();
3114 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3115 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3116        "Invalid loop count");
3117
3118 Type *IdxTy = Legal->getWidestInductionType();
3119 assert(IdxTy && "No type for induction");
3120
3121 // The exit count might have type i64 while the phi is i32. This can happen
3122 // if we have an induction variable that is sign-extended before the compare.
3123 // The only way we can get a backedge-taken count in that case is if the
3124 // induction variable was signed and as such will not overflow. In that case
3125 // the truncation is legal.
3126 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3127 IdxTy->getPrimitiveSizeInBits())
3128 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3129 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3130
3131 // Get the total trip count from the count by adding 1.
3132 const SCEV *ExitCount = SE->getAddExpr(
3133 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3134
3135 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3136
3137 // Expand the trip count and place the new instructions in the preheader.
3138 // Notice that the pre-header does not change, only the loop body.
3139 SCEVExpander Exp(*SE, DL, "induction");
3140
3141 // Count holds the overall loop count (N).
3142 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3143 L->getLoopPreheader()->getTerminator());
3144
3145 if (TripCount->getType()->isPointerTy())
3146 TripCount =
3147 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3148 L->getLoopPreheader()->getTerminator());
3149
3150 return TripCount;
3151}
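Aside: a standalone sketch of the arithmetic performed by getOrCreateTripCount, i.e. trip count = backedge-taken count + 1 evaluated in the widest induction type. The helper name is hypothetical and this is not LLVM code; the real implementation above also handles truncation/extension and pointer-typed counts.

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: a loop "for (i = 0; i < N; ++i)" takes the backedge
    // N - 1 times, so the trip count is the backedge-taken count plus one.
    static uint64_t tripCountFromBTC(uint64_t BackedgeTakenCount) {
      return BackedgeTakenCount + 1; // may wrap if BTC == UINT64_MAX; guarded later
    }

    int main() {
      assert(tripCountFromBTC(9) == 10);
      return 0;
    }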
3152
3153Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3154 if (VectorTripCount)
3155 return VectorTripCount;
3156
3157 Value *TC = getOrCreateTripCount(L);
3158 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3159
3160 Type *Ty = TC->getType();
3161 // This is where we can make the step a runtime constant.
3162 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3163
3164 // If the tail is to be folded by masking, round the number of iterations N
3165 // up to a multiple of Step instead of rounding down. This is done by first
3166 // adding Step-1 and then rounding down. Note that it's ok if this addition
3167 // overflows: the vector induction variable will eventually wrap to zero given
3168 // that it starts at zero and its Step is a power of two; the loop will then
3169 // exit, with the last early-exit vector comparison also producing all-true.
3170 if (Cost->foldTailByMasking()) {
3171 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3172 "VF*UF must be a power of 2 when folding tail by masking");
3173 assert(!VF.isScalable() &&
3174 "Tail folding not yet supported for scalable vectors");
3175 TC = Builder.CreateAdd(
3176 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3177 }
3178
3179 // Now we need to generate the expression for the part of the loop that the
3180 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3181 // iterations are not required for correctness, or N - Step, otherwise. Step
3182 // is equal to the vectorization factor (number of SIMD elements) times the
3183 // unroll factor (number of SIMD instructions).
3184 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3185
3186 // There are two cases where we need to ensure (at least) the last iteration
3187 // runs in the scalar remainder loop. Thus, if the step evenly divides
3188 // the trip count, we set the remainder to be equal to the step. If the step
3189 // does not evenly divide the trip count, no adjustment is necessary since
3190 // there will already be scalar iterations. Note that the minimum iterations
3191 // check ensures that N >= Step. The cases are:
3192 // 1) If there is a non-reversed interleaved group that may speculatively
3193 // access memory out-of-bounds.
3194 // 2) If any instruction may follow a conditionally taken exit. That is, if
3195 // the loop contains multiple exiting blocks, or a single exiting block
3196 // which is not the latch.
3197 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3198 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3199 R = Builder.CreateSelect(IsZero, Step, R);
3200 }
3201
3202 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3203
3204 return VectorTripCount;
3205}
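Aside: the rounding logic above, reduced to plain integers. This is a sketch with a hypothetical helper, not LLVM code; Step stands for VF * UF, and tail folding and a required scalar epilogue are mutually exclusive in practice.

    #include <cassert>
    #include <cstdint>

    static uint64_t vectorTripCount(uint64_t TC, uint64_t Step, bool FoldTail,
                                    bool RequiresScalarEpilogue) {
      if (FoldTail)
        TC += Step - 1;            // round N up to a multiple of Step
      uint64_t R = TC % Step;      // n.mod.vf
      if (RequiresScalarEpilogue && R == 0)
        R = Step;                  // keep a full Step of scalar iterations
      return TC - R;               // n.vec
    }

    int main() {
      assert(vectorTripCount(17, 8, false, false) == 16); // remainder runs 1 scalar iter
      assert(vectorTripCount(16, 8, false, true) == 8);   // epilogue forced: 8 scalar iters
      assert(vectorTripCount(17, 8, true, false) == 24);  // rounded up, tail masked
      return 0;
    }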
3206
3207Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3208 const DataLayout &DL) {
3209 // Verify that V is a vector type with same number of elements as DstVTy.
3210 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3211 unsigned VF = DstFVTy->getNumElements();
3212 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3213 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3214 Type *SrcElemTy = SrcVecTy->getElementType();
3215 Type *DstElemTy = DstFVTy->getElementType();
3216 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3217 "Vector elements must have same size");
3218
3219 // Do a direct cast if element types are castable.
3220 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3221 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3222 }
3223 // V cannot be directly casted to desired vector type.
3224 // May happen when V is a floating point vector but DstVTy is a vector of
3225 // pointers or vice-versa. Handle this using a two-step bitcast using an
3226 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3227 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3228 "Only one type should be a pointer type");
3229 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3230 "Only one type should be a floating point type");
3231 Type *IntTy =
3232 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3233 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3234 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3235 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3236}
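Aside: the Ptr <-> Int <-> Float two-step idea used above can be illustrated on scalars by punning a float through a same-width integer. A standalone sketch, not LLVM code:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t floatToBits(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits)); // step 1: source type -> same-width integer
      return Bits;
    }

    static float bitsToFloat(uint32_t Bits) {
      float F;
      std::memcpy(&F, &Bits, sizeof(F));    // step 2: integer -> destination type
      return F;
    }

    int main() {
      float Original = 3.5f;
      assert(bitsToFloat(floatToBits(Original)) == Original); // round-trips exactly
      return 0;
    }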
3237
3238void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3239 BasicBlock *Bypass) {
3240 Value *Count = getOrCreateTripCount(L);
3241 // Reuse existing vector loop preheader for TC checks.
3242 // Note that new preheader block is generated for vector loop.
3243 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3244 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3245
3246 // Generate code to check if the loop's trip count is less than VF * UF, or
3247 // equal to it in case a scalar epilogue is required; this implies that the
3248 // vector trip count is zero. This check also covers the case where adding one
3249 // to the backedge-taken count overflowed leading to an incorrect trip count
3250 // of zero. In this case we will also jump to the scalar loop.
3251 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3252 : ICmpInst::ICMP_ULT;
3253
3254 // If tail is to be folded, vector loop takes care of all iterations.
3255 Value *CheckMinIters = Builder.getFalse();
3256 if (!Cost->foldTailByMasking()) {
3257 Value *Step =
3258 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3259 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3260 }
3261 // Create new preheader for vector loop.
3262 LoopVectorPreHeader =
3263 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3264 "vector.ph");
3265
3266 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3267 DT->getNode(Bypass)->getIDom()) &&
3268 "TC check is expected to dominate Bypass");
3269
3270 // Update dominator for Bypass & LoopExit.
3271 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3272 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3273
3274 ReplaceInstWithInst(
3275 TCCheckBlock->getTerminator(),
3276 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3277 LoopBypassBlocks.push_back(TCCheckBlock);
3278}
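Aside: the predicate choice above (ICMP_ULE when a scalar epilogue is required, ICMP_ULT otherwise) in plain arithmetic. Hypothetical helper, not LLVM code; Step stands for VF * UF.

    #include <cassert>
    #include <cstdint>

    static bool takeScalarLoopBypass(uint64_t Count, uint64_t Step,
                                     bool RequiresScalarEpilogue, bool FoldTail) {
      if (FoldTail)
        return false;                      // vector loop handles all iterations
      return RequiresScalarEpilogue ? Count <= Step   // need strictly more than Step
                                    : Count < Step;   // need at least Step
    }

    int main() {
      assert(takeScalarLoopBypass(7, 8, false, false));  // too few iterations
      assert(!takeScalarLoopBypass(8, 8, false, false)); // exactly one vector iteration
      assert(takeScalarLoopBypass(8, 8, true, false));   // epilogue needed -> bypass
      return 0;
    }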
3279
3280BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3281
3282 BasicBlock *const SCEVCheckBlock =
3283 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3284 if (!SCEVCheckBlock)
3285 return nullptr;
3286
3287 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3288 (OptForSizeBasedOnProfile &&
3289 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3290 "Cannot SCEV check stride or overflow when optimizing for size");
3291
3292
3293 // Update dominator only if this is first RT check.
3294 if (LoopBypassBlocks.empty()) {
3295 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3296 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3297 }
3298
3299 LoopBypassBlocks.push_back(SCEVCheckBlock);
3300 AddedSafetyChecks = true;
3301 return SCEVCheckBlock;
3302}
3303
3304BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3305 BasicBlock *Bypass) {
3306 // VPlan-native path does not do any analysis for runtime checks currently.
3307 if (EnableVPlanNativePath)
3308 return nullptr;
3309
3310 BasicBlock *const MemCheckBlock =
3311 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3312
3313 // Check if we generated code that checks in runtime if arrays overlap. We put
3314 // the checks into a separate block to make the more common case of few
3315 // elements faster.
3316 if (!MemCheckBlock)
3317 return nullptr;
3318
3319 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3320 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3321 "Cannot emit memory checks when optimizing for size, unless forced "
3322 "to vectorize.");
3323 ORE->emit([&]() {
3324 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3325 L->getStartLoc(), L->getHeader())
3326 << "Code-size may be reduced by not forcing "
3327 "vectorization, or by source-code modifications "
3328 "eliminating the need for runtime checks "
3329 "(e.g., adding 'restrict').";
3330 });
3331 }
3332
3333 LoopBypassBlocks.push_back(MemCheckBlock);
3334
3335 AddedSafetyChecks = true;
3336
3337 // We currently don't use LoopVersioning for the actual loop cloning but we
3338 // still use it to add the noalias metadata.
3339 LVer = std::make_unique<LoopVersioning>(
3340 *Legal->getLAI(),
3341 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3342 DT, PSE.getSE());
3343 LVer->prepareNoAliasMetadata();
3344 return MemCheckBlock;
3345}
3346
3347Value *InnerLoopVectorizer::emitTransformedIndex(
3348 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3349 const InductionDescriptor &ID) const {
3350
3351 SCEVExpander Exp(*SE, DL, "induction");
3352 auto Step = ID.getStep();
3353 auto StartValue = ID.getStartValue();
3354 assert(Index->getType()->getScalarType() == Step->getType() &&
3355 "Index scalar type does not match StepValue type");
3356
3357 // Note: the IR at this point is broken. We cannot use SE to create any new
3358 // SCEV and then expand it, hoping that SCEV's simplification will give us
3359 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3360 // lead to various SCEV crashes. So all we can do is to use builder and rely
3361 // on InstCombine for future simplifications. Here we handle some trivial
3362 // cases only.
3363 auto CreateAdd = [&B](Value *X, Value *Y) {
3364 assert(X->getType() == Y->getType() && "Types don't match!");
3365 if (auto *CX = dyn_cast<ConstantInt>(X))
3366 if (CX->isZero())
3367 return Y;
3368 if (auto *CY = dyn_cast<ConstantInt>(Y))
3369 if (CY->isZero())
3370 return X;
3371 return B.CreateAdd(X, Y);
3372 };
3373
3374 // We allow X to be a vector type, in which case Y will potentially be
3375 // splatted into a vector with the same element count.
3376 auto CreateMul = [&B](Value *X, Value *Y) {
3377 assert(X->getType()->getScalarType() == Y->getType() &&
3378 "Types don't match!");
3379 if (auto *CX = dyn_cast<ConstantInt>(X))
3380 if (CX->isOne())
3381 return Y;
3382 if (auto *CY = dyn_cast<ConstantInt>(Y))
3383 if (CY->isOne())
3384 return X;
3385 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3386 if (XVTy && !isa<VectorType>(Y->getType()))
3387 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3388 return B.CreateMul(X, Y);
3389 };
3390
3391 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3392 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3393 // the DomTree is not kept up-to-date for additional blocks generated in the
3394 // vector loop. By using the header as insertion point, we guarantee that the
3395 // expanded instructions dominate all their uses.
3396 auto GetInsertPoint = [this, &B]() {
3397 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3398 if (InsertBB != LoopVectorBody &&
3399 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3400 return LoopVectorBody->getTerminator();
3401 return &*B.GetInsertPoint();
3402 };
3403
3404 switch (ID.getKind()) {
3405 case InductionDescriptor::IK_IntInduction: {
3406 assert(!isa<VectorType>(Index->getType()) &&
3407 "Vector indices not supported for integer inductions yet");
3408 assert(Index->getType() == StartValue->getType() &&
3409 "Index type does not match StartValue type");
3410 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3411 return B.CreateSub(StartValue, Index);
3412 auto *Offset = CreateMul(
3413 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3414 return CreateAdd(StartValue, Offset);
3415 }
3416 case InductionDescriptor::IK_PtrInduction: {
3417 assert(isa<SCEVConstant>(Step) &&
3418 "Expected constant step for pointer induction");
3419 return B.CreateGEP(
3420 StartValue->getType()->getPointerElementType(), StartValue,
3421 CreateMul(Index,
3422 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3423 GetInsertPoint())));
3424 }
3425 case InductionDescriptor::IK_FpInduction: {
3426 assert(!isa<VectorType>(Index->getType()) &&
3427 "Vector indices not supported for FP inductions yet");
3428 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3429 auto InductionBinOp = ID.getInductionBinOp();
3430 assert(InductionBinOp &&
3431 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3432 InductionBinOp->getOpcode() == Instruction::FSub) &&
3433 "Original bin op should be defined for FP induction");
3434
3435 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3436 Value *MulExp = B.CreateFMul(StepValue, Index);
3437 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3438 "induction");
3439 }
3440 case InductionDescriptor::IK_NoInduction:
3441 return nullptr;
3442 }
3443 llvm_unreachable("invalid enum");
3444}
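Aside: the integer and FP transforms computed by emitTransformedIndex reduce to Start + Index * Step (or Start - Index * Step for an FSub-based FP induction); the pointer case is the analogous GEP. A standalone sketch with hypothetical helpers, not LLVM code:

    #include <cassert>

    static long transformedIntIndex(long Start, long Index, long Step) {
      return Start + Index * Step;        // IK_IntInduction
    }

    static double transformedFPIndex(double Start, double Index, double Step,
                                     bool IsFSub) {
      double Mul = Step * Index;          // IK_FpInduction
      return IsFSub ? Start - Mul : Start + Mul;
    }

    int main() {
      assert(transformedIntIndex(10, 3, 4) == 22);
      assert(transformedFPIndex(1.0, 3.0, 0.5, false) == 2.5);
      return 0;
    }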
3445
3446Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3447 LoopScalarBody = OrigLoop->getHeader();
3448 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3449 LoopExitBlock = OrigLoop->getUniqueExitBlock();
3450 assert(LoopExitBlock && "Must have an exit block");
3451 assert(LoopVectorPreHeader && "Invalid loop structure");
3452
3453 LoopMiddleBlock =
3454 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3455 LI, nullptr, Twine(Prefix) + "middle.block");
3456 LoopScalarPreHeader =
3457 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3458 nullptr, Twine(Prefix) + "scalar.ph");
3459
3460 // Set up branch from middle block to the exit and scalar preheader blocks.
3461 // completeLoopSkeleton will update the condition to use an iteration check,
3462 // if required to decide whether to execute the remainder.
3463 BranchInst *BrInst =
3464 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3465 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3466 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3467 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3468
3469 // We intentionally don't let SplitBlock to update LoopInfo since
3470 // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3471 // LoopVectorBody is explicitly added to the correct place few lines later.
3472 LoopVectorBody =
3473 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3474 nullptr, nullptr, Twine(Prefix) + "vector.body");
3475
3476 // Update dominator for loop exit.
3477 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3478
3479 // Create and register the new vector loop.
3480 Loop *Lp = LI->AllocateLoop();
3481 Loop *ParentLoop = OrigLoop->getParentLoop();
3482
3483 // Insert the new loop into the loop nest and register the new basic blocks
3484 // before calling any utilities such as SCEV that require valid LoopInfo.
3485 if (ParentLoop) {
3486 ParentLoop->addChildLoop(Lp);
3487 } else {
3488 LI->addTopLevelLoop(Lp);
3489 }
3490 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3491 return Lp;
3492}
3493
3494void InnerLoopVectorizer::createInductionResumeValues(
3495 Loop *L, Value *VectorTripCount,
3496 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3497 assert(VectorTripCount && L && "Expected valid arguments");
3498 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3499 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3500 "Inconsistent information about additional bypass.");
3501 // We are going to resume the execution of the scalar loop.
3502 // Go over all of the induction variables that we found and fix the
3503 // PHIs that are left in the scalar version of the loop.
3504 // The starting values of PHI nodes depend on the counter of the last
3505 // iteration in the vectorized loop.
3506 // If we come from a bypass edge then we need to start from the original
3507 // start value.
3508 for (auto &InductionEntry : Legal->getInductionVars()) {
3509 PHINode *OrigPhi = InductionEntry.first;
3510 InductionDescriptor II = InductionEntry.second;
3511
3512 // Create phi nodes to merge from the backedge-taken check block.
3513 PHINode *BCResumeVal =
3514 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3515 LoopScalarPreHeader->getTerminator());
3516 // Copy original phi DL over to the new one.
3517 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3518 Value *&EndValue = IVEndValues[OrigPhi];
3519 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3520 if (OrigPhi == OldInduction) {
3521 // We know what the end value is.
3522 EndValue = VectorTripCount;
3523 } else {
3524 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3525
3526 // Fast-math-flags propagate from the original induction instruction.
3527 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3528 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3529
3530 Type *StepType = II.getStep()->getType();
3531 Instruction::CastOps CastOp =
3532 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3533 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3534 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3535 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3536 EndValue->setName("ind.end");
3537
3538 // Compute the end value for the additional bypass (if applicable).
3539 if (AdditionalBypass.first) {
3540 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3541 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3542 StepType, true);
3543 CRD =
3544 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3545 EndValueFromAdditionalBypass =
3546 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3547 EndValueFromAdditionalBypass->setName("ind.end");
3548 }
3549 }
3550 // The new PHI merges the original incoming value, in case of a bypass,
3551 // or the value at the end of the vectorized loop.
3552 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3553
3554 // Fix the scalar body counter (PHI node).
3555 // The old induction's phi node in the scalar body needs the truncated
3556 // value.
3557 for (BasicBlock *BB : LoopBypassBlocks)
3558 BCResumeVal->addIncoming(II.getStartValue(), BB);
3559
3560 if (AdditionalBypass.first)
3561 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3562 EndValueFromAdditionalBypass);
3563
3564 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3565 }
3566}
3567
3568BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3569 MDNode *OrigLoopID) {
3570 assert(L && "Expected valid loop.");
3571
3572 // The trip counts should be cached by now.
3573 Value *Count = getOrCreateTripCount(L);
3574 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3575
3576 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3577
3578 // Add a check in the middle block to see if we have completed
3579 // all of the iterations in the first vector loop.
3580 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3581 // If tail is to be folded, we know we don't need to run the remainder.
3582 if (!Cost->foldTailByMasking()) {
3583 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3584 Count, VectorTripCount, "cmp.n",
3585 LoopMiddleBlock->getTerminator());
3586
3587 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3588 // of the corresponding compare because they may have ended up with
3589 // different line numbers and we want to avoid awkward line stepping while
3590 // debugging. Eg. if the compare has got a line number inside the loop.
3591 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3592 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3593 }
3594
3595 // Get ready to start creating new instructions into the vectorized body.
3596 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3597 "Inconsistent vector loop preheader");
3598 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3599
3600 Optional<MDNode *> VectorizedLoopID =
3601 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3602 LLVMLoopVectorizeFollowupVectorized});
3603 if (VectorizedLoopID.hasValue()) {
3604 L->setLoopID(VectorizedLoopID.getValue());
3605
3606 // Do not setAlreadyVectorized if loop attributes have been defined
3607 // explicitly.
3608 return LoopVectorPreHeader;
3609 }
3610
3611 // Keep all loop hints from the original loop on the vector loop (we'll
3612 // replace the vectorizer-specific hints below).
3613 if (MDNode *LID = OrigLoop->getLoopID())
3614 L->setLoopID(LID);
3615
3616 LoopVectorizeHints Hints(L, true, *ORE);
3617 Hints.setAlreadyVectorized();
3618
3619#ifdef EXPENSIVE_CHECKS
3620 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3621 LI->verify(*DT);
3622#endif
3623
3624 return LoopVectorPreHeader;
3625}
3626
3627BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3628 /*
3629 In this function we generate a new loop. The new loop will contain
3630 the vectorized instructions while the old loop will continue to run the
3631 scalar remainder.
3632
3633 [ ] <-- loop iteration number check.
3634 / |
3635 / v
3636 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3637 | / |
3638 | / v
3639 || [ ] <-- vector pre header.
3640 |/ |
3641 | v
3642 | [ ] \
3643 | [ ]_| <-- vector loop.
3644 | |
3645 | v
3646 | -[ ] <--- middle-block.
3647 | / |
3648 | / v
3649 -|- >[ ] <--- new preheader.
3650 | |
3651 | v
3652 | [ ] \
3653 | [ ]_| <-- old scalar loop to handle remainder.
3654 \ |
3655 \ v
3656 >[ ] <-- exit block.
3657 ...
3658 */
3659
3660 // Get the metadata of the original loop before it gets modified.
3661 MDNode *OrigLoopID = OrigLoop->getLoopID();
3662
3663 // Workaround! Compute the trip count of the original loop and cache it
3664 // before we start modifying the CFG. This code has a systemic problem
3665 // wherein it tries to run analysis over partially constructed IR; this is
3666 // wrong, and not simply for SCEV. The trip count of the original loop
3667 // simply happens to be prone to hitting this in practice. In theory, we
3668 // can hit the same issue for any SCEV, or ValueTracking query done during
3669 // mutation. See PR49900.
3670 getOrCreateTripCount(OrigLoop);
3671
3672 // Create an empty vector loop, and prepare basic blocks for the runtime
3673 // checks.
3674 Loop *Lp = createVectorLoopSkeleton("");
3675
3676 // Now, compare the new count to zero. If it is zero skip the vector loop and
3677 // jump to the scalar loop. This check also covers the case where the
3678 // backedge-taken count is uint##_max: adding one to it will overflow leading
3679 // to an incorrect trip count of zero. In this (rare) case we will also jump
3680 // to the scalar loop.
3681 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3682
3683 // Generate the code to check any assumptions that we've made for SCEV
3684 // expressions.
3685 emitSCEVChecks(Lp, LoopScalarPreHeader);
3686
3687 // Generate the code that checks in runtime if arrays overlap. We put the
3688 // checks into a separate block to make the more common case of few elements
3689 // faster.
3690 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3691
3692 // Some loops have a single integer induction variable, while other loops
3693 // don't. One example is c++ iterators that often have multiple pointer
3694 // induction variables. In the code below we also support a case where we
3695 // don't have a single induction variable.
3696 //
3697 // We try to obtain an induction variable from the original loop as hard
3698 // as possible. However if we don't find one that:
3699 // - is an integer
3700 // - counts from zero, stepping by one
3701 // - is the size of the widest induction variable type
3702 // then we create a new one.
3703 OldInduction = Legal->getPrimaryInduction();
3704 Type *IdxTy = Legal->getWidestInductionType();
3705 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3706 // The loop step is equal to the vectorization factor (num of SIMD elements)
3707 // times the unroll factor (num of SIMD instructions).
3708 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3709 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3710 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3711 Induction =
3712 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3713 getDebugLocFromInstOrOperands(OldInduction));
3714
3715 // Emit phis for the new starting index of the scalar loop.
3716 createInductionResumeValues(Lp, CountRoundDown);
3717
3718 return completeLoopSkeleton(Lp, OrigLoopID);
3719}
3720
3721// Fix up external users of the induction variable. At this point, we are
3722// in LCSSA form, with all external PHIs that use the IV having one input value,
3723// coming from the remainder loop. We need those PHIs to also have a correct
3724// value for the IV when arriving directly from the middle block.
3725void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3726 const InductionDescriptor &II,
3727 Value *CountRoundDown, Value *EndValue,
3728 BasicBlock *MiddleBlock) {
3729 // There are two kinds of external IV usages - those that use the value
3730 // computed in the last iteration (the PHI) and those that use the penultimate
3731 // value (the value that feeds into the phi from the loop latch).
3732 // We allow both, but they, obviously, have different values.
3733
3734 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3735
3736 DenseMap<Value *, Value *> MissingVals;
3737
3738 // An external user of the last iteration's value should see the value that
3739 // the remainder loop uses to initialize its own IV.
3740 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3741 for (User *U : PostInc->users()) {
3742 Instruction *UI = cast<Instruction>(U);
3743 if (!OrigLoop->contains(UI)) {
3744 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3745 MissingVals[UI] = EndValue;
3746 }
3747 }
3748
3749 // An external user of the penultimate value need to see EndValue - Step.
3750 // The simplest way to get this is to recompute it from the constituent SCEVs,
3751 // that is Start + (Step * (CRD - 1)).
3752 for (User *U : OrigPhi->users()) {
3753 auto *UI = cast<Instruction>(U);
3754 if (!OrigLoop->contains(UI)) {
3755 const DataLayout &DL =
3756 OrigLoop->getHeader()->getModule()->getDataLayout();
3757 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3758
3759 IRBuilder<> B(MiddleBlock->getTerminator());
3760
3761 // Fast-math-flags propagate from the original induction instruction.
3762 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3763 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3764
3765 Value *CountMinusOne = B.CreateSub(
3766 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3767 Value *CMO =
3768 !II.getStep()->getType()->isIntegerTy()
3769 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3770 II.getStep()->getType())
3771 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3772 CMO->setName("cast.cmo");
3773 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3774 Escape->setName("ind.escape");
3775 MissingVals[UI] = Escape;
3776 }
3777 }
3778
3779 for (auto &I : MissingVals) {
3780 PHINode *PHI = cast<PHINode>(I.first);
3781 // One corner case we have to handle is two IVs "chasing" each-other,
3782 // that is %IV2 = phi [...], [ %IV1, %latch ]
3783 // In this case, if IV1 has an external use, we need to avoid adding both
3784 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3785 // don't already have an incoming value for the middle block.
3786 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3787 PHI->addIncoming(I.second, MiddleBlock);
3788 }
3789}
3790
3791namespace {
3792
3793struct CSEDenseMapInfo {
3794 static bool canHandle(const Instruction *I) {
3795 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3796 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3797 }
3798
3799 static inline Instruction *getEmptyKey() {
3800 return DenseMapInfo<Instruction *>::getEmptyKey();
3801 }
3802
3803 static inline Instruction *getTombstoneKey() {
3804 return DenseMapInfo<Instruction *>::getTombstoneKey();
3805 }
3806
3807 static unsigned getHashValue(const Instruction *I) {
3808 assert(canHandle(I) && "Unknown instruction!");
3809 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3810 I->value_op_end()));
3811 }
3812
3813 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3814 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3815 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3816 return LHS == RHS;
3817 return LHS->isIdenticalTo(RHS);
3818 }
3819};
3820
3821} // end anonymous namespace
3822
3823///Perform cse of induction variable instructions.
3824static void cse(BasicBlock *BB) {
3825 // Perform simple cse.
3826 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3827 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3828 Instruction *In = &*I++;
3829
3830 if (!CSEDenseMapInfo::canHandle(In))
3831 continue;
3832
3833 // Check if we can replace this instruction with any of the
3834 // visited instructions.
3835 if (Instruction *V = CSEMap.lookup(In)) {
3836 In->replaceAllUsesWith(V);
3837 In->eraseFromParent();
3838 continue;
3839 }
3840
3841 CSEMap[In] = In;
3842 }
3843}
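Aside: the single-block CSE above, keyed on instruction identity, can be sketched with a map from (opcode, operands) to the first occurrence. Standalone toy example, not LLVM code:

    #include <cassert>
    #include <map>
    #include <string>
    #include <tuple>
    #include <vector>

    int main() {
      using Key = std::tuple<std::string, int, int>;   // (opcode, op0, op1)
      std::vector<Key> Block = {{"add", 1, 2}, {"mul", 3, 4}, {"add", 1, 2}};
      std::map<Key, size_t> CSEMap;                    // key -> first occurrence
      std::vector<size_t> ReplacedWith(Block.size());

      for (size_t I = 0; I < Block.size(); ++I) {
        auto It = CSEMap.find(Block[I]);
        if (It != CSEMap.end()) {
          ReplacedWith[I] = It->second;                // reuse the earlier result
          continue;
        }
        CSEMap[Block[I]] = I;
        ReplacedWith[I] = I;
      }
      assert(ReplacedWith[2] == 0 && "duplicate folded into first occurrence");
      return 0;
    }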
3844
3845InstructionCost
3846LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3847 bool &NeedToScalarize) const {
3848 Function *F = CI->getCalledFunction();
3849 Type *ScalarRetTy = CI->getType();
3850 SmallVector<Type *, 4> Tys, ScalarTys;
3851 for (auto &ArgOp : CI->arg_operands())
3852 ScalarTys.push_back(ArgOp->getType());
3853
3854 // Estimate cost of scalarized vector call. The source operands are assumed
3855 // to be vectors, so we need to extract individual elements from there,
3856 // execute VF scalar calls, and then gather the result into the vector return
3857 // value.
3858 InstructionCost ScalarCallCost =
3859 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3860 if (VF.isScalar())
3861 return ScalarCallCost;
3862
3863 // Compute corresponding vector type for return value and arguments.
3864 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3865 for (Type *ScalarTy : ScalarTys)
3866 Tys.push_back(ToVectorTy(ScalarTy, VF));
3867
3868 // Compute costs of unpacking argument values for the scalar calls and
3869 // packing the return values to a vector.
3870 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3871
3872 InstructionCost Cost =
3873 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3874
3875 // If we can't emit a vector call for this function, then the currently found
3876 // cost is the cost we need to return.
3877 NeedToScalarize = true;
3878 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3879 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3880
3881 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3882 return Cost;
3883
3884 // If the corresponding vector cost is cheaper, return its cost.
3885 InstructionCost VectorCallCost =
3886 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3887 if (VectorCallCost < Cost) {
3888 NeedToScalarize = false;
3889 Cost = VectorCallCost;
3890 }
3891 return Cost;
3892}
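Aside: the decision made by getVectorCallCost reduces to comparing ScalarCallCost * VF + ScalarizationOverhead against the cost of an available vector variant. A sketch with hypothetical cost values, not LLVM code:

    #include <cassert>

    static unsigned vectorCallCost(unsigned ScalarCallCost, unsigned VF,
                                   unsigned ScalarizationOverhead,
                                   bool HasVectorVariant, unsigned VectorVariantCost,
                                   bool &NeedToScalarize) {
      unsigned Cost = ScalarCallCost * VF + ScalarizationOverhead;
      NeedToScalarize = true;
      if (HasVectorVariant && VectorVariantCost < Cost) {
        NeedToScalarize = false;          // calling the vector variant is cheaper
        Cost = VectorVariantCost;
      }
      return Cost;
    }

    int main() {
      bool NeedToScalarize = false;
      unsigned Cost = vectorCallCost(10, /*VF=*/4, 6, true, 12, NeedToScalarize);
      assert(Cost == 12 && !NeedToScalarize);
      return 0;
    }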
3893
3894static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3895 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3896 return Elt;
3897 return VectorType::get(Elt, VF);
3898}
3899
3900InstructionCost
3901LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3902 ElementCount VF) const {
3903 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3904 assert(ID && "Expected intrinsic call!");
3905 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3906 FastMathFlags FMF;
3907 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3908 FMF = FPMO->getFastMathFlags();
3909
3910 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3911 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3912 SmallVector<Type *> ParamTys;
3913 std::transform(FTy->param_begin(), FTy->param_end(),
3914 std::back_inserter(ParamTys),
3915 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3916
3917 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3918 dyn_cast<IntrinsicInst>(CI));
3919 return TTI.getIntrinsicInstrCost(CostAttrs,
3920 TargetTransformInfo::TCK_RecipThroughput);
3921}
3922
3923static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3924 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3925 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3926 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3927}
3928
3929static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3930 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3931 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3932 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3933}
3934
3935void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3936 // For every instruction `I` in MinBWs, truncate the operands, create a
3937 // truncated version of `I` and reextend its result. InstCombine runs
3938 // later and will remove any ext/trunc pairs.
3939 SmallPtrSet<Value *, 4> Erased;
3940 for (const auto &KV : Cost->getMinimalBitwidths()) {
3941 // If the value wasn't vectorized, we must maintain the original scalar
3942 // type. The absence of the value from State indicates that it
3943 // wasn't vectorized.
3944 VPValue *Def = State.Plan->getVPValue(KV.first);
3945 if (!State.hasAnyVectorValue(Def))
3946 continue;
3947 for (unsigned Part = 0; Part < UF; ++Part) {
3948 Value *I = State.get(Def, Part);
3949 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3950 continue;
3951 Type *OriginalTy = I->getType();
3952 Type *ScalarTruncatedTy =
3953 IntegerType::get(OriginalTy->getContext(), KV.second);
3954 auto *TruncatedTy = FixedVectorType::get(
3955 ScalarTruncatedTy,
3956 cast<FixedVectorType>(OriginalTy)->getNumElements());
3957 if (TruncatedTy == OriginalTy)
3958 continue;
3959
3960 IRBuilder<> B(cast<Instruction>(I));
3961 auto ShrinkOperand = [&](Value *V) -> Value * {
3962 if (auto *ZI = dyn_cast<ZExtInst>(V))
3963 if (ZI->getSrcTy() == TruncatedTy)
3964 return ZI->getOperand(0);
3965 return B.CreateZExtOrTrunc(V, TruncatedTy);
3966 };
3967
3968 // The actual instruction modification depends on the instruction type,
3969 // unfortunately.
3970 Value *NewI = nullptr;
3971 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3972 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3973 ShrinkOperand(BO->getOperand(1)));
3974
3975 // Any wrapping introduced by shrinking this operation shouldn't be
3976 // considered undefined behavior. So, we can't unconditionally copy
3977 // arithmetic wrapping flags to NewI.
3978 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3979 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3980 NewI =
3981 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3982 ShrinkOperand(CI->getOperand(1)));
3983 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3984 NewI = B.CreateSelect(SI->getCondition(),
3985 ShrinkOperand(SI->getTrueValue()),
3986 ShrinkOperand(SI->getFalseValue()));
3987 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3988 switch (CI->getOpcode()) {
3989 default:
3990 llvm_unreachable("Unhandled cast!");
3991 case Instruction::Trunc:
3992 NewI = ShrinkOperand(CI->getOperand(0));
3993 break;
3994 case Instruction::SExt:
3995 NewI = B.CreateSExtOrTrunc(
3996 CI->getOperand(0),
3997 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3998 break;
3999 case Instruction::ZExt:
4000 NewI = B.CreateZExtOrTrunc(
4001 CI->getOperand(0),
4002 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4003 break;
4004 }
4005 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4006 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
4007 ->getNumElements();
4008 auto *O0 = B.CreateZExtOrTrunc(
4009 SI->getOperand(0),
4010 FixedVectorType::get(ScalarTruncatedTy, Elements0));
4011 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
4012 ->getNumElements();
4013 auto *O1 = B.CreateZExtOrTrunc(
4014 SI->getOperand(1),
4015 FixedVectorType::get(ScalarTruncatedTy, Elements1));
4016
4017 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4018 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4019 // Don't do anything with the operands, just extend the result.
4020 continue;
4021 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4022 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
4023 ->getNumElements();
4024 auto *O0 = B.CreateZExtOrTrunc(
4025 IE->getOperand(0),
4026 FixedVectorType::get(ScalarTruncatedTy, Elements));
4027 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4028 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4029 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4030 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
4031 ->getNumElements();
4032 auto *O0 = B.CreateZExtOrTrunc(
4033 EE->getOperand(0),
4034 FixedVectorType::get(ScalarTruncatedTy, Elements));
4035 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4036 } else {
4037 // If we don't know what to do, be conservative and don't do anything.
4038 continue;
4039 }
4040
4041 // Lastly, extend the result.
4042 NewI->takeName(cast<Instruction>(I));
4043 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4044 I->replaceAllUsesWith(Res);
4045 cast<Instruction>(I)->eraseFromParent();
4046 Erased.insert(I);
4047 State.reset(Def, Res, Part);
4048 }
4049 }
4050
4051 // We'll have created a bunch of ZExts that are now parentless. Clean up.
4052 for (const auto &KV : Cost->getMinimalBitwidths()) {
4053 // If the value wasn't vectorized, we must maintain the original scalar
4054 // type. The absence of the value from State indicates that it
4055 // wasn't vectorized.
4056 VPValue *Def = State.Plan->getVPValue(KV.first);
4057 if (!State.hasAnyVectorValue(Def))
4058 continue;
4059 for (unsigned Part = 0; Part < UF; ++Part) {
4060 Value *I = State.get(Def, Part);
4061 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4062 if (Inst && Inst->use_empty()) {
4063 Value *NewI = Inst->getOperand(0);
4064 Inst->eraseFromParent();
4065 State.reset(Def, NewI, Part);
4066 }
4067 }
4068 }
4069}
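// --- Illustrative sketch (not part of LoopVectorize.cpp). The rewrite above
// relies on a simple fact: when a value is known to fit in N bits, performing
// the operation in iN and zero-extending the result back reproduces the
// original wide result, so the narrow op plus ext/trunc pair is safe and
// InstCombine can later clean up the casts. A scalar model:
#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 200, B = 55; // values known (by analysis) to fit in 8 bits
  // Original: full-width i32 add.
  uint32_t Wide = A + B;
  // Shrunk: truncate the operands to i8, add, then zero-extend the result.
  uint8_t NarrowA = static_cast<uint8_t>(A);
  uint8_t NarrowB = static_cast<uint8_t>(B);
  uint32_t Reextended = static_cast<uint8_t>(NarrowA + NarrowB);
  assert(Wide == Reextended); // 255 either way
  return 0;
}
// --- end of illustrative sketch ---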
4070
4071void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4072 // Insert truncates and extends for any truncated instructions as hints to
4073 // InstCombine.
4074 if (VF.isVector())
4075 truncateToMinimalBitwidths(State);
4076
4077 // Fix widened non-induction PHIs by setting up the PHI operands.
4078 if (OrigPHIsToFix.size()) {
4079 assert(EnableVPlanNativePath &&
4080 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4081 fixNonInductionPHIs(State);
4082 }
4083
4084 // At this point every instruction in the original loop is widened to a
4085 // vector form. Now we need to fix the recurrences in the loop. These PHI
4086 // nodes are currently empty because we did not want to introduce cycles.
4087 // This is the second stage of vectorizing recurrences.
4088 fixCrossIterationPHIs(State);
4089
4090 // Forget the original basic block.
4091 PSE.getSE()->forgetLoop(OrigLoop);
4092
4093 // Fix-up external users of the induction variables.
4094 for (auto &Entry : Legal->getInductionVars())
4095 fixupIVUsers(Entry.first, Entry.second,
4096 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4097 IVEndValues[Entry.first], LoopMiddleBlock);
4098
4099 fixLCSSAPHIs(State);
4100 for (Instruction *PI : PredicatedInstructions)
4101 sinkScalarOperands(&*PI);
4102
4103 // Remove redundant induction instructions.
4104 cse(LoopVectorBody);
4105
4106 // Set/update profile weights for the vector and remainder loops as original
4107 // loop iterations are now distributed among them. Note that original loop
4108 // represented by LoopScalarBody becomes remainder loop after vectorization.
4109 //
4110 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4111 // end up with a slightly roughened result, but that should be OK since the
4112 // profile is not inherently precise anyway. Note also that a possible bypass
4113 // of the vector code caused by legality checks is ignored, optimistically
4114 // assigning all the weight to the vector loop.
4115 //
4116 // For scalable vectorization we can't know at compile time how many iterations
4117 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4118 // vscale of '1'.
4119 setProfileInfoAfterUnrolling(
4120 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4121 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4122}
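// --- Illustrative sketch (not part of LoopVectorize.cpp). The profile update
// above treats the vector loop as the original loop unrolled by
// VF.getKnownMinValue() * UF: if the original body was expected to execute N
// times, the vector body executes roughly N / (VF * UF) times and the scalar
// remainder covers what is left. Numbers below are made up.
#include <cassert>

int main() {
  unsigned OriginalTripCount = 1000;
  unsigned VF = 4, UF = 2;                 // scalable VF pessimistically assumes vscale = 1
  unsigned UnrollFactor = VF * UF;         // 8 original iterations per vector iteration
  unsigned VectorIterations = OriginalTripCount / UnrollFactor;    // 125
  unsigned RemainderIterations = OriginalTripCount % UnrollFactor; // 0
  assert(VectorIterations == 125 && RemainderIterations == 0);
  return 0;
}
// --- end of illustrative sketch ---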
4123
4124void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4125 // In order to support recurrences we need to be able to vectorize Phi nodes.
4126 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4127 // stage #2: We now need to fix the recurrences by adding incoming edges to
4128 // the currently empty PHI nodes. At this point every instruction in the
4129 // original loop is widened to a vector form so we can use them to construct
4130 // the incoming edges.
4131 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4132 for (VPRecipeBase &R : Header->phis()) {
4133 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4134 if (!PhiR)
4135 continue;
4136 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4137 if (PhiR->getRecurrenceDescriptor()) {
4138 fixReduction(PhiR, State);
4139 } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4140 fixFirstOrderRecurrence(OrigPhi, State);
4141 }
4142}
4143
4144void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4145 VPTransformState &State) {
4146 // This is the second phase of vectorizing first-order recurrences. An
4147 // overview of the transformation is described below. Suppose we have the
4148 // following loop.
4149 //
4150 // for (int i = 0; i < n; ++i)
4151 // b[i] = a[i] - a[i - 1];
4152 //
4153 // There is a first-order recurrence on "a". For this loop, the shorthand
4154 // scalar IR looks like:
4155 //
4156 // scalar.ph:
4157 // s_init = a[-1]
4158 // br scalar.body
4159 //
4160 // scalar.body:
4161 // i = phi [0, scalar.ph], [i+1, scalar.body]
4162 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4163 // s2 = a[i]
4164 // b[i] = s2 - s1
4165 // br cond, scalar.body, ...
4166 //
4167 // In this example, s1 is a recurrence because its value depends on the
4168 // previous iteration. In the first phase of vectorization, we created a
4169 // temporary value for s1. We now complete the vectorization and produce the
4170 // shorthand vector IR shown below (for VF = 4, UF = 1).
4171 //
4172 // vector.ph:
4173 // v_init = vector(..., ..., ..., a[-1])
4174 // br vector.body
4175 //
4176 // vector.body
4177 // i = phi [0, vector.ph], [i+4, vector.body]
4178 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4179 // v2 = a[i, i+1, i+2, i+3];
4180 // v3 = vector(v1(3), v2(0, 1, 2))
4181 // b[i, i+1, i+2, i+3] = v2 - v3
4182 // br cond, vector.body, middle.block
4183 //
4184 // middle.block:
4185 // x = v2(3)
4186 // br scalar.ph
4187 //
4188 // scalar.ph:
4189 // s_init = phi [x, middle.block], [a[-1], otherwise]
4190 // br scalar.body
4191 //
4192 // After the vector loop completes execution, we extract the next value of
4193 // the recurrence (x) to use as the initial value in the scalar loop.
4194
4195 // Get the original loop preheader and single loop latch.
4196 auto *Preheader = OrigLoop->getLoopPreheader();
4197 auto *Latch = OrigLoop->getLoopLatch();
4198
4199 // Get the initial and previous values of the scalar recurrence.
4200 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4201 auto *Previous = Phi->getIncomingValueForBlock(Latch);
4202
4203 auto *IdxTy = Builder.getInt32Ty();
4204 auto *One = ConstantInt::get(IdxTy, 1);
4205
4206 // Create a vector from the initial value.
4207 auto *VectorInit = ScalarInit;
4208 if (VF.isVector()) {
4209 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4210 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4211 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4212 VectorInit = Builder.CreateInsertElement(
4213 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)),
4214 VectorInit, LastIdx, "vector.recur.init");
4215 }
4216
4217 VPValue *PhiDef = State.Plan->getVPValue(Phi);
4218 VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4219 // We constructed a temporary phi node in the first phase of vectorization.
4220 // This phi node will eventually be deleted.
4221 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4222
4223 // Create a phi node for the new recurrence. The current value will either be
4224 // the initial value inserted into a vector or loop-varying vector value.
4225 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4226 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4227
4228 // Get the vectorized previous value of the last part UF - 1. It appears last
4229 // among all unrolled iterations, due to the order of their construction.
4230 Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4231
4232 // Find and set the insertion point after the previous value if it is an
4233 // instruction.
4234 BasicBlock::iterator InsertPt;
4235 // Note that the previous value may have been constant-folded so it is not
4236 // guaranteed to be an instruction in the vector loop.
4237 // FIXME: Loop invariant values do not form recurrences. We should deal with
4238 // them earlier.
4239 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4240 InsertPt = LoopVectorBody->getFirstInsertionPt();
4241 else {
4242 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4243 if (isa<PHINode>(PreviousLastPart))
4244 // If the previous value is a phi node, we should insert after all the phi
4245 // nodes in the block containing the PHI to avoid breaking basic block
4246 // verification. Note that the basic block may be different to
4247 // LoopVectorBody, in case we predicate the loop.
4248 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4249 else
4250 InsertPt = ++PreviousInst->getIterator();
4251 }
4252 Builder.SetInsertPoint(&*InsertPt);
4253
4254 // The vector from which to take the initial value for the current iteration
4255 // (actual or unrolled). Initially, this is the vector phi node.
4256 Value *Incoming = VecPhi;
4257
4258 // Shuffle the current and previous vector and update the vector parts.
4259 for (unsigned Part = 0; Part < UF; ++Part) {
4260 Value *PreviousPart = State.get(PreviousDef, Part);
4261 Value *PhiPart = State.get(PhiDef, Part);
4262 auto *Shuffle = VF.isVector()
4263 ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1)
4264 : Incoming;
4265 PhiPart->replaceAllUsesWith(Shuffle);
4266 cast<Instruction>(PhiPart)->eraseFromParent();
4267 State.reset(PhiDef, Shuffle, Part);
4268 Incoming = PreviousPart;
4269 }
4270
4271 // Fix the latch value of the new recurrence in the vector loop.
4272 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4273
4274 // Extract the last vector element in the middle block. This will be the
4275 // initial value for the recurrence when jumping to the scalar loop.
4276 auto *ExtractForScalar = Incoming;
4277 if (VF.isVector()) {
4278 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4279 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4280 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4281 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4282 "vector.recur.extract");
4283 }
4284 // Extract the second last element in the middle block if the
4285 // Phi is used outside the loop. We need to extract the phi itself
4286 // and not the last element (the phi update in the current iteration). This
4287 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4288 // when the scalar loop is not run at all.
4289 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4290 if (VF.isVector()) {
4291 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4292 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4293 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4294 Incoming, Idx, "vector.recur.extract.for.phi");
4295 } else if (UF > 1)
4296 // When loop is unrolled without vectorizing, initialize
4297 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4298 // of `Incoming`. This is analogous to the vectorized case above: extracting
4299 // the second last element when VF > 1.
4300 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4301
4302 // Fix the initial value of the original recurrence in the scalar loop.
4303 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4304 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4305 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4306 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4307 Start->addIncoming(Incoming, BB);
4308 }
4309
4310 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4311 Phi->setName("scalar.recur");
4312
4313 // Finally, fix users of the recurrence outside the loop. The users will need
4314 // either the last value of the scalar recurrence or the last value of the
4315 // vector recurrence we extracted in the middle block. Since the loop is in
4316 // LCSSA form, we just need to find all the phi nodes for the original scalar
4317 // recurrence in the exit block, and then add an edge for the middle block.
4318 // Note that LCSSA does not imply single entry when the original scalar loop
4319 // had multiple exiting edges (as we always run the last iteration in the
4320 // scalar epilogue); in that case, the exiting path through middle will be
4321 // dynamically dead and the value picked for the phi doesn't matter.
4322 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4323 if (any_of(LCSSAPhi.incoming_values(),
4324 [Phi](Value *V) { return V == Phi; }))
4325 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4326}
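// --- Illustrative sketch (not part of LoopVectorize.cpp). A self-contained
// scalar model of the splice performed above for VF = 4, UF = 1: each vector
// iteration forms v3 = <v1[3], v2[0], v2[1], v2[2]> so that b[i] = a[i] - a[i-1]
// can be computed as v2 - v3. Plain std::array values stand in for vectors and
// the data is made up.
#include <array>
#include <cassert>

int main() {
  constexpr int VF = 4;
  std::array<int, 9> a = {7, 1, 2, 4, 8, 16, 32, 64, 128}; // a[-1] stored at index 0
  std::array<int, 8> b{}, bScalar{};

  // Scalar reference: b[i] = a[i] - a[i-1].
  for (int i = 0; i < 8; ++i)
    bScalar[i] = a[i + 1] - a[i];

  // "Vectorized" loop: v1 carries the recurrence across iterations.
  std::array<int, VF> v1 = {0, 0, 0, a[0]}; // vector.recur.init: a[-1] in the last lane
  for (int i = 0; i < 8; i += VF) {
    std::array<int, VF> v2, v3;
    for (int L = 0; L < VF; ++L)
      v2[L] = a[i + 1 + L];   // wide load of a[i .. i+3]
    v3[0] = v1[VF - 1];       // splice: last lane of the previous vector...
    for (int L = 1; L < VF; ++L)
      v3[L] = v2[L - 1];      // ...followed by the first VF-1 lanes of v2
    for (int L = 0; L < VF; ++L)
      b[i + L] = v2[L] - v3[L];
    v1 = v2;                  // backedge value of the new recurrence phi
  }
  assert(b == bScalar);
  return 0;
}
// --- end of illustrative sketch ---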
4327
4328void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4329 VPTransformState &State) {
4330 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4331 // Get its reduction variable descriptor.
4332 assert(Legal->isReductionVariable(OrigPhi) &&
4333 "Unable to find the reduction variable");
4334 const RecurrenceDescriptor &RdxDesc = *PhiR->getRecurrenceDescriptor();
4335
4336 RecurKind RK = RdxDesc.getRecurrenceKind();
4337 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4338 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4339 setDebugLocFromInst(Builder, ReductionStartValue);
4340 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4341
4342 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4343 // This is the vector-clone of the value that leaves the loop.
4344 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4345
4346 // Wrap flags are in general invalid after vectorization, clear them.
4347 clearReductionWrapFlags(RdxDesc, State);
4348
4349 // Fix the vector-loop phi.
4350
4351 // Reductions do not have to start at zero. They can start with
4352 // any loop invariant values.
4353 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4354
4355 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4356 Cost->useOrderedReductions(RdxDesc);
4357
4358 for (unsigned Part = 0; Part < UF; ++Part) {
4359 if (IsOrdered && Part > 0)
4360 break;
4361 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4362 Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4363 if (IsOrdered)
4364 Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4365
4366 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4367 }
4368
4369 // Before each round, move the insertion point right between
4370 // the PHIs and the values we are going to write.
4371 // This allows us to write both PHINodes and the extractelement
4372 // instructions.
4373 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4374
4375 setDebugLocFromInst(Builder, LoopExitInst);
4376
4377 Type *PhiTy = OrigPhi->getType();
4378 // If tail is folded by masking, the vector value to leave the loop should be
4379 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4380 // instead of the former. For an inloop reduction the reduction will already
4381 // be predicated, and does not need to be handled here.
4382 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4383 for (unsigned Part = 0; Part < UF; ++Part) {
4384 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4385 Value *Sel = nullptr;
4386 for (User *U : VecLoopExitInst->users()) {
4387 if (isa<SelectInst>(U)) {
4388 assert(!Sel && "Reduction exit feeding two selects");
4389 Sel = U;
4390 } else
4391 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4392 }
4393 assert(Sel && "Reduction exit feeds no select");
4394 State.reset(LoopExitInstDef, Sel, Part);
4395
4396 // If the target can create a predicated operator for the reduction at no
4397 // extra cost in the loop (for example a predicated vadd), it can be
4398 // cheaper for the select to remain in the loop than be sunk out of it,
4399 // and so use the select value for the phi instead of the old
4400 // LoopExitValue.
4401 if (PreferPredicatedReductionSelect ||
4402 TTI->preferPredicatedReductionSelect(
4403 RdxDesc.getOpcode(), PhiTy,
4404 TargetTransformInfo::ReductionFlags())) {
4405 auto *VecRdxPhi =
4406 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4407 VecRdxPhi->setIncomingValueForBlock(
4408 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4409 }
4410 }
4411 }
4412
4413 // If the vector reduction can be performed in a smaller type, we truncate
4414 // then extend the loop exit value to enable InstCombine to evaluate the
4415 // entire expression in the smaller type.
4416 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4417 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4418 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4419 Builder.SetInsertPoint(
4420 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4421 VectorParts RdxParts(UF);
4422 for (unsigned Part = 0; Part < UF; ++Part) {
4423 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4424 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4425 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4426 : Builder.CreateZExt(Trunc, VecTy);
4427 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4428 UI != RdxParts[Part]->user_end();)
4429 if (*UI != Trunc) {
4430 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4431 RdxParts[Part] = Extnd;
4432 } else {
4433 ++UI;
4434 }
4435 }
4436 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4437 for (unsigned Part = 0; Part < UF; ++Part) {
4438 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4439 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4440 }
4441 }
4442
4443 // Reduce all of the unrolled parts into a single vector.
4444 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4445 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4446
4447 // The middle block terminator has already been assigned a DebugLoc here (the
4448 // OrigLoop's single latch terminator). We want the whole middle block to
4449 // appear to execute on this line because: (a) it is all compiler generated,
4450 // (b) these instructions are always executed after evaluating the latch
4451 // conditional branch, and (c) other passes may add new predecessors which
4452 // terminate on this line. This is the easiest way to ensure we don't
4453 // accidentally cause an extra step back into the loop while debugging.
4454 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4455 if (IsOrdered)
4456 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4457 else {
4458 // Floating-point operations should have some FMF to enable the reduction.
4459 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4460 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4461 for (unsigned Part = 1; Part < UF; ++Part) {
4462 Value *RdxPart = State.get(LoopExitInstDef, Part);
4463 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4464 ReducedPartRdx = Builder.CreateBinOp(
4465 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4466 } else {
4467 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4468 }
4469 }
4470 }
4471
4472 // Create the reduction after the loop. Note that inloop reductions create the
4473 // target reduction in the loop using a Reduction recipe.
4474 if (VF.isVector() && !IsInLoopReductionPhi) {
4475 ReducedPartRdx =
4476 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4477 // If the reduction can be performed in a smaller type, we need to extend
4478 // the reduction to the wider type before we branch to the original loop.
4479 if (PhiTy != RdxDesc.getRecurrenceType())
4480 ReducedPartRdx = RdxDesc.isSigned()
4481 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4482 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4483 }
4484
4485 // Create a phi node that merges control-flow from the backedge-taken check
4486 // block and the middle block.
4487 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4488 LoopScalarPreHeader->getTerminator());
4489 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4490 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4491 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4492
4493 // Now, we need to fix the users of the reduction variable
4494 // inside and outside of the scalar remainder loop.
4495
4496 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4497 // in the exit blocks. See comment on analogous loop in
4498 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4499 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4500 if (any_of(LCSSAPhi.incoming_values(),
4501 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4502 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4503
4504 // Fix the scalar loop reduction variable with the incoming reduction sum
4505 // from the vector body and from the backedge value.
4506 int IncomingEdgeBlockIdx =
4507 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4508 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4509 // Pick the other block.
4510 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4511 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4512 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4513}
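// --- Illustrative sketch (not part of LoopVectorize.cpp). A scalar model of
// the epilogue built above for an integer add reduction with VF = 4, UF = 2:
// the unrolled parts are first combined lane-wise ("bin.rdx"), the combined
// vector is then reduced horizontally in the middle block, and the reduction
// start value is folded in. In the real code the start value is seeded into
// the vector phi; this model adds it explicitly at the end, which gives the
// same sum for an add reduction. All numbers are made up.
#include <cassert>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  int StartValue = 100;                // loop-invariant start of the sum
  int Parts[UF][VF] = {{1, 2, 3, 4},   // per-part, per-lane partial sums
                       {5, 6, 7, 8}};

  // Reduce all of the unrolled parts into a single vector (bin.rdx).
  int ReducedPart[VF];
  for (unsigned L = 0; L < VF; ++L)
    ReducedPart[L] = Parts[0][L] + Parts[1][L];

  // Create the target reduction after the loop (horizontal add).
  int Rdx = 0;
  for (unsigned L = 0; L < VF; ++L)
    Rdx += ReducedPart[L];

  int Result = StartValue + Rdx;       // value merged in by bc.merge.rdx
  assert(Result == 100 + (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8));
  return 0;
}
// --- end of illustrative sketch ---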
4514
4515void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4516 VPTransformState &State) {
4517 RecurKind RK = RdxDesc.getRecurrenceKind();
4518 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4519 return;
4520
4521 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4522 assert(LoopExitInstr && "null loop exit instruction");
4523 SmallVector<Instruction *, 8> Worklist;
4524 SmallPtrSet<Instruction *, 8> Visited;
4525 Worklist.push_back(LoopExitInstr);
4526 Visited.insert(LoopExitInstr);
4527
4528 while (!Worklist.empty()) {
4529 Instruction *Cur = Worklist.pop_back_val();
4530 if (isa<OverflowingBinaryOperator>(Cur))
4531 for (unsigned Part = 0; Part < UF; ++Part) {
4532 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4533 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4534 }
4535
4536 for (User *U : Cur->users()) {
4537 Instruction *UI = cast<Instruction>(U);
4538 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4539 Visited.insert(UI).second)
4540 Worklist.push_back(UI);
4541 }
4542 }
4543}
4544
4545void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4546 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4547 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4548 // Some phis were already hand updated by the reduction and recurrence
4549 // code above, leave them alone.
4550 continue;
4551
4552 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4553 // Non-instruction incoming values will have only one value.
4554
4555 VPLane Lane = VPLane::getFirstLane();
4556 if (isa<Instruction>(IncomingValue) &&
4557 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4558 VF))
4559 Lane = VPLane::getLastLaneForVF(VF);
4560
4561 // Can be a loop invariant incoming value or the last scalar value to be
4562 // extracted from the vectorized loop.
4563 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4564 Value *lastIncomingValue =
4565 OrigLoop->isLoopInvariant(IncomingValue)
4566 ? IncomingValue
4567 : State.get(State.Plan->getVPValue(IncomingValue),
4568 VPIteration(UF - 1, Lane));
4569 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4570 }
4571}
4572
4573void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4574 // The basic block and loop containing the predicated instruction.
4575 auto *PredBB = PredInst->getParent();
4576 auto *VectorLoop = LI->getLoopFor(PredBB);
4577
4578 // Initialize a worklist with the operands of the predicated instruction.
4579 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4580
4581 // Holds instructions that we need to analyze again. An instruction may be
4582 // reanalyzed if we don't yet know if we can sink it or not.
4583 SmallVector<Instruction *, 8> InstsToReanalyze;
4584
4585 // Returns true if a given use occurs in the predicated block. Phi nodes use
4586 // their operands in their corresponding predecessor blocks.
4587 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4588 auto *I = cast<Instruction>(U.getUser());
4589 BasicBlock *BB = I->getParent();
4590 if (auto *Phi = dyn_cast<PHINode>(I))
4591 BB = Phi->getIncomingBlock(
4592 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4593 return BB == PredBB;
4594 };
4595
4596 // Iteratively sink the scalarized operands of the predicated instruction
4597 // into the block we created for it. When an instruction is sunk, its
4598 // operands are then added to the worklist. The algorithm ends after one pass
4599 // through the worklist doesn't sink a single instruction.
4600 bool Changed;
4601 do {
4602 // Add the instructions that need to be reanalyzed to the worklist, and
4603 // reset the changed indicator.
4604 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4605 InstsToReanalyze.clear();
4606 Changed = false;
4607
4608 while (!Worklist.empty()) {
4609 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4610
4611 // We can't sink an instruction if it is a phi node, is not in the loop,
4612 // or may have side effects.
4613 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4614 I->mayHaveSideEffects())
4615 continue;
4616
4617 // If the instruction is already in PredBB, check if we can sink its
4618 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4619 // sinking the scalar instruction I, hence it appears in PredBB; but it
4620 // may have failed to sink I's operands (recursively), which we try
4621 // (again) here.
4622 if (I->getParent() == PredBB) {
4623 Worklist.insert(I->op_begin(), I->op_end());
4624 continue;
4625 }
4626
4627 // It's legal to sink the instruction if all its uses occur in the
4628 // predicated block. Otherwise, there's nothing to do yet, and we may
4629 // need to reanalyze the instruction.
4630 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4631 InstsToReanalyze.push_back(I);
4632 continue;
4633 }
4634
4635 // Move the instruction to the beginning of the predicated block, and add
4636 // its operands to the worklist.
4637 I->moveBefore(&*PredBB->getFirstInsertionPt());
4638 Worklist.insert(I->op_begin(), I->op_end());
4639
4640 // The sinking may have enabled other instructions to be sunk, so we will
4641 // need to iterate.
4642 Changed = true;
4643 }
4644 } while (Changed);
4645}
4646
4647void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4648 for (PHINode *OrigPhi : OrigPHIsToFix) {
4649 VPWidenPHIRecipe *VPPhi =
4650 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4651 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4652 // Make sure the builder has a valid insert point.
4653 Builder.SetInsertPoint(NewPhi);
4654 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4655 VPValue *Inc = VPPhi->getIncomingValue(i);
4656 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4657 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4658 }
4659 }
4660}
4661
4662bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4663 return Cost->useOrderedReductions(RdxDesc);
4664}
4665
4666void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4667 VPUser &Operands, unsigned UF,
4668 ElementCount VF, bool IsPtrLoopInvariant,
4669 SmallBitVector &IsIndexLoopInvariant,
4670 VPTransformState &State) {
4671 // Construct a vector GEP by widening the operands of the scalar GEP as
4672 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4673 // results in a vector of pointers when at least one operand of the GEP
4674 // is vector-typed. Thus, to keep the representation compact, we only use
4675 // vector-typed operands for loop-varying values.
4676
4677 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4678 // If we are vectorizing, but the GEP has only loop-invariant operands,
4679 // the GEP we build (by only using vector-typed operands for
4680 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4681 // produce a vector of pointers, we need to either arbitrarily pick an
4682 // operand to broadcast, or broadcast a clone of the original GEP.
4683 // Here, we broadcast a clone of the original.
4684 //
4685 // TODO: If at some point we decide to scalarize instructions having
4686 // loop-invariant operands, this special case will no longer be
4687 // required. We would add the scalarization decision to
4688 // collectLoopScalars() and teach getVectorValue() to broadcast
4689 // the lane-zero scalar value.
4690 auto *Clone = Builder.Insert(GEP->clone());
4691 for (unsigned Part = 0; Part < UF; ++Part) {
4692 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4693 State.set(VPDef, EntryPart, Part);
4694 addMetadata(EntryPart, GEP);
4695 }
4696 } else {
4697 // If the GEP has at least one loop-varying operand, we are sure to
4698 // produce a vector of pointers. But if we are only unrolling, we want
4699 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4700 // produce with the code below will be scalar (if VF == 1) or vector
4701 // (otherwise). Note that for the unroll-only case, we still maintain
4702 // values in the vector mapping with initVector, as we do for other
4703 // instructions.
4704 for (unsigned Part = 0; Part < UF; ++Part) {
4705 // The pointer operand of the new GEP. If it's loop-invariant, we
4706 // won't broadcast it.
4707 auto *Ptr = IsPtrLoopInvariant
4708 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4709 : State.get(Operands.getOperand(0), Part);
4710
4711 // Collect all the indices for the new GEP. If any index is
4712 // loop-invariant, we won't broadcast it.
4713 SmallVector<Value *, 4> Indices;
4714 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4715 VPValue *Operand = Operands.getOperand(I);
4716 if (IsIndexLoopInvariant[I - 1])
4717 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4718 else
4719 Indices.push_back(State.get(Operand, Part));
4720 }
4721
4722 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4723 // but it should be a vector, otherwise.
4724 auto *NewGEP =
4725 GEP->isInBounds()
4726 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4727 Indices)
4728 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4729 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4730 "NewGEP is not a pointer vector");
4731 State.set(VPDef, NewGEP, Part);
4732 addMetadata(NewGEP, GEP);
4733 }
4734 }
4735}
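// --- Illustrative sketch (not part of LoopVectorize.cpp). The two cases above
// in plain C++: a GEP with a loop-varying index turns into one address per
// lane, while a fully loop-invariant GEP is computed once as a scalar address
// and broadcast so the result is still a vector of pointers. Data is made up.
#include <array>
#include <cassert>

int main() {
  constexpr unsigned VF = 4;
  int A[64] = {};
  unsigned i = 8; // scalar induction value for this vector iteration

  // Loop-varying index: one address per lane, &A[i + Lane].
  std::array<int *, VF> VaryingGEP;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    VaryingGEP[Lane] = &A[i + Lane];
  assert(VaryingGEP[3] == &A[11]);

  // Fully loop-invariant GEP: compute the address once, then broadcast it.
  int *InvariantAddr = &A[5];
  std::array<int *, VF> BroadcastGEP;
  BroadcastGEP.fill(InvariantAddr);
  assert(BroadcastGEP[0] == BroadcastGEP[VF - 1]);
  return 0;
}
// --- end of illustrative sketch ---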
4736
4737void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4738 RecurrenceDescriptor *RdxDesc,
4739 VPWidenPHIRecipe *PhiR,
4740 VPTransformState &State) {
4741 PHINode *P = cast<PHINode>(PN);
4742 if (EnableVPlanNativePath) {
4743 // Currently we enter here in the VPlan-native path for non-induction
4744 // PHIs where all control flow is uniform. We simply widen these PHIs.
4745 // Create a vector phi with no operands - the vector phi operands will be
4746 // set at the end of vector code generation.
4747 Type *VecTy = (State.VF.isScalar())
4748 ? PN->getType()
4749 : VectorType::get(PN->getType(), State.VF);
4750 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4751 State.set(PhiR, VecPhi, 0);
4752 OrigPHIsToFix.push_back(P);
4753
4754 return;
4755 }
4756
4757 assert(PN->getParent() == OrigLoop->getHeader() &&
4758 "Non-header phis should have been handled elsewhere");
4759
4760 VPValue *StartVPV = PhiR->getStartValue();
4761 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4762 // In order to support recurrences we need to be able to vectorize Phi nodes.
4763 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4764 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4765 // this value when we vectorize all of the instructions that use the PHI.
4766 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4767 Value *Iden = nullptr;
4768 bool ScalarPHI =
4769 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4770 Type *VecTy =
4771 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4772
4773 if (RdxDesc) {
4774 assert(Legal->isReductionVariable(P) && StartV &&
4775 "RdxDesc should only be set for reduction variables; in that case "
4776 "a StartV is also required");
4777 RecurKind RK = RdxDesc->getRecurrenceKind();
4778 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4779 // MinMax reductions have the start value as their identity.
4780 if (ScalarPHI) {
4781 Iden = StartV;
4782 } else {
4783 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4784 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4785 StartV = Iden =
4786 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4787 }
4788 } else {
4789 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4790 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4791 Iden = IdenC;
4792
4793 if (!ScalarPHI) {
4794 Iden = ConstantVector::getSplat(State.VF, IdenC);
4795 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4796 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4797 Constant *Zero = Builder.getInt32(0);
4798 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4799 }
4800 }
4801 }
4802
4803 bool IsOrdered = State.VF.isVector() &&
4804 Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4805 Cost->useOrderedReductions(*RdxDesc);
4806
4807 for (unsigned Part = 0; Part < State.UF; ++Part) {
4808 // This is phase one of vectorizing PHIs.
4809 if (Part > 0 && IsOrdered)
4810 return;
4811 Value *EntryPart = PHINode::Create(
4812 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4813 State.set(PhiR, EntryPart, Part);
4814 if (StartV) {
4815 // Make sure to add the reduction start value only to the
4816 // first unroll part.
4817 Value *StartVal = (Part == 0) ? StartV : Iden;
4818 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4819 }
4820 }
4821 return;
4822 }
4823
4824 assert(!Legal->isReductionVariable(P) &&
4825 "reductions should be handled above");
4826
4827 setDebugLocFromInst(Builder, P);
4828
4829 // This PHINode must be an induction variable.
4830 // Make sure that we know about it.
4831 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4832
4833 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4834 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4835
4836 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4837 // which can be found from the original scalar operations.
4838 switch (II.getKind()) {
4839 case InductionDescriptor::IK_NoInduction:
4840 llvm_unreachable("Unknown induction");
4841 case InductionDescriptor::IK_IntInduction:
4842 case InductionDescriptor::IK_FpInduction:
4843 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4844 case InductionDescriptor::IK_PtrInduction: {
4845 // Handle the pointer induction variable case.
4846 assert(P->getType()->isPointerTy() && "Unexpected type.");
4847
4848 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4849 // This is the normalized GEP that starts counting at zero.
4850 Value *PtrInd =
4851 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4852 // Determine the number of scalars we need to generate for each unroll
4853 // iteration. If the instruction is uniform, we only need to generate the
4854 // first lane. Otherwise, we generate all VF values.
4855 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4856 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4857
4858 bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4859 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4860 if (NeedsVectorIndex) {
4861 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4862 UnitStepVec = Builder.CreateStepVector(VecIVTy);
4863 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4864 }
4865
4866 for (unsigned Part = 0; Part < UF; ++Part) {
4867 Value *PartStart = createStepForVF(
4868 Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4869
4870 if (NeedsVectorIndex) {
4871 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4872 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4873 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4874 Value *SclrGep =
4875 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4876 SclrGep->setName("next.gep");
4877 State.set(PhiR, SclrGep, Part);
4878 // We've cached the whole vector, which means we can support the
4879 // extraction of any lane.
4880 continue;
4881 }
4882
4883 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4884 Value *Idx = Builder.CreateAdd(
4885 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4886 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4887 Value *SclrGep =
4888 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4889 SclrGep->setName("next.gep");
4890 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4891 }
4892 }
4893 return;
4894 }
4895 assert(isa<SCEVConstant>(II.getStep()) &&
4896 "Induction step not a SCEV constant!");
4897 Type *PhiType = II.getStep()->getType();
4898
4899 // Build a pointer phi
4900 Value *ScalarStartValue = II.getStartValue();
4901 Type *ScStValueType = ScalarStartValue->getType();
4902 PHINode *NewPointerPhi =
4903 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4904 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4905
4906 // A pointer induction, performed by using a gep
4907 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4908 Instruction *InductionLoc = LoopLatch->getTerminator();
4909 const SCEV *ScalarStep = II.getStep();
4910 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4911 Value *ScalarStepValue =
4912 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4913 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4914 Value *NumUnrolledElems =
4915 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4916 Value *InductionGEP = GetElementPtrInst::Create(
4917 ScStValueType->getPointerElementType(), NewPointerPhi,
4918 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4919 InductionLoc);
4920 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4921
4922 // Create UF many actual address geps that use the pointer
4923 // phi as base and a vectorized version of the step value
4924 // (<step*0, ..., step*N>) as offset.
4925 for (unsigned Part = 0; Part < State.UF; ++Part) {
4926 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4927 Value *StartOffsetScalar =
4928 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4929 Value *StartOffset =
4930 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4931 // Create a vector of consecutive numbers from zero to VF.
4932 StartOffset =
4933 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4934
4935 Value *GEP = Builder.CreateGEP(
4936 ScStValueType->getPointerElementType(), NewPointerPhi,
4937 Builder.CreateMul(
4938 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4939 "vector.gep"));
4940 State.set(PhiR, GEP, Part);
4941 }
4942 }
4943 }
4944}
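// --- Illustrative sketch (not part of LoopVectorize.cpp). For the scalarized
// pointer-induction case above, the address produced for a given (Part, Lane)
// corresponds to applying the transformed index
// Induction + Part * VF + Lane (scaled by the step) to the start pointer.
// A unit-stride model with made-up values:
#include <cassert>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  int Buffer[64] = {};
  int *Start = Buffer;   // induction start value
  long Step = 1;         // induction step (a compile-time constant here)
  long Induction = 8;    // normalized induction value entering this iteration

  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      long Idx = Induction + static_cast<long>(Part * VF + Lane);
      int *NextGep = Start + Idx * Step; // the "next.gep" address for this lane
      assert(NextGep == &Buffer[Idx]);
    }
  return 0;
}
// --- end of illustrative sketch ---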
4945
4946/// A helper function for checking whether an integer division-related
4947/// instruction may divide by zero (in which case it must be predicated if
4948/// executed conditionally in the scalar code).
4949/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4950/// Non-zero divisors that are non compile-time constants will not be
4951/// converted into multiplication, so we will still end up scalarizing
4952/// the division, but can do so w/o predication.
4953static bool mayDivideByZero(Instruction &I) {
4954 assert((I.getOpcode() == Instruction::UDiv ||
4955 I.getOpcode() == Instruction::SDiv ||
4956 I.getOpcode() == Instruction::URem ||
4957 I.getOpcode() == Instruction::SRem) &&
4958 "Unexpected instruction");
4959 Value *Divisor = I.getOperand(1);
4960 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4961 return !CInt || CInt->isZero();
4962}
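
[Editor's aside, not part of LoopVectorize.cpp] The check above treats a divisor as possibly zero unless it is a ConstantInt known to be non-zero; any runtime-valued divisor therefore forces predication when the division executes conditionally. Below is a minimal standalone sketch of the same test, assuming it is compiled against the LLVM C++ API used in this file; the test IR and the function name @f are made up for illustration.

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  // %q0 has a non-zero constant divisor; %q1's divisor is a runtime value.
  std::unique_ptr<Module> M = parseAssemblyString(
      "define i32 @f(i32 %a, i32 %b) {\n"
      "  %q0 = udiv i32 %a, 4\n"
      "  %q1 = udiv i32 %a, %b\n"
      "  ret i32 %q0\n"
      "}\n",
      Err, Ctx);
  if (!M)
    return 1;
  for (Instruction &I : M->getFunction("f")->getEntryBlock()) {
    if (I.getOpcode() != Instruction::UDiv)
      continue;
    // Mirrors mayDivideByZero(): non-constant or zero constant => may be zero.
    auto *CInt = dyn_cast<ConstantInt>(I.getOperand(1));
    bool MayBeZero = !CInt || CInt->isZero();
    errs() << I << "  -> may divide by zero: "
           << (MayBeZero ? "true" : "false") << "\n";
  }
  return 0;
}

Here %q0 reports false and %q1 reports true, matching the predication decision made in isScalarWithPredication() further down.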
4963
4964void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4965 VPUser &User,
4966 VPTransformState &State) {
4967 switch (I.getOpcode()) {
4968 case Instruction::Call:
4969 case Instruction::Br:
4970 case Instruction::PHI:
4971 case Instruction::GetElementPtr:
4972 case Instruction::Select:
4973     llvm_unreachable("This instruction is handled by a different recipe.");
4974 case Instruction::UDiv:
4975 case Instruction::SDiv:
4976 case Instruction::SRem:
4977 case Instruction::URem:
4978 case Instruction::Add:
4979 case Instruction::FAdd:
4980 case Instruction::Sub:
4981 case Instruction::FSub:
4982 case Instruction::FNeg:
4983 case Instruction::Mul:
4984 case Instruction::FMul:
4985 case Instruction::FDiv:
4986 case Instruction::FRem:
4987 case Instruction::Shl:
4988 case Instruction::LShr:
4989 case Instruction::AShr:
4990 case Instruction::And:
4991 case Instruction::Or:
4992 case Instruction::Xor: {
4993 // Just widen unops and binops.
4994 setDebugLocFromInst(Builder, &I);
4995
4996 for (unsigned Part = 0; Part < UF; ++Part) {
4997 SmallVector<Value *, 2> Ops;
4998 for (VPValue *VPOp : User.operands())
4999 Ops.push_back(State.get(VPOp, Part));
5000
5001 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
5002
5003 if (auto *VecOp = dyn_cast<Instruction>(V))
5004 VecOp->copyIRFlags(&I);
5005
5006 // Use this vector value for all users of the original instruction.
5007 State.set(Def, V, Part);
5008 addMetadata(V, &I);
5009 }
5010
5011 break;
5012 }
5013 case Instruction::ICmp:
5014 case Instruction::FCmp: {
5015 // Widen compares. Generate vector compares.
5016 bool FCmp = (I.getOpcode() == Instruction::FCmp);
5017 auto *Cmp = cast<CmpInst>(&I);
5018 setDebugLocFromInst(Builder, Cmp);
5019 for (unsigned Part = 0; Part < UF; ++Part) {
5020 Value *A = State.get(User.getOperand(0), Part);
5021 Value *B = State.get(User.getOperand(1), Part);
5022 Value *C = nullptr;
5023 if (FCmp) {
5024 // Propagate fast math flags.
5025 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
5026 Builder.setFastMathFlags(Cmp->getFastMathFlags());
5027 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
5028 } else {
5029 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
5030 }
5031 State.set(Def, C, Part);
5032 addMetadata(C, &I);
5033 }
5034
5035 break;
5036 }
5037
5038 case Instruction::ZExt:
5039 case Instruction::SExt:
5040 case Instruction::FPToUI:
5041 case Instruction::FPToSI:
5042 case Instruction::FPExt:
5043 case Instruction::PtrToInt:
5044 case Instruction::IntToPtr:
5045 case Instruction::SIToFP:
5046 case Instruction::UIToFP:
5047 case Instruction::Trunc:
5048 case Instruction::FPTrunc:
5049 case Instruction::BitCast: {
5050 auto *CI = cast<CastInst>(&I);
5051 setDebugLocFromInst(Builder, CI);
5052
5053 /// Vectorize casts.
5054 Type *DestTy =
5055 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
5056
5057 for (unsigned Part = 0; Part < UF; ++Part) {
5058 Value *A = State.get(User.getOperand(0), Part);
5059 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
5060 State.set(Def, Cast, Part);
5061 addMetadata(Cast, &I);
5062 }
5063 break;
5064 }
5065 default:
5066 // This instruction is not vectorized by simple widening.
5067     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
5068     llvm_unreachable("Unhandled instruction!");
5069 } // end of switch.
5070}
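
[Editor's aside, not part of LoopVectorize.cpp] For the "just widen unops and binops" case above, widening keeps the scalar opcode and re-emits the operation through IRBuilder::CreateNAryOp on vector operands, once per unroll part. A minimal sketch under that assumption; the module name, the function name widened_add, and VF=4 are chosen purely for illustration.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("widen-demo", Ctx);
  // A VF=4 widening of a scalar i32 add: the operands are already vectors.
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  auto *FTy = FunctionType::get(VecTy, {VecTy, VecTy}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "widened_add", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(BB);
  // Same opcode as the scalar instruction, applied to whole-vector operands,
  // just as in the per-part loop above.
  Value *V = Builder.CreateNAryOp(Instruction::Add,
                                  {F->getArg(0), F->getArg(1)}, "widened");
  Builder.CreateRet(V);
  M.print(errs(), nullptr);
  return verifyModule(M, &errs()) ? 1 : 0;
}

The emitted body is a single add <4 x i32>; in the real code path, flags and metadata from the scalar instruction are then copied onto it via copyIRFlags() and addMetadata() as shown above.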
5071
5072void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
5073 VPUser &ArgOperands,
5074 VPTransformState &State) {
5075   assert(!isa<DbgInfoIntrinsic>(I) &&
5076          "DbgInfoIntrinsic should have been dropped during VPlan construction");
5077 setDebugLocFromInst(Builder, &I);
5078
5079 Module *M = I.getParent()->getParent()->getParent();
5080 auto *CI = cast<CallInst>(&I);
5081
5082 SmallVector<Type *, 4> Tys;
5083 for (Value *ArgOperand : CI->arg_operands())
5084 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
5085
5086 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5087
5088   // The flag indicates whether we use an intrinsic or a regular call for the
5089   // vectorized version of the instruction:
5090   // is an intrinsic call more beneficial than a library call?
5091 bool NeedToScalarize = false;
5092 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
5093 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
5094 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
5095   assert((UseVectorIntrinsic || !NeedToScalarize) &&
5096          "Instruction should be scalarized elsewhere.");
5097   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
5098          "Either the intrinsic cost or vector call cost must be valid");
5099
5100 for (unsigned Part = 0; Part < UF; ++Part) {
5101 SmallVector<Value *, 4> Args;
5102 for (auto &I : enumerate(ArgOperands.operands())) {
5103 // Some intrinsics have a scalar argument - don't replace it with a
5104 // vector.
5105 Value *Arg;
5106 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5107 Arg = State.get(I.value(), Part);
5108 else
5109 Arg = State.get(I.value(), VPIteration(0, 0));
5110 Args.push_back(Arg);
5111 }
5112
5113 Function *VectorF;
5114 if (UseVectorIntrinsic) {
5115 // Use vector version of the intrinsic.
5116 Type *TysForDecl[] = {CI->getType()};
5117 if (VF.isVector())
5118 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5119 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5120       assert(VectorF && "Can't retrieve vector intrinsic.");
5121 } else {
5122 // Use vector version of the function call.
5123 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5124#ifndef NDEBUG
5125       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5126              "Can't create vector function.");
5127#endif
5128 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5129 }
5130 SmallVector<OperandBundleDef, 1> OpBundles;
5131 CI->getOperandBundlesAsDefs(OpBundles);
5132 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5133
5134 if (isa<FPMathOperator>(V))
5135 V->copyFastMathFlags(CI);
5136
5137 State.set(Def, V, Part);
5138 addMetadata(V, &I);
5139 }
5140}
5141
5142void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5143 VPUser &Operands,
5144 bool InvariantCond,
5145 VPTransformState &State) {
5146 setDebugLocFromInst(Builder, &I);
5147
5148 // The condition can be loop invariant but still defined inside the
5149 // loop. This means that we can't just use the original 'cond' value.
5150 // We have to take the 'vectorized' value and pick the first lane.
5151 // Instcombine will make this a no-op.
5152 auto *InvarCond = InvariantCond
5153 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5154 : nullptr;
5155
5156 for (unsigned Part = 0; Part < UF; ++Part) {
5157 Value *Cond =
5158 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5159 Value *Op0 = State.get(Operands.getOperand(1), Part);
5160 Value *Op1 = State.get(Operands.getOperand(2), Part);
5161 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5162 State.set(VPDef, Sel, Part);
5163 addMetadata(Sel, &I);
5164 }
5165}
5166
5167void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5168 // We should not collect Scalars more than once per VF. Right now, this
5169 // function is called from collectUniformsAndScalars(), which already does
5170 // this check. Collecting Scalars for VF=1 does not make any sense.
5171   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5172          "This function should not be visited twice for the same VF");
5173
5174 SmallSetVector<Instruction *, 8> Worklist;
5175
5176 // These sets are used to seed the analysis with pointers used by memory
5177 // accesses that will remain scalar.
5178 SmallSetVector<Instruction *, 8> ScalarPtrs;
5179 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5180 auto *Latch = TheLoop->getLoopLatch();
5181
5182 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5183 // The pointer operands of loads and stores will be scalar as long as the
5184 // memory access is not a gather or scatter operation. The value operand of a
5185 // store will remain scalar if the store is scalarized.
5186 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5187 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5188     assert(WideningDecision != CM_Unknown &&
5189            "Widening decision should be ready at this moment");
5190 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5191 if (Ptr == Store->getValueOperand())
5192 return WideningDecision == CM_Scalarize;
5193     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5194            "Ptr is neither a value or pointer operand");
5195 return WideningDecision != CM_GatherScatter;
5196 };
5197
5198 // A helper that returns true if the given value is a bitcast or
5199 // getelementptr instruction contained in the loop.
5200 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5201 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5202 isa<GetElementPtrInst>(V)) &&
5203 !TheLoop->isLoopInvariant(V);
5204 };
5205
5206 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5207 if (!isa<PHINode>(Ptr) ||
5208 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5209 return false;
5210 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5211 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5212 return false;
5213 return isScalarUse(MemAccess, Ptr);
5214 };
5215
5216 // A helper that evaluates a memory access's use of a pointer. If the
5217   // pointer is actually the pointer induction of a loop, it is inserted
5218   // into the Worklist. If the use will be a scalar use, and the
5219 // pointer is only used by memory accesses, we place the pointer in
5220 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5221 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5222 if (isScalarPtrInduction(MemAccess, Ptr)) {
5223 Worklist.insert(cast<Instruction>(Ptr));
5224 Instruction *Update = cast<Instruction>(
5225 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5226 Worklist.insert(Update);
5227       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5228                         << "\n");
5229       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5230                         << "\n");
5231 return;
5232 }
5233 // We only care about bitcast and getelementptr instructions contained in
5234 // the loop.
5235 if (!isLoopVaryingBitCastOrGEP(Ptr))
5236 return;
5237
5238 // If the pointer has already been identified as scalar (e.g., if it was
5239 // also identified as uniform), there's nothing to do.
5240 auto *I = cast<Instruction>(Ptr);
5241 if (Worklist.count(I))
5242 return;
5243
5244 // If the use of the pointer will be a scalar use, and all users of the
5245 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5246 // place the pointer in PossibleNonScalarPtrs.
5247 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5248 return isa<LoadInst>(U) || isa<StoreInst>(U);
5249 }))
5250 ScalarPtrs.insert(I);
5251 else
5252 PossibleNonScalarPtrs.insert(I);
5253 };
5254
5255   // We seed the scalars analysis with two kinds of instructions: (1)
5256 // instructions marked uniform-after-vectorization and (2) bitcast,
5257 // getelementptr and (pointer) phi instructions used by memory accesses
5258 // requiring a scalar use.
5259 //
5260 // (1) Add to the worklist all instructions that have been identified as
5261 // uniform-after-vectorization.
5262 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5263
5264 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5265 // memory accesses requiring a scalar use. The pointer operands of loads and
5266   // memory accesses requiring a scalar use. The pointer operands of loads and
5267   // stores will be scalar as long as the memory access is not a gather or
5267 // scatter operation. The value operand of a store will remain scalar if the
5268 // store is scalarized.
5269 for (auto *BB : TheLoop->blocks())
5270 for (auto &I : *BB) {
5271 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5272 evaluatePtrUse(Load, Load->getPointerOperand());
5273 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5274 evaluatePtrUse(Store, Store->getPointerOperand());
5275 evaluatePtrUse(Store, Store->getValueOperand());
5276 }
5277 }
5278 for (auto *I : ScalarPtrs)
5279 if (!PossibleNonScalarPtrs.count(I)) {
5280       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5281 Worklist.insert(I);
5282 }
5283
5284 // Insert the forced scalars.
5285 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5286 // induction variable when the PHI user is scalarized.
5287 auto ForcedScalar = ForcedScalars.find(VF);
5288 if (ForcedScalar != ForcedScalars.end())
5289 for (auto *I : ForcedScalar->second)
5290 Worklist.insert(I);
5291
5292 // Expand the worklist by looking through any bitcasts and getelementptr
5293 // instructions we've already identified as scalar. This is similar to the
5294 // expansion step in collectLoopUniforms(); however, here we're only
5295 // expanding to include additional bitcasts and getelementptr instructions.
5296 unsigned Idx = 0;
5297 while (Idx != Worklist.size()) {
5298 Instruction *Dst = Worklist[Idx++];
5299 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5300 continue;
5301 auto *Src = cast<Instruction>(Dst->getOperand(0));
5302 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5303 auto *J = cast<Instruction>(U);
5304 return !TheLoop->contains(J) || Worklist.count(J) ||
5305 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5306 isScalarUse(J, Src));
5307 })) {
5308 Worklist.insert(Src);
5309       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5310 }
5311 }
5312
5313 // An induction variable will remain scalar if all users of the induction
5314 // variable and induction variable update remain scalar.
5315 for (auto &Induction : Legal->getInductionVars()) {
5316 auto *Ind = Induction.first;
5317 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5318
5319 // If tail-folding is applied, the primary induction variable will be used
5320 // to feed a vector compare.
5321 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5322 continue;
5323
5324 // Determine if all users of the induction variable are scalar after
5325 // vectorization.
5326 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5327 auto *I = cast<Instruction>(U);
5328 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5329 });
5330 if (!ScalarInd)
5331 continue;
5332
5333 // Determine if all users of the induction variable update instruction are
5334 // scalar after vectorization.
5335 auto ScalarIndUpdate =
5336 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5337 auto *I = cast<Instruction>(U);
5338 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5339 });
5340 if (!ScalarIndUpdate)
5341 continue;
5342
5343 // The induction variable and its update instruction will remain scalar.
5344 Worklist.insert(Ind);
5345 Worklist.insert(IndUpdate);
5346     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5347     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5348                       << "\n");
5349 }
5350
5351 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5352}
5353
5354bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5355 if (!blockNeedsPredication(I->getParent()))
5356 return false;
5357 switch(I->getOpcode()) {
5358 default:
5359 break;
5360 case Instruction::Load:
5361 case Instruction::Store: {
5362 if (!Legal->isMaskRequired(I))
5363 return false;
5364 auto *Ptr = getLoadStorePointerOperand(I);
5365 auto *Ty = getLoadStoreType(I);
5366 const Align Alignment = getLoadStoreAlignment(I);
5367 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5368 TTI.isLegalMaskedGather(Ty, Alignment))
5369 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5370 TTI.isLegalMaskedScatter(Ty, Alignment));
5371 }
5372 case Instruction::UDiv:
5373 case Instruction::SDiv:
5374 case Instruction::SRem:
5375 case Instruction::URem:
5376 return mayDivideByZero(*I);
5377 }
5378 return false;
5379}
5380
5381bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5382 Instruction *I, ElementCount VF) {
5383   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5384   assert(getWideningDecision(I, VF) == CM_Unknown &&
5385          "Decision should not be set yet.");
5386 auto *Group = getInterleavedAccessGroup(I);
5387   assert(Group && "Must have a group.");
5388
5389   // If the instruction's allocated size doesn't equal its type size, it
5390 // requires padding and will be scalarized.
5391 auto &DL = I->getModule()->getDataLayout();
5392 auto *ScalarTy = getLoadStoreType(I);
5393 if (hasIrregularType(ScalarTy, DL))
5394 return false;
5395
5396 // Check if masking is required.
5397 // A Group may need masking for one of two reasons: it resides in a block that
5398 // needs predication, or it was decided to use masking to deal with gaps.
5399 bool PredicatedAccessRequiresMasking =
5400 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5401 bool AccessWithGapsRequiresMasking =
5402 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5403 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5404 return true;
5405
5406 // If masked interleaving is required, we expect that the user/target had
5407 // enabled it, because otherwise it either wouldn't have been created or
5408 // it should have been invalidated by the CostModel.
5409   assert(useMaskedInterleavedAccesses(TTI) &&
5410          "Masked interleave-groups for predicated accesses are not enabled.");
5411
5412 auto *Ty = getLoadStoreType(I);
5413 const Align Alignment = getLoadStoreAlignment(I);
5414 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5415 : TTI.isLegalMaskedStore(Ty, Alignment);
5416}
5417
5418bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5419 Instruction *I, ElementCount VF) {
5420 // Get and ensure we have a valid memory instruction.
5421 LoadInst *LI = dyn_cast<LoadInst>(I);
5422 StoreInst *SI = dyn_cast<StoreInst>(I);
5423   assert((LI || SI) && "Invalid memory instruction");
5424
5425 auto *Ptr = getLoadStorePointerOperand(I);
5426
5427 // In order to be widened, the pointer should be consecutive, first of all.
5428 if (!Legal->isConsecutivePtr(Ptr))
5429 return false;
5430
5431 // If the instruction is a store located in a predicated block, it will be
5432 // scalarized.
5433 if (isScalarWithPredication(I))
5434 return false;
5435
5436   // If the instruction's allocated size doesn't equal its type size, it
5437 // requires padding and will be scalarized.
5438 auto &DL = I->getModule()->getDataLayout();
5439 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5440 if (hasIrregularType(ScalarTy, DL))
5441 return false;
5442
5443 return true;
5444}
5445
5446void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5447 // We should not collect Uniforms more than once per VF. Right now,
5448 // this function is called from collectUniformsAndScalars(), which
5449 // already does this check. Collecting Uniforms for VF=1 does not make any
5450 // sense.
5451
5452   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5453          "This function should not be visited twice for the same VF");
5454
5455 // Visit the list of Uniforms. If we'll not find any uniform value, we'll