Bug Summary

File: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 7270, column 35
Potential leak of memory pointed to by 'BlockMask'
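
The flagged allocation itself sits at line 7270, which is outside the excerpt reproduced below. As a rough illustration only (the names and control flow here are hypothetical and not taken from LoopVectorize.cpp), this is the general shape of the pattern the analyzer's C++ new/delete leak checking reports: a heap allocation whose owning pointer can go out of scope on some path without being freed or handed off.

// Hypothetical sketch of the reported pattern; 'Mask' and 'buildMask' are
// illustrative names, not code from LoopVectorize.cpp.
#include <memory>

struct Mask { unsigned Bits = 0; };

Mask *buildMask(bool EdgeIsTrivial) {
  Mask *BlockMask = new Mask();   // allocation tracked by the analyzer
  if (EdgeIsTrivial)
    return nullptr;               // 'BlockMask' is still owned here -> leak
  return BlockMask;               // ownership escapes to the caller
}

// A leak-free variant hands ownership to std::unique_ptr, so every early
// return releases the allocation automatically.
std::unique_ptr<Mask> buildMaskSafe(bool EdgeIsTrivial) {
  auto BlockMask = std::make_unique<Mask>();
  if (EdgeIsTrivial)
    return nullptr;
  return BlockMask;
}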

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-26-161721-17566-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
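As a rough sketch of what the widening described above means in practice (plain C++ for illustration, not LLVM IR and not code from this file), a loop processed at a vectorization factor of 4 advances its index by the vector width on each iteration, with a scalar epilogue covering any remainder:

// Illustrative only: a scalar loop and a hand-widened version at VF = 4.
#include <cstddef>

void scalarLoop(float *A, const float *B, std::size_t N) {
  for (std::size_t i = 0; i < N; ++i)      // index advances by 1
    A[i] = B[i] * 2.0f;
}

void widenedLoop(float *A, const float *B, std::size_t N) {
  std::size_t i = 0;
  for (; i + 4 <= N; i += 4) {             // index advances by the vector width
    A[i + 0] = B[i + 0] * 2.0f;
    A[i + 1] = B[i + 1] * 2.0f;
    A[i + 2] = B[i + 2] * 2.0f;
    A[i + 3] = B[i + 3] * 2.0f;
  }
  for (; i < N; ++i)                       // scalar epilogue for the remainder
    A[i] = B[i] * 2.0f;
}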
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SetVector.h"
73#include "llvm/ADT/SmallPtrSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpressions.h"
95#include "llvm/Analysis/TargetLibraryInfo.h"
96#include "llvm/Analysis/TargetTransformInfo.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/LLVMContext.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/Type.h"
121#include "llvm/IR/Use.h"
122#include "llvm/IR/User.h"
123#include "llvm/IR/Value.h"
124#include "llvm/IR/ValueHandle.h"
125#include "llvm/IR/Verifier.h"
126#include "llvm/InitializePasses.h"
127#include "llvm/Pass.h"
128#include "llvm/Support/Casting.h"
129#include "llvm/Support/CommandLine.h"
130#include "llvm/Support/Compiler.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>
154
155using namespace llvm;
156
157#define LV_NAME "loop-vectorize"
158#define DEBUG_TYPE LV_NAME
159
160/// @{
161/// Metadata attribute names
162static const char *const LLVMLoopVectorizeFollowupAll =
163 "llvm.loop.vectorize.followup_all";
164static const char *const LLVMLoopVectorizeFollowupVectorized =
165 "llvm.loop.vectorize.followup_vectorized";
166static const char *const LLVMLoopVectorizeFollowupEpilogue =
167 "llvm.loop.vectorize.followup_epilogue";
168/// @}
169
170STATISTIC(LoopsVectorized, "Number of loops vectorized");
171STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172
173/// Loops with a known constant trip count below this number are vectorized only
174/// if no scalar iteration overheads are incurred.
175static cl::opt<unsigned> TinyTripCountVectorThreshold(
176 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177 cl::desc("Loops with a constant trip count that is smaller than this "
178 "value are vectorized only if no scalar iteration overheads "
179 "are incurred."));
180
181// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
182// that predication is preferred, and this lists all options. I.e., the
183// vectorizer will try to fold the tail-loop (epilogue) into the vector body
184// and predicate the instructions accordingly. If tail-folding fails, there are
185// different fallback strategies depending on these values:
186namespace PreferPredicateTy {
187 enum Option {
188 ScalarEpilogue = 0,
189 PredicateElseScalarEpilogue,
190 PredicateOrDontVectorize
191 };
192} // namespace PreferPredicateTy
193
194static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195 "prefer-predicate-over-epilogue",
196 cl::init(PreferPredicateTy::ScalarEpilogue),
197 cl::Hidden,
198 cl::desc("Tail-folding and predication preferences over creating a scalar "
199 "epilogue loop."),
200 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201 "scalar-epilogue",
202 "Don't tail-predicate loops, create scalar epilogue"),
203 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204 "predicate-else-scalar-epilogue",
205 "prefer tail-folding, create scalar epilogue if tail "
206 "folding fails."),
207 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208 "predicate-dont-vectorize",
209 "prefers tail-folding, don't attempt vectorization if "
210 "tail-folding fails.")));
211
212static cl::opt<bool> MaximizeBandwidth(
213 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214 cl::desc("Maximize bandwidth when selecting vectorization factor which "
215 "will be determined by the smallest type in loop."));
216
217static cl::opt<bool> EnableInterleavedMemAccesses(
218 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220
221/// An interleave-group may need masking if it resides in a block that needs
222/// predication, or in order to mask away gaps.
223static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226
227static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
229 cl::desc("We don't interleave loops with a estimated constant trip count "
230 "below this number"));
231
232static cl::opt<unsigned> ForceTargetNumScalarRegs(
233 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234 cl::desc("A flag that overrides the target's number of scalar registers."));
235
236static cl::opt<unsigned> ForceTargetNumVectorRegs(
237 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238 cl::desc("A flag that overrides the target's number of vector registers."));
239
240static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242 cl::desc("A flag that overrides the target's max interleave factor for "
243 "scalar loops."));
244
245static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247 cl::desc("A flag that overrides the target's max interleave factor for "
248 "vectorized loops."));
249
250static cl::opt<unsigned> ForceTargetInstructionCost(
251 "force-target-instruction-cost", cl::init(0), cl::Hidden,
252 cl::desc("A flag that overrides the target's expected cost for "
253 "an instruction to a single constant value. Mostly "
254 "useful for getting consistent testing."));
255
256static cl::opt<unsigned> SmallLoopCost(
257 "small-loop-cost", cl::init(20), cl::Hidden,
258 cl::desc(
259 "The cost of a loop that is considered 'small' by the interleaver."));
260
261static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263 cl::desc("Enable the use of the block frequency analysis to access PGO "
264 "heuristics minimizing code growth in cold regions and being more "
265 "aggressive in hot regions."));
266
267// Runtime interleave loops for load/store throughput.
268static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270 cl::desc(
271 "Enable runtime interleaving until load/store ports are saturated"));
272
273/// Interleave small loops with scalar reductions.
274static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276 cl::desc("Enable interleaving for loops with small iteration counts that "
277 "contain scalar reductions to expose ILP."));
278
279/// The number of stores in a loop that are allowed to need predication.
280static cl::opt<unsigned> NumberOfStoresToPredicate(
281 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282 cl::desc("Max number of stores to be predicated behind an if."));
283
284static cl::opt<bool> EnableIndVarRegisterHeur(
285 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286 cl::desc("Count the induction variable only once when interleaving"));
287
288static cl::opt<bool> EnableCondStoresVectorization(
289 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
290 cl::desc("Enable if predication of stores during vectorization."));
291
292static cl::opt<unsigned> MaxNestedScalarReductionIC(
293 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294 cl::desc("The maximum interleave count to use when interleaving a scalar "
295 "reduction in a nested loop."));
296
297static cl::opt<bool>
298 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299 cl::Hidden,
300 cl::desc("Prefer in-loop vector reductions, "
301 "overriding the targets preference."));
302
303static cl::opt<bool> PreferPredicatedReductionSelect(
304 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305 cl::desc(
306 "Prefer predicating a reduction operation over an after loop select."));
307
308cl::opt<bool> EnableVPlanNativePath(
309 "enable-vplan-native-path", cl::init(false), cl::Hidden,
310 cl::desc("Enable VPlan-native vectorization path with "
311 "support for outer loop vectorization."));
312
313// FIXME: Remove this switch once we have divergence analysis. Currently we
314// assume divergent non-backedge branches when this switch is true.
315cl::opt<bool> EnableVPlanPredication(
316 "enable-vplan-predication", cl::init(false), cl::Hidden,
317 cl::desc("Enable VPlan-native vectorization path predicator with "
318 "support for outer loop vectorization."));
319
320// This flag enables the stress testing of the VPlan H-CFG construction in the
321// VPlan-native vectorization path. It must be used in conjunction with
322// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323// verification of the H-CFGs built.
324static cl::opt<bool> VPlanBuildStressTest(
325 "vplan-build-stress-test", cl::init(false), cl::Hidden,
326 cl::desc(
327 "Build VPlan for every supported loop nest in the function and bail "
328 "out right after the build (stress test the VPlan H-CFG construction "
329 "in the VPlan-native vectorization path)."));
330
331cl::opt<bool> llvm::EnableLoopInterleaving(
332 "interleave-loops", cl::init(true), cl::Hidden,
333 cl::desc("Enable loop interleaving in Loop vectorization passes"));
334cl::opt<bool> llvm::EnableLoopVectorization(
335 "vectorize-loops", cl::init(true), cl::Hidden,
336 cl::desc("Run the Loop vectorization passes"));
337
338/// A helper function that returns the type of loaded or stored value.
339static Type *getMemInstValueType(Value *I) {
340 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341 "Expected Load or Store instruction");
342 if (auto *LI = dyn_cast<LoadInst>(I))
343 return LI->getType();
344 return cast<StoreInst>(I)->getValueOperand()->getType();
345}
346
347/// A helper function that returns true if the given type is irregular. The
348/// type is irregular if its allocated size doesn't equal the store size of an
349/// element of the corresponding vector type at the given vectorization factor.
350static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351 assert(!VF.isScalable() && "scalable vectors not yet supported.");
352 // Determine if an array of VF elements of type Ty is "bitcast compatible"
353 // with a <VF x Ty> vector.
354 if (VF.isVector()) {
355 auto *VectorTy = VectorType::get(Ty, VF);
356 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
357 }
358
359 // If the vectorization factor is one, we just check if an array of type Ty
360 // requires padding between elements.
361 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
362}
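For a concrete feel of this check, consider i1 at VF = 4 on a typical x86-64 data layout: an array of four i1 values occupies 4 bytes, while a <4 x i1> vector stores its bits packed into a single byte, so the two layouts are not bitcast-compatible and the type is "irregular". A hypothetical standalone program (not part of this file; the data layout string is illustrative) could verify this with the same DataLayout queries used above:

// Hypothetical standalone check mirroring the hasIrregularType() logic.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // An x86-64-style data layout string; the exact string is illustrative.
  DataLayout DL("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
  Type *I1 = Type::getInt1Ty(Ctx);
  auto *Vec = VectorType::get(I1, ElementCount::getFixed(4));
  uint64_t ArrayBytes = 4 * DL.getTypeAllocSize(I1).getFixedSize();  // 4 bytes
  uint64_t VectorBytes = DL.getTypeStoreSize(Vec).getFixedSize();    // 1 byte
  outs() << "irregular at VF=4: "
         << (ArrayBytes != VectorBytes ? "yes" : "no") << "\n";      // "yes"
  return 0;
}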
363
364/// A helper function that returns the reciprocal of the block probability of
365/// predicated blocks. If we return X, we are assuming the predicated block
366/// will execute once for every X iterations of the loop header.
367///
368/// TODO: We should use actual block probability here, if available. Currently,
369/// we always assume predicated blocks have a 50% chance of executing.
370static unsigned getReciprocalPredBlockProb() { return 2; }
371
372/// A helper function that adds a 'fast' flag to floating-point operations.
373static Value *addFastMathFlag(Value *V) {
374 if (isa<FPMathOperator>(V))
375 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
376 return V;
377}
378
379static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
380 if (isa<FPMathOperator>(V))
381 cast<Instruction>(V)->setFastMathFlags(FMF);
382 return V;
383}
384
385/// A helper function that returns an integer or floating-point constant with
386/// value C.
387static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
388 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
389 : ConstantFP::get(Ty, C);
390}
391
392/// Returns "best known" trip count for the specified loop \p L as defined by
393/// the following procedure:
394/// 1) Returns exact trip count if it is known.
395/// 2) Returns expected trip count according to profile data if any.
396/// 3) Returns upper bound estimate if it is known.
397/// 4) Returns None if all of the above failed.
398static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
399 // Check if exact trip count is known.
400 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
401 return ExpectedTC;
402
403 // Check if there is an expected trip count available from profile data.
404 if (LoopVectorizeWithBlockFrequency)
405 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
406 return EstimatedTC;
407
408 // Check if upper bound estimate is known.
409 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
410 return ExpectedTC;
411
412 return None;
413}
414
415namespace llvm {
416
417/// InnerLoopVectorizer vectorizes loops which contain only one basic
418/// block to a specified vectorization factor (VF).
419/// This class performs the widening of scalars into vectors, or multiple
420/// scalars. This class also implements the following features:
421/// * It inserts an epilogue loop for handling loops that don't have iteration
422/// counts that are known to be a multiple of the vectorization factor.
423/// * It handles the code generation for reduction variables.
424/// * Scalarization (implementation using scalars) of un-vectorizable
425/// instructions.
426/// InnerLoopVectorizer does not perform any vectorization-legality
427/// checks, and relies on the caller to check for the different legality
428/// aspects. The InnerLoopVectorizer relies on the
429/// LoopVectorizationLegality class to provide information about the induction
430/// and reduction variables that were found to a given vectorization factor.
431class InnerLoopVectorizer {
432public:
433 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
434 LoopInfo *LI, DominatorTree *DT,
435 const TargetLibraryInfo *TLI,
436 const TargetTransformInfo *TTI, AssumptionCache *AC,
437 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
438 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
439 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
440 ProfileSummaryInfo *PSI)
441 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
442 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
443 Builder(PSE.getSE()->getContext()),
444 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
445 BFI(BFI), PSI(PSI) {
446 // Query this against the original loop and save it here because the profile
447 // of the original loop header may change as the transformation happens.
448 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
449 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
450 }
451
452 virtual ~InnerLoopVectorizer() = default;
453
454 /// Create a new empty loop that will contain vectorized instructions later
455 /// on, while the old loop will be used as the scalar remainder. Control flow
456 /// is generated around the vectorized (and scalar epilogue) loops consisting
457 /// of various checks and bypasses. Return the pre-header block of the new
458 /// loop.
459 BasicBlock *createVectorizedLoopSkeleton();
460
461 /// Widen a single instruction within the innermost loop.
462 void widenInstruction(Instruction &I, VPUser &Operands,
463 VPTransformState &State);
464
465 /// Widen a single call instruction within the innermost loop.
466 void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
467 VPTransformState &State);
468
469 /// Widen a single select instruction within the innermost loop.
470 void widenSelectInstruction(SelectInst &I, VPUser &Operands,
471 bool InvariantCond, VPTransformState &State);
472
473 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
474 void fixVectorizedLoop();
475
476 // Return true if any runtime check is added.
477 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
478
479 /// A type for vectorized values in the new loop. Each value from the
480 /// original loop, when vectorized, is represented by UF vector values in the
481 /// new unrolled loop, where UF is the unroll factor.
482 using VectorParts = SmallVector<Value *, 2>;
483
484 /// Vectorize a single GetElementPtrInst based on information gathered and
485 /// decisions taken during planning.
486 void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
487 ElementCount VF, bool IsPtrLoopInvariant,
488 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
489
490 /// Vectorize a single PHINode in a block. This method handles the induction
491 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
492 /// arbitrary length vectors.
493 void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
494
495 /// A helper function to scalarize a single Instruction in the innermost loop.
496 /// Generates a sequence of scalar instances for each lane between \p MinLane
497 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
498 /// inclusive. Uses the VPValue operands from \p Operands instead of \p
499 /// Instr's operands.
500 void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
501 const VPIteration &Instance, bool IfPredicateInstr,
502 VPTransformState &State);
503
504 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
505 /// is provided, the integer induction variable will first be truncated to
506 /// the corresponding type.
507 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
508
509 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
510 /// vector or scalar value on-demand if one is not yet available. When
511 /// vectorizing a loop, we visit the definition of an instruction before its
512 /// uses. When visiting the definition, we either vectorize or scalarize the
513 /// instruction, creating an entry for it in the corresponding map. (In some
514 /// cases, such as induction variables, we will create both vector and scalar
515 /// entries.) Then, as we encounter uses of the definition, we derive values
516 /// for each scalar or vector use unless such a value is already available.
517 /// For example, if we scalarize a definition and one of its uses is vector,
518 /// we build the required vector on-demand with an insertelement sequence
519 /// when visiting the use. Otherwise, if the use is scalar, we can use the
520 /// existing scalar definition.
521 ///
522 /// Return a value in the new loop corresponding to \p V from the original
523 /// loop at unroll index \p Part. If the value has already been vectorized,
524 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
525 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
526 /// a new vector value on-demand by inserting the scalar values into a vector
527 /// with an insertelement sequence. If the value has been neither vectorized
528 /// nor scalarized, it must be loop invariant, so we simply broadcast the
529 /// value into a vector.
530 Value *getOrCreateVectorValue(Value *V, unsigned Part);
531
532 /// Return a value in the new loop corresponding to \p V from the original
533 /// loop at unroll and vector indices \p Instance. If the value has been
534 /// vectorized but not scalarized, the necessary extractelement instruction
535 /// will be generated.
536 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
537
538 /// Construct the vector value of a scalarized value \p V one lane at a time.
539 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
540
541 /// Try to vectorize interleaved access group \p Group with the base address
542 /// given in \p Addr, optionally masking the vector operations if \p
543 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
544 /// values in the vectorized loop.
545 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
546 VPTransformState &State, VPValue *Addr,
547 VPValue *BlockInMask = nullptr);
548
549 /// Vectorize Load and Store instructions with the base address given in \p
550 /// Addr, optionally masking the vector operations if \p BlockInMask is
551 /// non-null. Use \p State to translate given VPValues to IR values in the
552 /// vectorized loop.
553 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
554 VPValue *Addr, VPValue *StoredValue,
555 VPValue *BlockInMask);
556
557 /// Set the debug location in the builder using the debug location in
558 /// the instruction.
559 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
560
561 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
562 void fixNonInductionPHIs(void);
563
564protected:
565 friend class LoopVectorizationPlanner;
566
567 /// A small list of PHINodes.
568 using PhiVector = SmallVector<PHINode *, 4>;
569
570 /// A type for scalarized values in the new loop. Each value from the
571 /// original loop, when scalarized, is represented by UF x VF scalar values
572 /// in the new unrolled loop, where UF is the unroll factor and VF is the
573 /// vectorization factor.
574 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575
576 /// Set up the values of the IVs correctly when exiting the vector loop.
577 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578 Value *CountRoundDown, Value *EndValue,
579 BasicBlock *MiddleBlock);
580
581 /// Create a new induction variable inside L.
582 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
583 Value *Step, Instruction *DL);
584
585 /// Handle all cross-iteration phis in the header.
586 void fixCrossIterationPHIs();
587
588 /// Fix a first-order recurrence. This is the second phase of vectorizing
589 /// this phi node.
590 void fixFirstOrderRecurrence(PHINode *Phi);
591
592 /// Fix a reduction cross-iteration phi. This is the second phase of
593 /// vectorizing this phi node.
594 void fixReduction(PHINode *Phi);
595
596 /// Clear NSW/NUW flags from reduction instructions if necessary.
597 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
598
599 /// The Loop exit block may have single value PHI nodes with some
600 /// incoming value. While vectorizing we only handled real values
601 /// that were defined inside the loop and we should have one value for
602 /// each predecessor of its parent basic block. See PR14725.
603 void fixLCSSAPHIs();
604
605 /// Iteratively sink the scalarized operands of a predicated instruction into
606 /// the block that was created for it.
607 void sinkScalarOperands(Instruction *PredInst);
608
609 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
610 /// represented as.
611 void truncateToMinimalBitwidths();
612
613 /// Create a broadcast instruction. This method generates a broadcast
614 /// instruction (shuffle) for loop invariant values and for the induction
615 /// value. If this is the induction variable then we extend it to N, N+1, ...
616 /// this is needed because each iteration in the loop corresponds to a SIMD
617 /// element.
618 virtual Value *getBroadcastInstrs(Value *V);
619
620 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
621 /// to each vector element of Val. The sequence starts at StartIndex.
622 /// \p Opcode is relevant for FP induction variable.
623 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624 Instruction::BinaryOps Opcode =
625 Instruction::BinaryOpsEnd);
626
627 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628 /// variable on which to base the steps, \p Step is the size of the step, and
629 /// \p EntryVal is the value from the original loop that maps to the steps.
630 /// Note that \p EntryVal doesn't have to be an induction variable - it
631 /// can also be a truncate instruction.
632 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633 const InductionDescriptor &ID);
634
635 /// Create a vector induction phi node based on an existing scalar one. \p
636 /// EntryVal is the value from the original loop that maps to the vector phi
637 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
638 /// truncate instruction, instead of widening the original IV, we widen a
639 /// version of the IV truncated to \p EntryVal's type.
640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
641 Value *Step, Instruction *EntryVal);
642
643 /// Returns true if an instruction \p I should be scalarized instead of
644 /// vectorized for the chosen vectorization factor.
645 bool shouldScalarizeInstruction(Instruction *I) const;
646
647 /// Returns true if we should generate a scalar version of \p IV.
648 bool needsScalarInduction(Instruction *IV) const;
649
650 /// If there is a cast involved in the induction variable \p ID, which should
651 /// be ignored in the vectorized loop body, this function records the
652 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
653 /// cast. We had already proved that the casted Phi is equal to the uncasted
654 /// Phi in the vectorized loop (under a runtime guard), and therefore
655 /// there is no need to vectorize the cast - the same value can be used in the
656 /// vector loop for both the Phi and the cast.
657 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
658 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
659 ///
660 /// \p EntryVal is the value from the original loop that maps to the vector
661 /// phi node and is used to distinguish what is the IV currently being
662 /// processed - original one (if \p EntryVal is a phi corresponding to the
663 /// original IV) or the "newly-created" one based on the proof mentioned above
664 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
665 /// latter case \p EntryVal is a TruncInst and we must not record anything for
666 /// that IV, but it's error-prone to expect callers of this routine to care
667 /// about that, hence this explicit parameter.
668 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
669 const Instruction *EntryVal,
670 Value *VectorLoopValue,
671 unsigned Part,
672 unsigned Lane = UINT_MAX);
673
674 /// Generate a shuffle sequence that will reverse the vector Vec.
675 virtual Value *reverseVector(Value *Vec);
676
677 /// Returns (and creates if needed) the original loop trip count.
678 Value *getOrCreateTripCount(Loop *NewLoop);
679
680 /// Returns (and creates if needed) the trip count of the widened loop.
681 Value *getOrCreateVectorTripCount(Loop *NewLoop);
682
683 /// Returns a bitcasted value to the requested vector type.
684 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
685 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
686 const DataLayout &DL);
687
688 /// Emit a bypass check to see if the vector trip count is zero, including if
689 /// it overflows.
690 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
691
692 /// Emit a bypass check to see if all of the SCEV assumptions we've
693 /// had to make are correct.
694 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
695
696 /// Emit bypass checks to check any memory assumptions we may have made.
697 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
698
699 /// Compute the transformed value of Index at offset StartValue using step
700 /// StepValue.
701 /// For integer induction, returns StartValue + Index * StepValue.
702 /// For pointer induction, returns StartValue[Index * StepValue].
703 /// FIXME: The newly created binary instructions should contain nsw/nuw
704 /// flags, which can be found from the original scalar operations.
705 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
706 const DataLayout &DL,
707 const InductionDescriptor &ID) const;
708
709 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
710 /// vector loop preheader, middle block and scalar preheader. Also
711 /// allocate a loop object for the new vector loop and return it.
712 Loop *createVectorLoopSkeleton(StringRef Prefix);
713
714 /// Create new phi nodes for the induction variables to resume iteration count
715 /// in the scalar epilogue, from where the vectorized loop left off (given by
716 /// \p VectorTripCount).
717 void createInductionResumeValues(Loop *L, Value *VectorTripCount);
718
719 /// Complete the loop skeleton by adding debug MDs, creating appropriate
720 /// conditional branches in the middle block, preparing the builder and
721 /// running the verifier. Take in the vector loop \p L as argument, and return
722 /// the preheader of the completed vector loop.
723 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
724
725 /// Add additional metadata to \p To that was not present on \p Orig.
726 ///
727 /// Currently this is used to add the noalias annotations based on the
728 /// inserted memchecks. Use this for instructions that are *cloned* into the
729 /// vector loop.
730 void addNewMetadata(Instruction *To, const Instruction *Orig);
731
732 /// Add metadata from one instruction to another.
733 ///
734 /// This includes both the original MDs from \p From and additional ones (\see
735 /// addNewMetadata). Use this for *newly created* instructions in the vector
736 /// loop.
737 void addMetadata(Instruction *To, Instruction *From);
738
739 /// Similar to the previous function but it adds the metadata to a
740 /// vector of instructions.
741 void addMetadata(ArrayRef<Value *> To, Instruction *From);
742
743 /// The original loop.
744 Loop *OrigLoop;
745
746 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
747 /// dynamic knowledge to simplify SCEV expressions and converts them to a
748 /// more usable form.
749 PredicatedScalarEvolution &PSE;
750
751 /// Loop Info.
752 LoopInfo *LI;
753
754 /// Dominator Tree.
755 DominatorTree *DT;
756
757 /// Alias Analysis.
758 AAResults *AA;
759
760 /// Target Library Info.
761 const TargetLibraryInfo *TLI;
762
763 /// Target Transform Info.
764 const TargetTransformInfo *TTI;
765
766 /// Assumption Cache.
767 AssumptionCache *AC;
768
769 /// Interface to emit optimization remarks.
770 OptimizationRemarkEmitter *ORE;
771
772 /// LoopVersioning. It's only set up (non-null) if memchecks were
773 /// used.
774 ///
775 /// This is currently only used to add no-alias metadata based on the
776 /// memchecks. The actual versioning is performed manually.
777 std::unique_ptr<LoopVersioning> LVer;
778
779 /// The vectorization SIMD factor to use. Each vector will have this many
780 /// vector elements.
781 ElementCount VF;
782
783 /// The vectorization unroll factor to use. Each scalar is vectorized to this
784 /// many different vector instructions.
785 unsigned UF;
786
787 /// The builder that we use
788 IRBuilder<> Builder;
789
790 // --- Vectorization state ---
791
792 /// The vector-loop preheader.
793 BasicBlock *LoopVectorPreHeader;
794
795 /// The scalar-loop preheader.
796 BasicBlock *LoopScalarPreHeader;
797
798 /// Middle Block between the vector and the scalar.
799 BasicBlock *LoopMiddleBlock;
800
801 /// The ExitBlock of the scalar loop.
802 BasicBlock *LoopExitBlock;
803
804 /// The vector loop body.
805 BasicBlock *LoopVectorBody;
806
807 /// The scalar loop body.
808 BasicBlock *LoopScalarBody;
809
810 /// A list of all bypass blocks. The first block is the entry of the loop.
811 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
812
813 /// The new Induction variable which was added to the new block.
814 PHINode *Induction = nullptr;
815
816 /// The induction variable of the old basic block.
817 PHINode *OldInduction = nullptr;
818
819 /// Maps values from the original loop to their corresponding values in the
820 /// vectorized loop. A key value can map to either vector values, scalar
821 /// values or both kinds of values, depending on whether the key was
822 /// vectorized and scalarized.
823 VectorizerValueMap VectorLoopValueMap;
824
825 /// Store instructions that were predicated.
826 SmallVector<Instruction *, 4> PredicatedInstructions;
827
828 /// Trip count of the original loop.
829 Value *TripCount = nullptr;
830
831 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
832 Value *VectorTripCount = nullptr;
833
834 /// The legality analysis.
835 LoopVectorizationLegality *Legal;
836
837 /// The profitability analysis.
838 LoopVectorizationCostModel *Cost;
839
840 // Record whether runtime checks are added.
841 bool AddedSafetyChecks = false;
842
843 // Holds the end values for each induction variable. We save the end values
844 // so we can later fix-up the external users of the induction variables.
845 DenseMap<PHINode *, Value *> IVEndValues;
846
847 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
848 // fixed up at the end of vector code generation.
849 SmallVector<PHINode *, 8> OrigPHIsToFix;
850
851 /// BFI and PSI are used to check for profile guided size optimizations.
852 BlockFrequencyInfo *BFI;
853 ProfileSummaryInfo *PSI;
854
855 // Whether this loop should be optimized for size based on profile guided size
856 // optimizations.
857 bool OptForSizeBasedOnProfile;
858};
859
860class InnerLoopUnroller : public InnerLoopVectorizer {
861public:
862 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
863 LoopInfo *LI, DominatorTree *DT,
864 const TargetLibraryInfo *TLI,
865 const TargetTransformInfo *TTI, AssumptionCache *AC,
866 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
867 LoopVectorizationLegality *LVL,
868 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
869 ProfileSummaryInfo *PSI)
870 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
871 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
872 BFI, PSI) {}
873
874private:
875 Value *getBroadcastInstrs(Value *V) override;
876 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
877 Instruction::BinaryOps Opcode =
878 Instruction::BinaryOpsEnd) override;
879 Value *reverseVector(Value *Vec) override;
880};
881
882} // end namespace llvm
883
884/// Look for a meaningful debug location on the instruction or its
885/// operands.
886static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
887 if (!I)
888 return I;
889
890 DebugLoc Empty;
891 if (I->getDebugLoc() != Empty)
892 return I;
893
894 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
895 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
896 if (OpInst->getDebugLoc() != Empty)
897 return OpInst;
898 }
899
900 return I;
901}
902
903void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
904 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
905 const DILocation *DIL = Inst->getDebugLoc();
906 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
907 !isa<DbgInfoIntrinsic>(Inst)) {
908 assert(!VF.isScalable() && "scalable vectors not yet supported.");
909 auto NewDIL =
910 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
911 if (NewDIL)
912 B.SetCurrentDebugLocation(NewDIL.getValue());
913 else
914 LLVM_DEBUG(dbgs()
915 << "Failed to create new discriminator: "
916 << DIL->getFilename() << " Line: " << DIL->getLine());
917 }
918 else
919 B.SetCurrentDebugLocation(DIL);
920 } else
921 B.SetCurrentDebugLocation(DebugLoc());
922}
923
924/// Write a record \p DebugMsg about vectorization failure to the debug
925/// output stream. If \p I is passed, it is an instruction that prevents
926/// vectorization.
927#ifndef NDEBUG
928static void debugVectorizationFailure(const StringRef DebugMsg,
929 Instruction *I) {
930 dbgs() << "LV: Not vectorizing: " << DebugMsg;
931 if (I != nullptr)
932 dbgs() << " " << *I;
933 else
934 dbgs() << '.';
935 dbgs() << '\n';
936}
937#endif
938
939/// Create an analysis remark that explains why vectorization failed
940///
941/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
942/// RemarkName is the identifier for the remark. If \p I is passed it is an
943/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
944/// the location of the remark. \return the remark object that can be
945/// streamed to.
946static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
947 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
948 Value *CodeRegion = TheLoop->getHeader();
949 DebugLoc DL = TheLoop->getStartLoc();
950
951 if (I) {
952 CodeRegion = I->getParent();
953 // If there is no debug location attached to the instruction, revert back to
954 // using the loop's.
955 if (I->getDebugLoc())
956 DL = I->getDebugLoc();
957 }
958
959 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
960 R << "loop not vectorized: ";
961 return R;
962}
963
964namespace llvm {
965
966void reportVectorizationFailure(const StringRef DebugMsg,
967 const StringRef OREMsg, const StringRef ORETag,
968 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
969 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
970 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
971 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
972 ORETag, TheLoop, I) << OREMsg);
973}
974
975} // end namespace llvm
976
977#ifndef NDEBUG
978/// \return string containing a file name and a line # for the given loop.
979static std::string getDebugLocString(const Loop *L) {
980 std::string Result;
981 if (L) {
982 raw_string_ostream OS(Result);
983 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
984 LoopDbgLoc.print(OS);
985 else
986 // Just print the module name.
987 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
988 OS.flush();
989 }
990 return Result;
991}
992#endif
993
994void InnerLoopVectorizer::addNewMetadata(Instruction *To,
995 const Instruction *Orig) {
996 // If the loop was versioned with memchecks, add the corresponding no-alias
997 // metadata.
998 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
999 LVer->annotateInstWithNoAlias(To, Orig);
1000}
1001
1002void InnerLoopVectorizer::addMetadata(Instruction *To,
1003 Instruction *From) {
1004 propagateMetadata(To, From);
1005 addNewMetadata(To, From);
1006}
1007
1008void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1009 Instruction *From) {
1010 for (Value *V : To) {
1011 if (Instruction *I = dyn_cast<Instruction>(V))
1012 addMetadata(I, From);
1013 }
1014}
1015
1016namespace llvm {
1017
1018// Loop vectorization cost-model hints how the scalar epilogue loop should be
1019// lowered.
1020enum ScalarEpilogueLowering {
1021
1022 // The default: allowing scalar epilogues.
1023 CM_ScalarEpilogueAllowed,
1024
1025 // Vectorization with OptForSize: don't allow epilogues.
1026 CM_ScalarEpilogueNotAllowedOptSize,
1027
1028 // A special case of vectorisation with OptForSize: loops with a very small
1029 // trip count are considered for vectorization under OptForSize, thereby
1030 // making sure the cost of their loop body is dominant, free of runtime
1031 // guards and scalar iteration overheads.
1032 CM_ScalarEpilogueNotAllowedLowTripLoop,
1033
1034 // Loop hint predicate indicating an epilogue is undesired.
1035 CM_ScalarEpilogueNotNeededUsePredicate
1036};
1037
1038/// LoopVectorizationCostModel - estimates the expected speedups due to
1039/// vectorization.
1040/// In many cases vectorization is not profitable. This can happen because of
1041/// a number of reasons. In this class we mainly attempt to predict the
1042/// expected speedup/slowdowns due to the supported instruction set. We use the
1043/// TargetTransformInfo to query the different backends for the cost of
1044/// different operations.
1045class LoopVectorizationCostModel {
1046public:
1047 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1048 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1049 LoopVectorizationLegality *Legal,
1050 const TargetTransformInfo &TTI,
1051 const TargetLibraryInfo *TLI, DemandedBits *DB,
1052 AssumptionCache *AC,
1053 OptimizationRemarkEmitter *ORE, const Function *F,
1054 const LoopVectorizeHints *Hints,
1055 InterleavedAccessInfo &IAI)
1056 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1057 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1058 Hints(Hints), InterleaveInfo(IAI) {}
1059
1060 /// \return An upper bound for the vectorization factor, or None if
1061 /// vectorization and interleaving should be avoided up front.
1062 Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1063
1064 /// \return True if runtime checks are required for vectorization, and false
1065 /// otherwise.
1066 bool runtimeChecksRequired();
1067
1068 /// \return The most profitable vectorization factor and the cost of that VF.
1069 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1070 /// then this vectorization factor will be selected if vectorization is
1071 /// possible.
1072 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1073
1074 /// Setup cost-based decisions for user vectorization factor.
1075 void selectUserVectorizationFactor(ElementCount UserVF) {
1076 collectUniformsAndScalars(UserVF);
1077 collectInstsToScalarize(UserVF);
1078 }
1079
1080 /// \return The size (in bits) of the smallest and widest types in the code
1081 /// that needs to be vectorized. We ignore values that remain scalar such as
1082 /// 64 bit loop indices.
1083 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1084
1085 /// \return The desired interleave count.
1086 /// If interleave count has been specified by metadata it will be returned.
1087 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1088 /// are the selected vectorization factor and the cost of the selected VF.
1089 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1090
1091 /// Memory access instruction may be vectorized in more than one way.
1092 /// Form of instruction after vectorization depends on cost.
1093 /// This function takes cost-based decisions for Load/Store instructions
1094 /// and collects them in a map. This decisions map is used for building
1095 /// the lists of loop-uniform and loop-scalar instructions.
1096 /// The calculated cost is saved with widening decision in order to
1097 /// avoid redundant calculations.
1098 void setCostBasedWideningDecision(ElementCount VF);
1099
1100 /// A struct that represents some properties of the register usage
1101 /// of a loop.
1102 struct RegisterUsage {
1103 /// Holds the number of loop invariant values that are used in the loop.
1104 /// The key is ClassID of target-provided register class.
1105 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1106 /// Holds the maximum number of concurrent live intervals in the loop.
1107 /// The key is ClassID of target-provided register class.
1108 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1109 };
1110
1111 /// \return Returns information about the register usages of the loop for the
1112 /// given vectorization factors.
1113 SmallVector<RegisterUsage, 8>
1114 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1115
1116 /// Collect values we want to ignore in the cost model.
1117 void collectValuesToIgnore();
1118
1119 /// Split reductions into those that happen in the loop, and those that happen
1120 /// outside. In loop reductions are collected into InLoopReductionChains.
1121 void collectInLoopReductions();
1122
1123 /// \returns The smallest bitwidth each instruction can be represented with.
1124 /// The vector equivalents of these instructions should be truncated to this
1125 /// type.
1126 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1127 return MinBWs;
1128 }
1129
1130 /// \returns True if it is more profitable to scalarize instruction \p I for
1131 /// vectorization factor \p VF.
1132 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1133 assert(VF.isVector() &&
1134 "Profitable to scalarize relevant only for VF > 1.");
1135
1136 // Cost model is not run in the VPlan-native path - return conservative
1137 // result until this changes.
1138 if (EnableVPlanNativePath)
1139 return false;
1140
1141 auto Scalars = InstsToScalarize.find(VF);
1142 assert(Scalars != InstsToScalarize.end() &&
1143 "VF not yet analyzed for scalarization profitability");
1144 return Scalars->second.find(I) != Scalars->second.end();
1145 }
1146
1147 /// Returns true if \p I is known to be uniform after vectorization.
1148 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1149 if (VF.isScalar())
1150 return true;
1151
1152 // Cost model is not run in the VPlan-native path - return conservative
1153 // result until this changes.
1154 if (EnableVPlanNativePath)
1155 return false;
1156
1157 auto UniformsPerVF = Uniforms.find(VF);
1158 assert(UniformsPerVF != Uniforms.end() &&
1159 "VF not yet analyzed for uniformity");
1160 return UniformsPerVF->second.count(I);
1161 }
1162
1163 /// Returns true if \p I is known to be scalar after vectorization.
1164 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1165 if (VF.isScalar())
1166 return true;
1167
1168 // Cost model is not run in the VPlan-native path - return conservative
1169 // result until this changes.
1170 if (EnableVPlanNativePath)
1171 return false;
1172
1173 auto ScalarsPerVF = Scalars.find(VF);
1174 assert(ScalarsPerVF != Scalars.end() &&
1175 "Scalar values are not calculated for VF");
1176 return ScalarsPerVF->second.count(I);
1177 }
1178
1179 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1180 /// for vectorization factor \p VF.
1181 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1182 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1183 !isProfitableToScalarize(I, VF) &&
1184 !isScalarAfterVectorization(I, VF);
1185 }
1186
1187 /// Decision that was taken during cost calculation for memory instruction.
1188 enum InstWidening {
1189 CM_Unknown,
1190 CM_Widen, // For consecutive accesses with stride +1.
1191 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1192 CM_Interleave,
1193 CM_GatherScatter,
1194 CM_Scalarize
1195 };
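 // Illustrative mapping (hypothetical sketch, cost-driven per VF): a
 // unit-stride access such as 'a[i]' typically becomes CM_Widen, 'a[n - i]'
 // becomes CM_Widen_Reverse, members of an interleave group become
 // CM_Interleave, an indexed access like 'a[b[i]]' may become CM_GatherScatter
 // when the target supports it, and otherwise CM_Scalarize is chosen.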
1196
1197 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1198 /// instruction \p I and vector width \p VF.
1199 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1200 unsigned Cost) {
1201 assert(VF.isVector() && "Expected VF >=2");
1202 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1203 }
1204
1205 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1206 /// interleaving group \p Grp and vector width \p VF.
1207 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1208 ElementCount VF, InstWidening W, unsigned Cost) {
1209 assert(VF.isVector() && "Expected VF >=2");
1210 /// Broadcast this decision to all instructions inside the group.
1211 /// But the cost will be assigned to one instruction only.
1212 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1213 if (auto *I = Grp->getMember(i)) {
1214 if (Grp->getInsertPos() == I)
1215 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1216 else
1217 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1218 }
1219 }
1220 }
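 // Sketch of the resulting map for a hypothetical two-member interleave group
 // {A, B} with insert position A, decision CM_Interleave and cost 8:
 //   WideningDecisions[{A, VF}] == {CM_Interleave, 8}
 //   WideningDecisions[{B, VF}] == {CM_Interleave, 0}
 // so summing the members' costs counts the group exactly once.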
1221
1222 /// Return the cost model decision for the given instruction \p I and vector
1223 /// width \p VF. Return CM_Unknown if this instruction did not pass
1224 /// through the cost modeling.
1225 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1226 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1227 assert(VF.isVector() && "Expected VF >=2");
1228
1229 // Cost model is not run in the VPlan-native path - return conservative
1230 // result until this changes.
1231 if (EnableVPlanNativePath)
1232 return CM_GatherScatter;
1233
1234 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1235 auto Itr = WideningDecisions.find(InstOnVF);
1236 if (Itr == WideningDecisions.end())
1237 return CM_Unknown;
1238 return Itr->second.first;
1239 }
1240
1241 /// Return the vectorization cost for the given instruction \p I and vector
1242 /// width \p VF.
1243 unsigned getWideningCost(Instruction *I, ElementCount VF) {
1244 assert(VF.isVector() && "Expected VF >=2");
1245 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1246 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1247 "The cost is not calculated");
1248 return WideningDecisions[InstOnVF].second;
1249 }
1250
1251 /// Return True if instruction \p I is an optimizable truncate whose operand
1252 /// is an induction variable. Such a truncate will be removed by adding a new
1253 /// induction variable with the destination type.
1254 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1255 // If the instruction is not a truncate, return false.
1256 auto *Trunc = dyn_cast<TruncInst>(I);
1257 if (!Trunc)
1258 return false;
1259
1260 // Get the source and destination types of the truncate.
1261 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1262 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1263
1264 // If the truncate is free for the given types, return false. Replacing a
1265 // free truncate with an induction variable would add an induction variable
1266 // update instruction to each iteration of the loop. We exclude from this
1267 // check the primary induction variable since it will need an update
1268 // instruction regardless.
1269 Value *Op = Trunc->getOperand(0);
1270 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1271 return false;
1272
1273 // If the truncated value is not an induction variable, return false.
1274 return Legal->isInductionPhi(Op);
1275 }
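 // Hypothetical IR for the pattern recognized above (sketch):
 //   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
 //   %trunc = trunc i64 %iv to i32        ; used as an i32 induction
 // If the i64 -> i32 trunc is not free, the vectorizer can instead create a
 // new induction variable of the destination type and drop the truncate.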
1276
1277 /// Collects the instructions to scalarize for each predicated instruction in
1278 /// the loop.
1279 void collectInstsToScalarize(ElementCount VF);
1280
1281 /// Collect Uniform and Scalar values for the given \p VF.
1282 /// The sets depend on CM decision for Load/Store instructions
1283 /// that may be vectorized as interleave, gather-scatter or scalarized.
1284 void collectUniformsAndScalars(ElementCount VF) {
1285 // Do the analysis once.
1286 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1287 return;
1288 setCostBasedWideningDecision(VF);
1289 collectLoopUniforms(VF);
1290 collectLoopScalars(VF);
1291 }
1292
1293 /// Returns true if the target machine supports masked store operation
1294 /// for the given \p DataType and kind of access to \p Ptr.
1295 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1296 return Legal->isConsecutivePtr(Ptr) &&
1297 TTI.isLegalMaskedStore(DataType, Alignment);
1298 }
1299
1300 /// Returns true if the target machine supports masked load operation
1301 /// for the given \p DataType and kind of access to \p Ptr.
1302 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1303 return Legal->isConsecutivePtr(Ptr) &&
1304 TTI.isLegalMaskedLoad(DataType, Alignment);
1305 }
1306
1307 /// Returns true if the target machine supports masked scatter operation
1308 /// for the given \p DataType.
1309 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1310 return TTI.isLegalMaskedScatter(DataType, Alignment);
1311 }
1312
1313 /// Returns true if the target machine supports masked gather operation
1314 /// for the given \p DataType.
1315 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1316 return TTI.isLegalMaskedGather(DataType, Alignment);
1317 }
1318
1319 /// Returns true if the target machine can represent \p V as a masked gather
1320 /// or scatter operation.
1321 bool isLegalGatherOrScatter(Value *V) {
1322 bool LI = isa<LoadInst>(V);
1323 bool SI = isa<StoreInst>(V);
1324 if (!LI && !SI)
1325 return false;
1326 auto *Ty = getMemInstValueType(V);
1327 Align Align = getLoadStoreAlignment(V);
1328 return (LI && isLegalMaskedGather(Ty, Align)) ||
1329 (SI && isLegalMaskedScatter(Ty, Align));
1330 }
1331
1332 /// Returns true if \p I is an instruction that will be scalarized with
1333 /// predication. Such instructions include conditional stores and
1334 /// instructions that may divide by zero.
1335 /// If a non-zero VF has been calculated, we check if I will be scalarized
1336 /// with predication for that VF.
1337 bool isScalarWithPredication(Instruction *I,
1338 ElementCount VF = ElementCount::getFixed(1));
1339
1340 // Returns true if \p I is an instruction that will be predicated either
1341 // through scalar predication or masked load/store or masked gather/scatter.
1342 // Superset of instructions that return true for isScalarWithPredication.
1343 bool isPredicatedInst(Instruction *I) {
1344 if (!blockNeedsPredication(I->getParent()))
1345 return false;
1346 // Loads and stores that need some form of masked operation are predicated
1347 // instructions.
1348 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1349 return Legal->isMaskRequired(I);
1350 return isScalarWithPredication(I);
1351 }
1352
1353 /// Returns true if \p I is a memory instruction with consecutive memory
1354 /// access that can be widened.
1355 bool
1356 memoryInstructionCanBeWidened(Instruction *I,
1357 ElementCount VF = ElementCount::getFixed(1));
1358
1359 /// Returns true if \p I is a memory instruction in an interleaved-group
1360 /// of memory accesses that can be vectorized with wide vector loads/stores
1361 /// and shuffles.
1362 bool
1363 interleavedAccessCanBeWidened(Instruction *I,
1364 ElementCount VF = ElementCount::getFixed(1));
1365
1366 /// Check if \p Instr belongs to any interleaved access group.
1367 bool isAccessInterleaved(Instruction *Instr) {
1368 return InterleaveInfo.isInterleaved(Instr);
1369 }
1370
1371 /// Get the interleaved access group that \p Instr belongs to.
1372 const InterleaveGroup<Instruction> *
1373 getInterleavedAccessGroup(Instruction *Instr) {
1374 return InterleaveInfo.getInterleaveGroup(Instr);
1375 }
1376
1377 /// Returns true if an interleaved group requires a scalar iteration
1378 /// to handle accesses with gaps, and there is nothing preventing us from
1379 /// creating a scalar epilogue.
1380 bool requiresScalarEpilogue() const {
1381 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1382 }
1383
1384 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1385 /// disallowed due to optsize or a loop hint annotation.
1386 bool isScalarEpilogueAllowed() const {
1387 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1388 }
1389
1390 /// Returns true if all loop blocks should be masked to fold tail loop.
1391 bool foldTailByMasking() const { return FoldTailByMasking; }
1392
1393 bool blockNeedsPredication(BasicBlock *BB) {
1394 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1395 }
1396
1397 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1398 /// nodes to the chain of instructions representing the reductions. Uses a
1399 /// MapVector to ensure deterministic iteration order.
1400 using ReductionChainMap =
1401 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1402
1403 /// Return the chain of instructions representing an inloop reduction.
1404 const ReductionChainMap &getInLoopReductionChains() const {
1405 return InLoopReductionChains;
1406 }
1407
1408 /// Returns true if the Phi is part of an inloop reduction.
1409 bool isInLoopReduction(PHINode *Phi) const {
1410 return InLoopReductionChains.count(Phi);
1411 }
1412
1413 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1414 /// with factor VF. Return the cost of the instruction, including
1415 /// scalarization overhead if it's needed.
1416 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1417
1418 /// Estimate cost of a call instruction CI if it were vectorized with factor
1419 /// VF. Return the cost of the instruction, including scalarization overhead
1420 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1421 /// scalarized -
1422 /// i.e. either a vector version isn't available, or it is too expensive.
1423 unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1424 bool &NeedToScalarize);
1425
1426 /// Invalidates decisions already taken by the cost model.
1427 void invalidateCostModelingDecisions() {
1428 WideningDecisions.clear();
1429 Uniforms.clear();
1430 Scalars.clear();
1431 }
1432
1433private:
1434 unsigned NumPredStores = 0;
1435
1436 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1437 /// than zero. One is returned if vectorization should best be avoided due
1438 /// to cost.
1439 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1440
1441 /// The vectorization cost is a combination of the cost itself and a boolean
1442 /// indicating whether any of the contributing operations will actually
1443 /// operate on
1444 /// vector values after type legalization in the backend. If this latter value
1445 /// is
1446 /// false, then all operations will be scalarized (i.e. no vectorization has
1447 /// actually taken place).
1448 using VectorizationCostTy = std::pair<unsigned, bool>;
1449
1450 /// Returns the expected execution cost. The unit of the cost does
1451 /// not matter because we use the 'cost' units to compare different
1452 /// vector widths. The cost that is returned is *not* normalized by
1453 /// the factor width.
1454 VectorizationCostTy expectedCost(ElementCount VF);
1455
1456 /// Returns the execution time cost of an instruction for a given vector
1457 /// width. Vector width of one means scalar.
1458 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1459
1460 /// The cost-computation logic from getInstructionCost which provides
1461 /// the vector type as an output parameter.
1462 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1463
1464 /// Calculate vectorization cost of memory instruction \p I.
1465 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1466
1467 /// The cost computation for scalarized memory instruction.
1468 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1469
1470 /// The cost computation for interleaving group of memory instructions.
1471 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1472
1473 /// The cost computation for Gather/Scatter instruction.
1474 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1475
1476 /// The cost computation for widening instruction \p I with consecutive
1477 /// memory access.
1478 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1479
1480 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1481 /// Load: scalar load + broadcast.
1482 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1483 /// element)
1484 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1485
1486 /// Estimate the overhead of scalarizing an instruction. This is a
1487 /// convenience wrapper for the type-based getScalarizationOverhead API.
1488 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1489
1490 /// Returns whether the instruction is a load or store and will be emitted
1491 /// as a vector operation.
1492 bool isConsecutiveLoadOrStore(Instruction *I);
1493
1494 /// Returns true if an artificially high cost for emulated masked memrefs
1495 /// should be used.
1496 bool useEmulatedMaskMemRefHack(Instruction *I);
1497
1498 /// Map of scalar integer values to the smallest bitwidth they can be legally
1499 /// represented as. The vector equivalents of these values should be truncated
1500 /// to this type.
1501 MapVector<Instruction *, uint64_t> MinBWs;
1502
1503 /// A type representing the costs for instructions if they were to be
1504 /// scalarized rather than vectorized. The entries are Instruction-Cost
1505 /// pairs.
1506 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1507
1508 /// A set containing all BasicBlocks that are known to be present after
1509 /// vectorization as predicated blocks.
1510 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1511
1512 /// Records whether it is allowed to have the original scalar loop execute at
1513 /// least once. This may be needed as a fallback loop in case runtime
1514 /// aliasing/dependence checks fail, or to handle the tail/remainder
1515 /// iterations when the trip count is unknown or doesn't divide by the VF,
1516 /// or as a peel-loop to handle gaps in interleave-groups.
1517 /// Under optsize and when the trip count is very small we don't allow any
1518 /// iterations to execute in the scalar loop.
1519 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1520
1521 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1522 bool FoldTailByMasking = false;
1523
1524 /// A map holding scalar costs for different vectorization factors. The
1525 /// presence of a cost for an instruction in the mapping indicates that the
1526 /// instruction will be scalarized when vectorizing with the associated
1527 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1528 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1529
1530 /// Holds the instructions known to be uniform after vectorization.
1531 /// The data is collected per VF.
1532 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1533
1534 /// Holds the instructions known to be scalar after vectorization.
1535 /// The data is collected per VF.
1536 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1537
1538 /// Holds the instructions (address computations) that are forced to be
1539 /// scalarized.
1540 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1541
1542 /// PHINodes of the reductions that should be expanded in-loop along with
1543 /// their associated chains of reduction operations, in program order from top
1544 /// (PHI) to bottom.
1545 ReductionChainMap InLoopReductionChains;
1546
1547 /// Returns the expected difference in cost from scalarizing the expression
1548 /// feeding a predicated instruction \p PredInst. The instructions to
1549 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1550 /// non-negative return value implies the expression will be scalarized.
1551 /// Currently, only single-use chains are considered for scalarization.
1552 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1553 ElementCount VF);
1554
1555 /// Collect the instructions that are uniform after vectorization. An
1556 /// instruction is uniform if we represent it with a single scalar value in
1557 /// the vectorized loop corresponding to each vector iteration. Examples of
1558 /// uniform instructions include pointer operands of consecutive or
1559 /// interleaved memory accesses. Note that although uniformity implies an
1560 /// instruction will be scalar, the reverse is not true. In general, a
1561 /// scalarized instruction will be represented by VF scalar values in the
1562 /// vectorized loop, each corresponding to an iteration of the original
1563 /// scalar loop.
1564 void collectLoopUniforms(ElementCount VF);
1565
1566 /// Collect the instructions that are scalar after vectorization. An
1567 /// instruction is scalar if it is known to be uniform or will be scalarized
1568 /// during vectorization. Non-uniform scalarized instructions will be
1569 /// represented by VF values in the vectorized loop, each corresponding to an
1570 /// iteration of the original scalar loop.
1571 void collectLoopScalars(ElementCount VF);
1572
1573 /// Keeps cost model vectorization decision and cost for instructions.
1574 /// Right now it is used for memory instructions only.
1575 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1576 std::pair<InstWidening, unsigned>>;
1577
1578 DecisionList WideningDecisions;
1579
1580 /// Returns true if \p V is expected to be vectorized and it needs to be
1581 /// extracted.
1582 bool needsExtract(Value *V, ElementCount VF) const {
1583 Instruction *I = dyn_cast<Instruction>(V);
1584 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1585 TheLoop->isLoopInvariant(I))
1586 return false;
1587
1588 // Assume we can vectorize V (and hence we need extraction) if the
1589 // scalars are not computed yet. This can happen, because it is called
1590 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1591 // the scalars are collected. That should be a safe assumption in most
1592 // cases, because we check if the operands have vectorizable types
1593 // beforehand in LoopVectorizationLegality.
1594 return Scalars.find(VF) == Scalars.end() ||
1595 !isScalarAfterVectorization(I, VF);
1596 };
1597
1598 /// Returns a range containing only operands needing to be extracted.
1599 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1600 ElementCount VF) {
1601 return SmallVector<Value *, 4>(make_filter_range(
1602 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1603 }
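 // Sketch (hypothetical lane expansion): a scalarized user such as
 //   %q = udiv i32 %x, %d        ; %x comes from a widened load
 // becomes, per lane L,
 //   %x.L = extractelement <VF x i32> %x.vec, i32 L
 //   %q.L = udiv i32 %x.L, %d.L
 // needsExtract() flags operands like %x, and filterExtractingOperands() keeps
 // only those so getScalarizationOverhead() can charge the extracts.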
1604
1605public:
1606 /// The loop that we evaluate.
1607 Loop *TheLoop;
1608
1609 /// Predicated scalar evolution analysis.
1610 PredicatedScalarEvolution &PSE;
1611
1612 /// Loop Info analysis.
1613 LoopInfo *LI;
1614
1615 /// Vectorization legality.
1616 LoopVectorizationLegality *Legal;
1617
1618 /// Vector target information.
1619 const TargetTransformInfo &TTI;
1620
1621 /// Target Library Info.
1622 const TargetLibraryInfo *TLI;
1623
1624 /// Demanded bits analysis.
1625 DemandedBits *DB;
1626
1627 /// Assumption cache.
1628 AssumptionCache *AC;
1629
1630 /// Interface to emit optimization remarks.
1631 OptimizationRemarkEmitter *ORE;
1632
1633 const Function *TheFunction;
1634
1635 /// Loop Vectorize Hint.
1636 const LoopVectorizeHints *Hints;
1637
1638 /// The interleave access information contains groups of interleaved accesses
1639 /// with the same stride and close to each other.
1640 InterleavedAccessInfo &InterleaveInfo;
1641
1642 /// Values to ignore in the cost model.
1643 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1644
1645 /// Values to ignore in the cost model when VF > 1.
1646 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1647};
1648
1649} // end namespace llvm
1650
1651// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1652// vectorization. The loop needs to be annotated with #pragma omp simd
1653// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1654// vector length information is not provided, vectorization is not considered
1655// explicit. Interleave hints are not allowed either. These limitations will be
1656// relaxed in the future.
1657// Please, note that we are currently forced to abuse the pragma 'clang
1658// vectorize' semantics. This pragma provides *auto-vectorization hints*
1659// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1660// provides *explicit vectorization hints* (LV can bypass legal checks and
1661// assume that vectorization is legal). However, both hints are implemented
1662// using the same metadata (llvm.loop.vectorize, processed by
1663// LoopVectorizeHints). This will be fixed in the future when the native IR
1664// representation for pragma 'omp simd' is introduced.
1665static bool isExplicitVecOuterLoop(Loop *OuterLp,
1666 OptimizationRemarkEmitter *ORE) {
1667 assert(!OuterLp->isInnermost() && "This is not an outer loop");
1668 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1669
1670 // Only outer loops with an explicit vectorization hint are supported.
1671 // Unannotated outer loops are ignored.
1672 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1673 return false;
1674
1675 Function *Fn = OuterLp->getHeader()->getParent();
1676 if (!Hints.allowVectorization(Fn, OuterLp,
1677 true /*VectorizeOnlyWhenForced*/)) {
1678 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false)
;
1679 return false;
1680 }
1681
1682 if (Hints.getInterleave() > 1) {
1683 // TODO: Interleave support is future work.
1684 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
1685 "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
"outer loops.\n"; } } while (false)
;
1686 Hints.emitRemarkWithHints();
1687 return false;
1688 }
1689
1690 return true;
1691}
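// A hypothetical outer loop that would satisfy the check above (sketch); the
// explicit width hint is what makes the annotated outer loop a candidate:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)        // outer loop, annotated
//     for (int j = 0; j < m; ++j)      // inner loop
//       a[i][j] += b[i][j];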
1692
1693static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1694 OptimizationRemarkEmitter *ORE,
1695 SmallVectorImpl<Loop *> &V) {
1696 // Collect inner loops and outer loops without irreducible control flow. For
1697 // now, only collect outer loops that have explicit vectorization hints. If we
1698 // are stress testing the VPlan H-CFG construction, we collect the outermost
1699 // loop of every loop nest.
1700 if (L.isInnermost() || VPlanBuildStressTest ||
1701 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1702 LoopBlocksRPO RPOT(&L);
1703 RPOT.perform(LI);
1704 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1705 V.push_back(&L);
1706 // TODO: Collect inner loops inside marked outer loops in case
1707 // vectorization fails for the outer loop. Do not invoke
1708 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1709 // already known to be reducible. We can use an inherited attribute for
1710 // that.
1711 return;
1712 }
1713 }
1714 for (Loop *InnerL : L)
1715 collectSupportedLoops(*InnerL, LI, ORE, V);
1716}
1717
1718namespace {
1719
1720/// The LoopVectorize Pass.
1721struct LoopVectorize : public FunctionPass {
1722 /// Pass identification, replacement for typeid
1723 static char ID;
1724
1725 LoopVectorizePass Impl;
1726
1727 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1728 bool VectorizeOnlyWhenForced = false)
1729 : FunctionPass(ID),
1730 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1731 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1732 }
1733
1734 bool runOnFunction(Function &F) override {
1735 if (skipFunction(F))
1736 return false;
1737
1738 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1739 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1740 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1741 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1742 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1743 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1744 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1745 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1746 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1747 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1748 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1749 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1750 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1751
1752 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1753 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1754
1755 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1756 GetLAA, *ORE, PSI).MadeAnyChange;
1757 }
1758
1759 void getAnalysisUsage(AnalysisUsage &AU) const override {
1760 AU.addRequired<AssumptionCacheTracker>();
1761 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1762 AU.addRequired<DominatorTreeWrapperPass>();
1763 AU.addRequired<LoopInfoWrapperPass>();
1764 AU.addRequired<ScalarEvolutionWrapperPass>();
1765 AU.addRequired<TargetTransformInfoWrapperPass>();
1766 AU.addRequired<AAResultsWrapperPass>();
1767 AU.addRequired<LoopAccessLegacyAnalysis>();
1768 AU.addRequired<DemandedBitsWrapperPass>();
1769 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1770 AU.addRequired<InjectTLIMappingsLegacy>();
1771
1772 // We currently do not preserve loopinfo/dominator analyses with outer loop
1773 // vectorization. Until this is addressed, mark these analyses as preserved
1774 // only for non-VPlan-native path.
1775 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1776 if (!EnableVPlanNativePath) {
1777 AU.addPreserved<LoopInfoWrapperPass>();
1778 AU.addPreserved<DominatorTreeWrapperPass>();
1779 }
1780
1781 AU.addPreserved<BasicAAWrapperPass>();
1782 AU.addPreserved<GlobalsAAWrapperPass>();
1783 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1784 }
1785};
1786
1787} // end anonymous namespace
1788
1789//===----------------------------------------------------------------------===//
1790// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1791// LoopVectorizationCostModel and LoopVectorizationPlanner.
1792//===----------------------------------------------------------------------===//
1793
1794Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1795 // We need to place the broadcast of invariant variables outside the loop,
1796 // but only if it's proven safe to do so. Else, the broadcast will be inside
1797 // the vector loop body.
1798 Instruction *Instr = dyn_cast<Instruction>(V);
1799 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1800 (!Instr ||
1801 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1802 // Place the code for broadcasting invariant variables in the new preheader.
1803 IRBuilder<>::InsertPointGuard Guard(Builder);
1804 if (SafeToHoist)
1805 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1806
1807 // Broadcast the scalar into all locations in the vector.
1808 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1809
1810 return Shuf;
1811}
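// Hypothetical shape of the emitted splat for a loop-invariant i32 %v with a
// fixed VF of 4 (sketch; names and exact form follow IRBuilder conventions):
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
// hoisted into the vector preheader when SafeToHoist is true.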
1812
1813void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1814 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1815 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1816 "Expected either an induction phi-node or a truncate of it!");
1817 Value *Start = II.getStartValue();
1818
1819 // Construct the initial value of the vector IV in the vector loop preheader
1820 auto CurrIP = Builder.saveIP();
1821 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1822 if (isa<TruncInst>(EntryVal)) {
1823 assert(Start->getType()->isIntegerTy() &&
1824 "Truncation requires an integer type");
1825 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1826 Step = Builder.CreateTrunc(Step, TruncType);
1827 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1828 }
1829 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1830 Value *SteppedStart =
1831 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1832
1833 // We create vector phi nodes for both integer and floating-point induction
1834 // variables. Here, we determine the kind of arithmetic we will perform.
1835 Instruction::BinaryOps AddOp;
1836 Instruction::BinaryOps MulOp;
1837 if (Step->getType()->isIntegerTy()) {
1838 AddOp = Instruction::Add;
1839 MulOp = Instruction::Mul;
1840 } else {
1841 AddOp = II.getInductionOpcode();
1842 MulOp = Instruction::FMul;
1843 }
1844
1845 // Multiply the vectorization factor by the step using integer or
1846 // floating-point arithmetic as appropriate.
1847 Value *ConstVF =
1848 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1849 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1850
1851 // Create a vector splat to use in the induction update.
1852 //
1853 // FIXME: If the step is non-constant, we create the vector splat with
1854 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1855 // handle a constant vector splat.
1856 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1857 Value *SplatVF = isa<Constant>(Mul)
1858 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1859 : Builder.CreateVectorSplat(VF, Mul);
1860 Builder.restoreIP(CurrIP);
1861
1862 // We may need to add the step a number of times, depending on the unroll
1863 // factor. The last of those goes into the PHI.
1864 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1865 &*LoopVectorBody->getFirstInsertionPt());
1866 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1867 Instruction *LastInduction = VecInd;
1868 for (unsigned Part = 0; Part < UF; ++Part) {
1869 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1870
1871 if (isa<TruncInst>(EntryVal))
1872 addMetadata(LastInduction, EntryVal);
1873 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1874
1875 LastInduction = cast<Instruction>(addFastMathFlag(
1876 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1877 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1878 }
1879
1880 // Move the last step to the end of the latch block. This ensures consistent
1881 // placement of all induction updates.
1882 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1883 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1884 auto *ICmp = cast<Instruction>(Br->getCondition());
1885 LastInduction->moveBefore(ICmp);
1886 LastInduction->setName("vec.ind.next");
1887
1888 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1889 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1890}
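// Shape of the vector IV this builds for a hypothetical i32 induction with
// step 1, VF = 4 and UF = 1 (sketch):
//   vector.ph:
//     ; SteppedStart = splat(Start) + <0, 1, 2, 3> * Step
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ SteppedStart, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
// With UF > 1 each unrolled part gets a further 'step.add' of VF * Step, and
// only the last one feeds the phi from the latch.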
1891
1892bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1893 return Cost->isScalarAfterVectorization(I, VF) ||
1894 Cost->isProfitableToScalarize(I, VF);
1895}
1896
1897bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1898 if (shouldScalarizeInstruction(IV))
1899 return true;
1900 auto isScalarInst = [&](User *U) -> bool {
1901 auto *I = cast<Instruction>(U);
1902 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1903 };
1904 return llvm::any_of(IV->users(), isScalarInst);
1905}
1906
1907void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1908 const InductionDescriptor &ID, const Instruction *EntryVal,
1909 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1910 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1911 "Expected either an induction phi-node or a truncate of it!");
1912
1913 // This induction variable is not the phi from the original loop but the
1914 // newly-created IV based on the proof that the casted Phi is equal to the
1915 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1916 // re-uses the same InductionDescriptor that the original IV uses, but we
1917 // don't have to do any recording in this case - that is done when the
1918 // original IV is processed.
1919 if (isa<TruncInst>(EntryVal))
1920 return;
1921
1922 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1923 if (Casts.empty())
1924 return;
1925 // Only the first Cast instruction in the Casts vector is of interest.
1926 // The rest of the Casts (if they exist) have no uses outside the
1927 // induction update chain itself.
1928 Instruction *CastInst = *Casts.begin();
1929 if (Lane < UINT_MAX)
1930 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1931 else
1932 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1933}
1934
1935void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1936 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1937 "Primary induction variable must have an integer type");
1938
1939 auto II = Legal->getInductionVars().find(IV);
1940 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1941
1942 auto ID = II->second;
1943 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1944
1945 // The value from the original loop to which we are mapping the new induction
1946 // variable.
1947 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1948
1949 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1950
1951 // Generate code for the induction step. Note that induction steps are
1952 // required to be loop-invariant.
1953 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1954 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1955 "Induction step should be loop invariant");
1956 if (PSE.getSE()->isSCEVable(IV->getType())) {
1957 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1958 return Exp.expandCodeFor(Step, Step->getType(),
1959 LoopVectorPreHeader->getTerminator());
1960 }
1961 return cast<SCEVUnknown>(Step)->getValue();
1962 };
1963
1964 // The scalar value to broadcast. This is derived from the canonical
1965 // induction variable. If a truncation type is given, truncate the canonical
1966 // induction variable and step. Otherwise, derive these values from the
1967 // induction descriptor.
1968 auto CreateScalarIV = [&](Value *&Step) -> Value * {
1969 Value *ScalarIV = Induction;
1970 if (IV != OldInduction) {
1971 ScalarIV = IV->getType()->isIntegerTy()
1972 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1973 : Builder.CreateCast(Instruction::SIToFP, Induction,
1974 IV->getType());
1975 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1976 ScalarIV->setName("offset.idx");
1977 }
1978 if (Trunc) {
1979 auto *TruncType = cast<IntegerType>(Trunc->getType());
1980 assert(Step->getType()->isIntegerTy() &&
1981 "Truncation requires an integer step");
1982 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1983 Step = Builder.CreateTrunc(Step, TruncType);
1984 }
1985 return ScalarIV;
1986 };
1987
1988 // Create the vector values from the scalar IV, in the absence of creating a
1989 // vector IV.
1990 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1991 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1992 for (unsigned Part = 0; Part < UF; ++Part) {
1993 assert(!VF.isScalable() && "scalable vectors not yet supported.");
1994 Value *EntryPart =
1995 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1996 ID.getInductionOpcode());
1997 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1998 if (Trunc)
1999 addMetadata(EntryPart, Trunc);
2000 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2001 }
2002 };
2003
2004 // Now do the actual transformations, and start with creating the step value.
2005 Value *Step = CreateStepValue(ID.getStep());
2006 if (VF.isZero() || VF.isScalar()) {
2007 Value *ScalarIV = CreateScalarIV(Step);
2008 CreateSplatIV(ScalarIV, Step);
2009 return;
2010 }
2011
2012 // Determine if we want a scalar version of the induction variable. This is
2013 // true if the induction variable itself is not widened, or if it has at
2014 // least one user in the loop that is not widened.
2015 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2016 if (!NeedsScalarIV) {
2017 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2018 return;
2019 }
2020
2021 // Try to create a new independent vector induction variable. If we can't
2022 // create the phi node, we will splat the scalar induction variable in each
2023 // loop iteration.
2024 if (!shouldScalarizeInstruction(EntryVal)) {
2025 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2026 Value *ScalarIV = CreateScalarIV(Step);
2027 // Create scalar steps that can be used by instructions we will later
2028 // scalarize. Note that the addition of the scalar steps will not increase
2029 // the number of instructions in the loop in the common case prior to
2030 // InstCombine. We will be trading one vector extract for each scalar step.
2031 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2032 return;
2033 }
2034
2035 // All IV users are scalar instructions, so only emit a scalar IV, not a
2036 // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2037 // predicate used by the masked loads/stores.
2038 Value *ScalarIV = CreateScalarIV(Step);
2039 if (!Cost->isScalarEpilogueAllowed())
2040 CreateSplatIV(ScalarIV, Step);
2041 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2042}
2043
2044Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2045 Instruction::BinaryOps BinOp) {
2046 // Create and check the types.
2047 auto *ValVTy = cast<FixedVectorType>(Val->getType());
2048 int VLen = ValVTy->getNumElements();
2049
2050 Type *STy = Val->getType()->getScalarType();
2051 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2052 "Induction Step must be an integer or FP");
2053 assert(Step->getType() == STy && "Step has wrong type");
2054
2055 SmallVector<Constant *, 8> Indices;
2056
2057 if (STy->isIntegerTy()) {
2058 // Create a vector of consecutive numbers from zero to VF.
2059 for (int i = 0; i < VLen; ++i)
2060 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2061
2062 // Add the consecutive indices to the vector value.
2063 Constant *Cv = ConstantVector::get(Indices);
2064 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2065 Step = Builder.CreateVectorSplat(VLen, Step);
2066 assert(Step->getType() == Val->getType() && "Invalid step vec");
2067 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2068 // which can be found from the original scalar operations.
2069 Step = Builder.CreateMul(Cv, Step);
2070 return Builder.CreateAdd(Val, Step, "induction");
2071 }
2072
2073 // Floating point induction.
2074 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2075 "Binary Opcode should be specified for FP induction");
2076 // Create a vector of consecutive numbers from zero to VF.
2077 for (int i = 0; i < VLen; ++i)
2078 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2079
2080 // Add the consecutive indices to the vector value.
2081 Constant *Cv = ConstantVector::get(Indices);
2082
2083 Step = Builder.CreateVectorSplat(VLen, Step);
2084
2085 // Floating point operations had to be 'fast' to enable the induction.
2086 FastMathFlags Flags;
2087 Flags.setFast();
2088
2089 Value *MulOp = Builder.CreateFMul(Cv, Step);
2090 if (isa<Instruction>(MulOp))
2091 // Have to check, MulOp may be a constant
2092 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2093
2094 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2095 if (isa<Instruction>(BOp))
2096 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2097 return BOp;
2098}
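// Worked example (sketch, hypothetical fixed VF of 4): for an integer Val,
// StartIdx = 0 and Step = 2 the result is
//   Val + <0, 1, 2, 3> * splat(2)  ==  Val + <0, 2, 4, 6>
// i.e. lane L of the returned vector holds Val[L] + (StartIdx + L) * Step.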
2099
2100void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2101 Instruction *EntryVal,
2102 const InductionDescriptor &ID) {
2103 // We shouldn't have to build scalar steps if we aren't vectorizing.
2104 assert(VF.isVector() && "VF should be greater than one");
2105 assert(!VF.isScalable() &&
2106 "the code below assumes a fixed number of elements at compile time");
2107 // Get the value type and ensure it and the step have the same integer type.
2108 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2109 assert(ScalarIVTy == Step->getType() &&
2110 "Val and Step should have the same type");
2111
2112 // We build scalar steps for both integer and floating-point induction
2113 // variables. Here, we determine the kind of arithmetic we will perform.
2114 Instruction::BinaryOps AddOp;
2115 Instruction::BinaryOps MulOp;
2116 if (ScalarIVTy->isIntegerTy()) {
2117 AddOp = Instruction::Add;
2118 MulOp = Instruction::Mul;
2119 } else {
2120 AddOp = ID.getInductionOpcode();
2121 MulOp = Instruction::FMul;
2122 }
2123
2124 // Determine the number of scalars we need to generate for each unroll
2125 // iteration. If EntryVal is uniform, we only need to generate the first
2126 // lane. Otherwise, we generate all VF values.
2127 unsigned Lanes =
2128 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2129 ? 1
2130 : VF.getKnownMinValue();
2131 // Compute the scalar steps and save the results in VectorLoopValueMap.
2132 for (unsigned Part = 0; Part < UF; ++Part) {
2133 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2134 auto *StartIdx = getSignedIntOrFpConstant(
2135 ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2136 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2137 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2138 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2139 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2140 }
2141 }
2142}
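// Editorial illustration (not part of the original source): with UF = 2,
// VF = 4 and a non-uniform EntryVal, the nested loops above emit eight
// scalars of the form ScalarIV + (Part * 4 + Lane) * Step (using the FP
// opcodes from the induction descriptor for float inductions) and record
// each one in VectorLoopValueMap under its (Part, Lane) key.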
2143
2144Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2145 assert(V != Induction && "The new induction variable should not be used.");
2146 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2147 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2148
2149 // If we have a stride that is replaced by one, do it here. Defer this for
2150 // the VPlan-native path until we start running Legal checks in that path.
2151 if (!EnableVPlanNativePath && Legal->hasStride(V))
2152 V = ConstantInt::get(V->getType(), 1);
2153
2154 // If we have a vector mapped to this value, return it.
2155 if (VectorLoopValueMap.hasVectorValue(V, Part))
2156 return VectorLoopValueMap.getVectorValue(V, Part);
2157
2158 // If the value has not been vectorized, check if it has been scalarized
2159 // instead. If it has been scalarized, and we actually need the value in
2160 // vector form, we will construct the vector values on demand.
2161 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2162 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2163
2164 // If we've scalarized a value, that value should be an instruction.
2165 auto *I = cast<Instruction>(V);
2166
2167 // If we aren't vectorizing, we can just copy the scalar map values over to
2168 // the vector map.
2169 if (VF == 1) {
2170 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2171 return ScalarValue;
2172 }
2173
2174 // Get the last scalar instruction we generated for V and Part. If the value
2175 // is known to be uniform after vectorization, this corresponds to lane zero
2176 // of the Part unroll iteration. Otherwise, the last instruction is the one
2177 // we created for the last vector lane of the Part unroll iteration.
2178 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2179 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2180 ? 0
2181 : VF.getKnownMinValue() - 1;
2182 auto *LastInst = cast<Instruction>(
2183 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2184
2185 // Set the insert point after the last scalarized instruction. This ensures
2186 // the insertelement sequence will directly follow the scalar definitions.
2187 auto OldIP = Builder.saveIP();
2188 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2189 Builder.SetInsertPoint(&*NewIP);
2190
2191 // However, if we are vectorizing, we need to construct the vector values.
2192 // If the value is known to be uniform after vectorization, we can just
2193 // broadcast the scalar value corresponding to lane zero for each unroll
2194 // iteration. Otherwise, we construct the vector values using insertelement
2195 // instructions. Since the resulting vectors are stored in
2196 // VectorLoopValueMap, we will only generate the insertelements once.
2197 Value *VectorValue = nullptr;
2198 if (Cost->isUniformAfterVectorization(I, VF)) {
2199 VectorValue = getBroadcastInstrs(ScalarValue);
2200 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2201 } else {
2202 // Initialize packing with insertelements to start from undef.
2203 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2204 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2205 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2206 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2207 packScalarIntoVectorValue(V, {Part, Lane});
2208 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2209 }
2210 Builder.restoreIP(OldIP);
2211 return VectorValue;
2212 }
2213
2214 // If this scalar is unknown, assume that it is a constant or that it is
2215 // loop invariant. Broadcast V and save the value for future uses.
2216 Value *B = getBroadcastInstrs(V);
2217 VectorLoopValueMap.setVectorValue(V, Part, B);
2218 return B;
2219}
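// Editorial illustration (not part of the original source): for a scalarized,
// non-uniform value with VF = 4, the on-demand packing above produces a chain
// such as
//   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
//   ... (up to lane 3)
// whereas a value that is uniform after vectorization is simply broadcast
// from its lane-zero scalar. The element type i32 is only an example.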
2220
2221Value *
2222InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2223 const VPIteration &Instance) {
2224 // If the value is not an instruction contained in the loop, it should
2225 // already be scalar.
2226 if (OrigLoop->isLoopInvariant(V))
2227 return V;
2228
2229 assert(Instance.Lane > 0
2230 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2231 : true && "Uniform values only have lane zero");
2232
2233 // If the value from the original loop has not been vectorized, it is
2234 // represented by UF x VF scalar values in the new loop. Return the requested
2235 // scalar value.
2236 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2237 return VectorLoopValueMap.getScalarValue(V, Instance);
2238
2239 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2240 // for the given unroll part. If this entry is not a vector type (i.e., the
2241 // vectorization factor is one), there is no need to generate an
2242 // extractelement instruction.
2243 auto *U = getOrCreateVectorValue(V, Instance.Part);
2244 if (!U->getType()->isVectorTy()) {
2245 assert(VF == 1 && "Value not scalarized has non-vector type");
2246 return U;
2247 }
2248
2249 // Otherwise, the value from the original loop has been vectorized and is
2250 // represented by UF vector values. Extract and return the requested scalar
2251 // value from the appropriate vector lane.
2252 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2253}
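// Editorial illustration (not part of the original source): if V only has a
// vector representation, a request for lane 2 of part 0 with VF = 4 becomes
// an extract from that vector, e.g.
//   %scalar = extractelement <4 x i32> %v.part0, i32 2
// where the i32 element type is only an example.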
2254
2255void InnerLoopVectorizer::packScalarIntoVectorValue(
2256 Value *V, const VPIteration &Instance) {
2257 assert(V != Induction && "The new induction variable should not be used.");
2258 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2259 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2260
2261 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2262 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2263 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2264 Builder.getInt32(Instance.Lane));
2265 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2266}
2267
2268Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2269 assert(Vec->getType()->isVectorTy() && "Invalid type");
2270 assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2271 SmallVector<int, 8> ShuffleMask;
2272 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2273 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2274
2275 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2276}
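// Editorial illustration (not part of the original source): for VF = 4 the
// mask built above is <3, 2, 1, 0>, so the emitted shuffle is roughly
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// with the i32 element type chosen only for the example.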
2277
2278// Return whether we allow using masked interleave-groups (for dealing with
2279// strided loads/stores that reside in predicated blocks, or for dealing
2280// with gaps).
2281static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2282 // If an override option has been passed in for interleaved accesses, use it.
2283 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2284 return EnableMaskedInterleavedMemAccesses;
2285
2286 return TTI.enableMaskedInterleavedAccessVectorization();
2287}
2288
2289// Try to vectorize the interleave group that \p Instr belongs to.
2290//
2291// E.g. Translate following interleaved load group (factor = 3):
2292// for (i = 0; i < N; i+=3) {
2293// R = Pic[i]; // Member of index 0
2294// G = Pic[i+1]; // Member of index 1
2295// B = Pic[i+2]; // Member of index 2
2296// ... // do something to R, G, B
2297// }
2298// To:
2299// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2300// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2301// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2302// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2303//
2304// Or translate following interleaved store group (factor = 3):
2305// for (i = 0; i < N; i+=3) {
2306// ... do something to R, G, B
2307// Pic[i] = R; // Member of index 0
2308// Pic[i+1] = G; // Member of index 1
2309// Pic[i+2] = B; // Member of index 2
2310// }
2311// To:
2312// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2313// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2314// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2315// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2316// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2317void InnerLoopVectorizer::vectorizeInterleaveGroup(
2318 const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2319 VPValue *Addr, VPValue *BlockInMask) {
2320 Instruction *Instr = Group->getInsertPos();
2321 const DataLayout &DL = Instr->getModule()->getDataLayout();
2322
2323 // Prepare for the vector type of the interleaved load/store.
2324 Type *ScalarTy = getMemInstValueType(Instr);
2325 unsigned InterleaveFactor = Group->getFactor();
2326 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2327 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2328
2329 // Prepare for the new pointers.
2330 SmallVector<Value *, 2> AddrParts;
2331 unsigned Index = Group->getIndex(Instr);
2332
2333 // TODO: extend the masked interleaved-group support to reversed access.
2334 assert((!BlockInMask || !Group->isReverse()) &&
2335 "Reversed masked interleave-group not supported.");
2336
2337 // If the group is reverse, adjust the index to refer to the last vector lane
2338 // instead of the first. We adjust the index from the first vector lane,
2339 // rather than directly getting the pointer for lane VF - 1, because the
2340 // pointer operand of the interleaved access is supposed to be uniform. For
2341 // uniform instructions, we're only required to generate a value for the
2342 // first vector lane in each unroll iteration.
2343 assert(!VF.isScalable() &&
2344 "scalable vector reverse operation is not implemented");
2345 if (Group->isReverse())
2346 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2347
2348 for (unsigned Part = 0; Part < UF; Part++) {
2349 Value *AddrPart = State.get(Addr, {Part, 0});
2350 setDebugLocFromInst(Builder, AddrPart);
2351
2352 // Notice current instruction could be any index. Need to adjust the address
2353 // to the member of index 0.
2354 //
2355 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2356 // b = A[i]; // Member of index 0
2357 // Current pointer is pointed to A[i+1], adjust it to A[i].
2358 //
2359 // E.g. A[i+1] = a; // Member of index 1
2360 // A[i] = b; // Member of index 0
2361 // A[i+2] = c; // Member of index 2 (Current instruction)
2362 // Current pointer is pointed to A[i+2], adjust it to A[i].
2363
2364 bool InBounds = false;
2365 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2366 InBounds = gep->isInBounds();
2367 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2368 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2369
2370 // Cast to the vector pointer type.
2371 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2372 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2373 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2374 }
2375
2376 setDebugLocFromInst(Builder, Instr);
2377 Value *UndefVec = UndefValue::get(VecTy);
2378
2379 Value *MaskForGaps = nullptr;
2380 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2381 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2382 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2383 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2384 }
2385
2386 // Vectorize the interleaved load group.
2387 if (isa<LoadInst>(Instr)) {
2388 // For each unroll part, create a wide load for the group.
2389 SmallVector<Value *, 2> NewLoads;
2390 for (unsigned Part = 0; Part < UF; Part++) {
2391 Instruction *NewLoad;
2392 if (BlockInMask || MaskForGaps) {
2393 assert(useMaskedInterleavedAccesses(*TTI) &&
2394 "masked interleaved groups are not allowed.");
2395 Value *GroupMask = MaskForGaps;
2396 if (BlockInMask) {
2397 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2398 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2399 Value *ShuffledMask = Builder.CreateShuffleVector(
2400 BlockInMaskPart,
2401 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2402 "interleaved.mask");
2403 GroupMask = MaskForGaps
2404 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2405 MaskForGaps)
2406 : ShuffledMask;
2407 }
2408 NewLoad =
2409 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2410 GroupMask, UndefVec, "wide.masked.vec");
2411 }
2412 else
2413 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2414 Group->getAlign(), "wide.vec");
2415 Group->addMetadata(NewLoad);
2416 NewLoads.push_back(NewLoad);
2417 }
2418
2419 // For each member in the group, shuffle out the appropriate data from the
2420 // wide loads.
2421 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2422 Instruction *Member = Group->getMember(I);
2423
2424 // Skip the gaps in the group.
2425 if (!Member)
2426 continue;
2427
2428 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2429 auto StrideMask =
2430 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2431 for (unsigned Part = 0; Part < UF; Part++) {
2432 Value *StridedVec = Builder.CreateShuffleVector(
2433 NewLoads[Part], StrideMask, "strided.vec");
2434
2435 // If this member has different type, cast the result type.
2436 if (Member->getType() != ScalarTy) {
2437 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2438 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2439 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2440 }
2441
2442 if (Group->isReverse())
2443 StridedVec = reverseVector(StridedVec);
2444
2445 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2446 }
2447 }
2448 return;
2449 }
2450
2451 // The sub vector type for current instruction.
2452 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2453 auto *SubVT = VectorType::get(ScalarTy, VF);
2454
2455 // Vectorize the interleaved store group.
2456 for (unsigned Part = 0; Part < UF; Part++) {
2457 // Collect the stored vector from each member.
2458 SmallVector<Value *, 4> StoredVecs;
2459 for (unsigned i = 0; i < InterleaveFactor; i++) {
2460 // Interleaved store group doesn't allow a gap, so each index has a member
2461 Instruction *Member = Group->getMember(i);
2462 assert(Member && "Fail to get a member from an interleaved store group");
2463
2464 Value *StoredVec = getOrCreateVectorValue(
2465 cast<StoreInst>(Member)->getValueOperand(), Part);
2466 if (Group->isReverse())
2467 StoredVec = reverseVector(StoredVec);
2468
2469 // If this member has different type, cast it to a unified type.
2470
2471 if (StoredVec->getType() != SubVT)
2472 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2473
2474 StoredVecs.push_back(StoredVec);
2475 }
2476
2477 // Concatenate all vectors into a wide vector.
2478 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2479
2480 // Interleave the elements in the wide vector.
2481 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2482 Value *IVec = Builder.CreateShuffleVector(
2483 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2484 "interleaved.vec");
2485
2486 Instruction *NewStoreInstr;
2487 if (BlockInMask) {
2488 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2489 Value *ShuffledMask = Builder.CreateShuffleVector(
2490 BlockInMaskPart,
2491 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2492 "interleaved.mask");
2493 NewStoreInstr = Builder.CreateMaskedStore(
2494 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2495 }
2496 else
2497 NewStoreInstr =
2498 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2499
2500 Group->addMetadata(NewStoreInstr);
2501 }
2502}
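// Editorial illustration (not part of the original source): for a masked
// group with factor 3 and VF = 4, the replicated mask used above expands a
// per-iteration mask <m0, m1, m2, m3> into
//   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
// so that every member of a tuple is loaded or stored under the predicate of
// its originating iteration.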
2503
2504void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2505 VPTransformState &State,
2506 VPValue *Addr,
2507 VPValue *StoredValue,
2508 VPValue *BlockInMask) {
2509 // Attempt to issue a wide load.
2510 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2511 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2512
2513 assert((LI || SI) && "Invalid Load/Store instruction");
2514 assert((!SI || StoredValue) && "No stored value provided for widened store");
2515 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2516
2517 LoopVectorizationCostModel::InstWidening Decision =
2518 Cost->getWideningDecision(Instr, VF);
2519 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2520 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2521 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2522 "CM decision is not to widen the memory instruction");
2523
2524 Type *ScalarDataTy = getMemInstValueType(Instr);
2525
2526 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2527 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2528 const Align Alignment = getLoadStoreAlignment(Instr);
2529
2530 // Determine if the pointer operand of the access is either consecutive or
2531 // reverse consecutive.
2532 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2533 bool ConsecutiveStride =
2534 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2535 bool CreateGatherScatter =
2536 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2537
2538 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2539 // gather/scatter. Otherwise Decision should have been to Scalarize.
2540 assert((ConsecutiveStride || CreateGatherScatter) &&
2541 "The instruction should be scalarized");
2542 (void)ConsecutiveStride;
2543
2544 VectorParts BlockInMaskParts(UF);
2545 bool isMaskRequired = BlockInMask;
2546 if (isMaskRequired)
2547 for (unsigned Part = 0; Part < UF; ++Part)
2548 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2549
2550 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2551 // Calculate the pointer for the specific unroll-part.
2552 GetElementPtrInst *PartPtr = nullptr;
2553
2554 bool InBounds = false;
2555 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2556 InBounds = gep->isInBounds();
2557
2558 if (Reverse) {
2559 // If the address is consecutive but reversed, then the
2560 // wide store needs to start at the last vector element.
2561 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2562 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2563 PartPtr->setIsInBounds(InBounds);
2564 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2565 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2566 PartPtr->setIsInBounds(InBounds);
2567 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2568 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2569 } else {
2570 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2571 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2572 PartPtr->setIsInBounds(InBounds);
2573 }
2574
2575 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2576 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2577 };
2578
2579 // Handle Stores:
2580 if (SI) {
2581 setDebugLocFromInst(Builder, SI);
2582
2583 for (unsigned Part = 0; Part < UF; ++Part) {
2584 Instruction *NewSI = nullptr;
2585 Value *StoredVal = State.get(StoredValue, Part);
2586 if (CreateGatherScatter) {
2587 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2588 Value *VectorGep = State.get(Addr, Part);
2589 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2590 MaskPart);
2591 } else {
2592 if (Reverse) {
2593 // If we store to reverse consecutive memory locations, then we need
2594 // to reverse the order of elements in the stored value.
2595 StoredVal = reverseVector(StoredVal);
2596 // We don't want to update the value in the map as it might be used in
2597 // another expression. So don't call resetVectorValue(StoredVal).
2598 }
2599 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2600 if (isMaskRequired)
2601 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2602 BlockInMaskParts[Part]);
2603 else
2604 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2605 }
2606 addMetadata(NewSI, SI);
2607 }
2608 return;
2609 }
2610
2611 // Handle loads.
2612 assert(LI && "Must have a load instruction");
2613 setDebugLocFromInst(Builder, LI);
2614 for (unsigned Part = 0; Part < UF; ++Part) {
2615 Value *NewLI;
2616 if (CreateGatherScatter) {
2617 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2618 Value *VectorGep = State.get(Addr, Part);
2619 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2620 nullptr, "wide.masked.gather");
2621 addMetadata(NewLI, LI);
2622 } else {
2623 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2624 if (isMaskRequired)
2625 NewLI = Builder.CreateMaskedLoad(
2626 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2627 "wide.masked.load");
2628 else
2629 NewLI =
2630 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2631
2632 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2633 addMetadata(NewLI, LI);
2634 if (Reverse)
2635 NewLI = reverseVector(NewLI);
2636 }
2637 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2638 }
2639}
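// Editorial illustration (not part of the original source): for a reverse
// consecutive access with VF = 4 and UF = 2, CreateVecPtr above offsets the
// pointer by -Part * 4 and then by 1 - 4, so part 0 covers elements
// [-3 .. 0] and part 1 covers elements [-7 .. -4] relative to the scalar
// pointer; the loaded vectors (and any mask) are then reversed.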
2640
2641void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2642 const VPIteration &Instance,
2643 bool IfPredicateInstr,
2644 VPTransformState &State) {
2645 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2646
2647 setDebugLocFromInst(Builder, Instr);
2648
2649 // Does this instruction return a value ?
2650 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2651
2652 Instruction *Cloned = Instr->clone();
2653 if (!IsVoidRetTy)
2654 Cloned->setName(Instr->getName() + ".cloned");
2655
2656 // Replace the operands of the cloned instructions with their scalar
2657 // equivalents in the new loop.
2658 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2659 auto *NewOp = State.get(User.getOperand(op), Instance);
2660 Cloned->setOperand(op, NewOp);
2661 }
2662 addNewMetadata(Cloned, Instr);
2663
2664 // Place the cloned scalar in the new loop.
2665 Builder.Insert(Cloned);
2666
2667 // Add the cloned scalar to the scalar map entry.
2668 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2669
2670 // If we just cloned a new assumption, add it the assumption cache.
2671 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2672 if (II->getIntrinsicID() == Intrinsic::assume)
2673 AC->registerAssumption(II);
2674
2675 // End if-block.
2676 if (IfPredicateInstr)
2677 PredicatedInstructions.push_back(Cloned);
2678}
2679
2680PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2681 Value *End, Value *Step,
2682 Instruction *DL) {
2683 BasicBlock *Header = L->getHeader();
2684 BasicBlock *Latch = L->getLoopLatch();
2685 // As we're just creating this loop, it's possible no latch exists
2686 // yet. If so, use the header as this will be a single block loop.
2687 if (!Latch)
2688 Latch = Header;
2689
2690 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2691 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2692 setDebugLocFromInst(Builder, OldInst);
2693 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2694
2695 Builder.SetInsertPoint(Latch->getTerminator());
2696 setDebugLocFromInst(Builder, OldInst);
2697
2698 // Create i+1 and fill the PHINode.
2699 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2700 Induction->addIncoming(Start, L->getLoopPreheader());
2701 Induction->addIncoming(Next, Latch);
2702 // Create the compare.
2703 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2704 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2705
2706 // Now we have two terminators. Remove the old one from the block.
2707 Latch->getTerminator()->eraseFromParent();
2708
2709 return Induction;
2710}
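// Editorial illustration (not part of the original source): the helper above
// produces the canonical induction skeleton, roughly
//   %index      = phi [ Start, preheader ], [ %index.next, latch ]
//   %index.next = add %index, Step
//   %cmp        = icmp eq %index.next, End
//   br i1 %cmp, label %exit, label %header
// with the block and value names here chosen only for illustration.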
2711
2712Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2713 if (TripCount)
2714 return TripCount;
2715
2716 assert(L && "Create Trip Count for null loop.");
2717 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2718 // Find the loop boundaries.
2719 ScalarEvolution *SE = PSE.getSE();
2720 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2721 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2722 "Invalid loop count");
2723
2724 Type *IdxTy = Legal->getWidestInductionType();
2725 assert(IdxTy && "No type for induction");
2726
2727 // The exit count might have the type of i64 while the phi is i32. This can
2728 // happen if we have an induction variable that is sign extended before the
2729 // compare. The only way that we get a backedge taken count is that the
2730 // induction variable was signed and as such will not overflow. In such a case
2731 // truncation is legal.
2732 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2733 IdxTy->getPrimitiveSizeInBits())
2734 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2735 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2736
2737 // Get the total trip count from the count by adding 1.
2738 const SCEV *ExitCount = SE->getAddExpr(
2739 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2740
2741 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2742
2743 // Expand the trip count and place the new instructions in the preheader.
2744 // Notice that the pre-header does not change, only the loop body.
2745 SCEVExpander Exp(*SE, DL, "induction");
2746
2747 // Count holds the overall loop count (N).
2748 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2749 L->getLoopPreheader()->getTerminator());
2750
2751 if (TripCount->getType()->isPointerTy())
2752 TripCount =
2753 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2754 L->getLoopPreheader()->getTerminator());
2755
2756 return TripCount;
2757}
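// Editorial illustration (not part of the original source): for a loop such
// as `for (i = 0; i < n; ++i)` with n > 0, the backedge-taken count is n - 1,
// so the trip count expanded above is (n - 1) + 1 = n, truncated or
// zero-extended to the widest induction type as needed.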
2758
2759Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2760 if (VectorTripCount)
2761 return VectorTripCount;
2762
2763 Value *TC = getOrCreateTripCount(L);
2764 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2765
2766 Type *Ty = TC->getType();
2767 // This is where we can make the step a runtime constant.
2768 assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2769 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2770
2771 // If the tail is to be folded by masking, round the number of iterations N
2772 // up to a multiple of Step instead of rounding down. This is done by first
2773 // adding Step-1 and then rounding down. Note that it's ok if this addition
2774 // overflows: the vector induction variable will eventually wrap to zero given
2775 // that it starts at zero and its Step is a power of two; the loop will then
2776 // exit, with the last early-exit vector comparison also producing all-true.
2777 if (Cost->foldTailByMasking()) {
2778 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2779 "VF*UF must be a power of 2 when folding tail by masking");
2780 TC = Builder.CreateAdd(
2781 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2782 }
2783
2784 // Now we need to generate the expression for the part of the loop that the
2785 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2786 // iterations are not required for correctness, or N - Step, otherwise. Step
2787 // is equal to the vectorization factor (number of SIMD elements) times the
2788 // unroll factor (number of SIMD instructions).
2789 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2790
2791 // If there is a non-reversed interleaved group that may speculatively access
2792 // memory out-of-bounds, we need to ensure that there will be at least one
2793 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2794 // the trip count, we set the remainder to be equal to the step. If the step
2795 // does not evenly divide the trip count, no adjustment is necessary since
2796 // there will already be scalar iterations. Note that the minimum iterations
2797 // check ensures that N >= Step.
2798 if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2799 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2800 R = Builder.CreateSelect(IsZero, Step, R);
2801 }
2802
2803 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2804
2805 return VectorTripCount;
2806}
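// Editorial illustration (not part of the original source): with VF = 4 and
// UF = 2 (Step = 8) and a trip count of 21, n.mod.vf is 5 and n.vec is 16.
// If the tail is folded by masking, the count is first rounded up
// (21 + 7 = 28), giving n.mod.vf = 4 and n.vec = 24, so the masked vector
// loop covers all 21 iterations.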
2807
2808Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2809 const DataLayout &DL) {
2810 // Verify that V is a vector type with same number of elements as DstVTy.
2811 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2812 unsigned VF = DstFVTy->getNumElements();
2813 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2814 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2815 Type *SrcElemTy = SrcVecTy->getElementType();
2816 Type *DstElemTy = DstFVTy->getElementType();
2817 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2818 "Vector elements must have same size");
2819
2820 // Do a direct cast if element types are castable.
2821 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2822 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2823 }
2824 // V cannot be directly casted to desired vector type.
2825 // May happen when V is a floating point vector but DstVTy is a vector of
2826 // pointers or vice-versa. Handle this using a two-step bitcast using an
2827 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2828 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2829 "Only one type should be a pointer type");
2830 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2831 "Only one type should be a floating point type");
2832 Type *IntTy =
2833 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2834 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2835 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2836 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2837}
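// Editorial illustration (not part of the original source): casting
// <4 x float> to <4 x i8*> is not a single bitcast, so the fallback above
// goes through an integer vector of the source element width, e.g.
//   <4 x float> -> <4 x i32> -> <4 x i8*>
// assuming 32-bit pointers for the sake of the example.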
2838
2839void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2840 BasicBlock *Bypass) {
2841 Value *Count = getOrCreateTripCount(L);
2842 // Reuse existing vector loop preheader for TC checks.
2843 // Note that new preheader block is generated for vector loop.
2844 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2845 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2846
2847 // Generate code to check if the loop's trip count is less than VF * UF, or
2848 // equal to it in case a scalar epilogue is required; this implies that the
2849 // vector trip count is zero. This check also covers the case where adding one
2850 // to the backedge-taken count overflowed leading to an incorrect trip count
2851 // of zero. In this case we will also jump to the scalar loop.
2852 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2853 : ICmpInst::ICMP_ULT;
2854
2855 // If tail is to be folded, vector loop takes care of all iterations.
2856 Value *CheckMinIters = Builder.getFalse();
2857 if (!Cost->foldTailByMasking()) {
2858 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2859 CheckMinIters = Builder.CreateICmp(
2860 P, Count,
2861 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2862 "min.iters.check");
2863 }
2864 // Create new preheader for vector loop.
2865 LoopVectorPreHeader =
2866 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2867 "vector.ph");
2868
2869 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2870 DT->getNode(Bypass)->getIDom()) &&
2871 "TC check is expected to dominate Bypass");
2872
2873 // Update dominator for Bypass & LoopExit.
2874 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2875 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2876
2877 ReplaceInstWithInst(
2878 TCCheckBlock->getTerminator(),
2879 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2880 LoopBypassBlocks.push_back(TCCheckBlock);
2881}
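// A minimal sketch of the check emitted above (value and block names are
// assumptions for illustration, not taken from this report). With VF = 4,
// UF = 2, no tail folding and no required scalar epilogue:
//
//   ; in TCCheckBlock (the reused vector loop preheader)
//   %min.iters.check = icmp ult i64 %trip.count, 8    ; 8 == VF * UF
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
//
// When a scalar epilogue is required the predicate is ule instead, so a trip
// count exactly equal to VF * UF also bypasses the vector loop.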
2882
2883void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2884 // Reuse existing vector loop preheader for SCEV checks.
2885 // Note that new preheader block is generated for vector loop.
2886 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2887
2888 // Generate the code to check the SCEV assumptions that we made.
2889 // We want the new basic block to start at the first instruction in a
2890 // sequence of instructions that form a check.
2891 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2892 "scev.check");
2893 Value *SCEVCheck = Exp.expandCodeForPredicate(
2894 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2895
2896 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2897 if (C->isZero())
2898 return;
2899
2900 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2901 (OptForSizeBasedOnProfile &&
2902 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2903 "Cannot SCEV check stride or overflow when optimizing for size");
2904
2905 SCEVCheckBlock->setName("vector.scevcheck");
2906 // Create new preheader for vector loop.
2907 LoopVectorPreHeader =
2908 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2909 nullptr, "vector.ph");
2910
2911 // Update dominator only if this is first RT check.
2912 if (LoopBypassBlocks.empty()) {
2913 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2914 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2915 }
2916
2917 ReplaceInstWithInst(
2918 SCEVCheckBlock->getTerminator(),
2919 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2920 LoopBypassBlocks.push_back(SCEVCheckBlock);
2921 AddedSafetyChecks = true;
2922}
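// Illustrative example of when this check fires (an assumption, not spelled
// out in this report): if vectorization was analyzed under a SCEV predicate
// such as "runtime stride %s == 1", the expanded SCEVCheck is true whenever
// that assumption fails at run time, and the branch built above then takes
// the Bypass edge into the scalar loop. A check that folds to constant false
// is dropped entirely by the early return.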
2923
2924void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2925 // VPlan-native path does not do any analysis for runtime checks currently.
2926 if (EnableVPlanNativePath)
2927 return;
2928
2929 // Reuse existing vector loop preheader for runtime memory checks.
2930 // Note that new preheader block is generated for vector loop.
2931 BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2932
2933 // Generate the code that checks in runtime if arrays overlap. We put the
2934 // checks into a separate block to make the more common case of few elements
2935 // faster.
2936 auto *LAI = Legal->getLAI();
2937 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2938 if (!RtPtrChecking.Need)
2939 return;
2940
2941 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2942 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2943 "Cannot emit memory checks when optimizing for size, unless forced "
2944 "to vectorize.");
2945 ORE->emit([&]() {
2946 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2947 L->getStartLoc(), L->getHeader())
2948 << "Code-size may be reduced by not forcing "
2949 "vectorization, or by source-code modifications "
2950 "eliminating the need for runtime checks "
2951 "(e.g., adding 'restrict').";
2952 });
2953 }
2954
2955 MemCheckBlock->setName("vector.memcheck");
2956 // Create new preheader for vector loop.
2957 LoopVectorPreHeader =
2958 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2959 "vector.ph");
2960
2961 auto *CondBranch = cast<BranchInst>(
2962 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
2963 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
2964 LoopBypassBlocks.push_back(MemCheckBlock);
2965 AddedSafetyChecks = true;
2966
2967 // Update dominator only if this is first RT check.
2968 if (LoopBypassBlocks.empty()) {
2969 DT->changeImmediateDominator(Bypass, MemCheckBlock);
2970 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2971 }
2972
2973 Instruction *FirstCheckInst;
2974 Instruction *MemRuntimeCheck;
2975 std::tie(FirstCheckInst, MemRuntimeCheck) =
2976 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2977 RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2978 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2979 "claimed checks are required");
2980 CondBranch->setCondition(MemRuntimeCheck);
2981
2982 // We currently don't use LoopVersioning for the actual loop cloning but we
2983 // still use it to add the noalias metadata.
2984 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2985 PSE.getSE());
2986 LVer->prepareNoAliasMetadata();
2987}
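// Conceptual sketch of one emitted overlap check (the real checks are built
// by addRuntimeChecks from the pointer groups in RtPtrChecking; names here
// are illustrative). For two pointers A and B accessed over the vectorized
// iterations, the "may conflict" condition is roughly
//
//   %conflict = !(A.end <= B.start || B.end <= A.start)
//
// and the per-pair results are combined into MemRuntimeCheck; if it is true,
// the conditional branch above takes the Bypass edge to the scalar loop.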
2988
2989Value *InnerLoopVectorizer::emitTransformedIndex(
2990 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2991 const InductionDescriptor &ID) const {
2992
2993 SCEVExpander Exp(*SE, DL, "induction");
2994 auto Step = ID.getStep();
2995 auto StartValue = ID.getStartValue();
2996 assert(Index->getType() == Step->getType() &&
2997 "Index type does not match StepValue type");
2998
2999 // Note: the IR at this point is broken. We cannot use SE to create any new
3000 // SCEV and then expand it, hoping that SCEV's simplification will give us
3001 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3002 // lead to various SCEV crashes. So all we can do is use the builder and rely
3003 // on InstCombine for future simplifications. Here we handle some trivial
3004 // cases only.
3005 auto CreateAdd = [&B](Value *X, Value *Y) {
3006 assert(X->getType() == Y->getType() && "Types don't match!");
3007 if (auto *CX = dyn_cast<ConstantInt>(X))
3008 if (CX->isZero())
3009 return Y;
3010 if (auto *CY = dyn_cast<ConstantInt>(Y))
3011 if (CY->isZero())
3012 return X;
3013 return B.CreateAdd(X, Y);
3014 };
3015
3016 auto CreateMul = [&B](Value *X, Value *Y) {
3017 assert(X->getType() == Y->getType() && "Types don't match!");
3018 if (auto *CX = dyn_cast<ConstantInt>(X))
3019 if (CX->isOne())
3020 return Y;
3021 if (auto *CY = dyn_cast<ConstantInt>(Y))
3022 if (CY->isOne())
3023 return X;
3024 return B.CreateMul(X, Y);
3025 };
3026
3027 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3028 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3029 // the DomTree is not kept up-to-date for additional blocks generated in the
3030 // vector loop. By using the header as insertion point, we guarantee that the
3031 // expanded instructions dominate all their uses.
3032 auto GetInsertPoint = [this, &B]() {
3033 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3034 if (InsertBB != LoopVectorBody &&
3035 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3036 return LoopVectorBody->getTerminator();
3037 return &*B.GetInsertPoint();
3038 };
3039 switch (ID.getKind()) {
3040 case InductionDescriptor::IK_IntInduction: {
3041 assert(Index->getType() == StartValue->getType() &&
3042 "Index type does not match StartValue type");
3043 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3044 return B.CreateSub(StartValue, Index);
3045 auto *Offset = CreateMul(
3046 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3047 return CreateAdd(StartValue, Offset);
3048 }
3049 case InductionDescriptor::IK_PtrInduction: {
3050 assert(isa<SCEVConstant>(Step) &&
3051 "Expected constant step for pointer induction");
3052 return B.CreateGEP(
3053 StartValue->getType()->getPointerElementType(), StartValue,
3054 CreateMul(Index,
3055 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3056 }
3057 case InductionDescriptor::IK_FpInduction: {
3058 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3059 auto InductionBinOp = ID.getInductionBinOp();
3060 assert(InductionBinOp &&
3061 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3062 InductionBinOp->getOpcode() == Instruction::FSub) &&
3063 "Original bin op should be defined for FP induction");
3064
3065 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3066
3067 // Floating point operations had to be 'fast' to enable the induction.
3068 FastMathFlags Flags;
3069 Flags.setFast();
3070
3071 Value *MulExp = B.CreateFMul(StepValue, Index);
3072 if (isa<Instruction>(MulExp))
3073 // We have to check, the MulExp may be a constant.
3074 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3075
3076 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3077 "induction");
3078 if (isa<Instruction>(BOp))
3079 cast<Instruction>(BOp)->setFastMathFlags(Flags);
3080
3081 return BOp;
3082 }
3083 case InductionDescriptor::IK_NoInduction:
3084 return nullptr;
3085 }
3086 llvm_unreachable("invalid enum");
3087}
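// Worked example of the integer case above (values are made up purely for
// illustration): with StartValue = 10, Step = 3 and Index = 4, the emitted
// sequence is a mul (4 * 3) followed by an add (10 + 12), i.e. a transformed
// index of 22. A constant step of -1 short-circuits to "StartValue - Index",
// and the pointer case emits the equivalent GEP off StartValue instead.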
3088
3089Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3090 LoopScalarBody = OrigLoop->getHeader();
3091 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3092 LoopExitBlock = OrigLoop->getExitBlock();
3093 assert(LoopExitBlock && "Must have an exit block");
3094 assert(LoopVectorPreHeader && "Invalid loop structure");
3095
3096 LoopMiddleBlock =
3097 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3098 LI, nullptr, Twine(Prefix) + "middle.block");
3099 LoopScalarPreHeader =
3100 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3101 nullptr, Twine(Prefix) + "scalar.ph");
3102 // We intentionally don't let SplitBlock update LoopInfo, since
3103 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3104 // LoopVectorBody is explicitly added to the correct place a few lines later.
3105 LoopVectorBody =
3106 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3107 nullptr, nullptr, Twine(Prefix) + "vector.body");
3108
3109 // Update dominator for loop exit.
3110 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3111
3112 // Create and register the new vector loop.
3113 Loop *Lp = LI->AllocateLoop();
3114 Loop *ParentLoop = OrigLoop->getParentLoop();
3115
3116 // Insert the new loop into the loop nest and register the new basic blocks
3117 // before calling any utilities such as SCEV that require valid LoopInfo.
3118 if (ParentLoop) {
3119 ParentLoop->addChildLoop(Lp);
3120 } else {
3121 LI->addTopLevelLoop(Lp);
3122 }
3123 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3124 return Lp;
3125}
3126
3127void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3128 Value *VectorTripCount) {
3129 assert(VectorTripCount && L && "Expected valid arguments");
3130 // We are going to resume the execution of the scalar loop.
3131 // Go over all of the induction variables that we found and fix the
3132 // PHIs that are left in the scalar version of the loop.
3133 // The starting values of PHI nodes depend on the counter of the last
3134 // iteration in the vectorized loop.
3135 // If we come from a bypass edge then we need to start from the original
3136 // start value.
3137 for (auto &InductionEntry : Legal->getInductionVars()) {
3138 PHINode *OrigPhi = InductionEntry.first;
3139 InductionDescriptor II = InductionEntry.second;
3140
3141 // Create phi nodes to merge from the backedge-taken check block.
3142 PHINode *BCResumeVal =
3143 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3144 LoopScalarPreHeader->getTerminator());
3145 // Copy original phi DL over to the new one.
3146 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3147 Value *&EndValue = IVEndValues[OrigPhi];
3148 if (OrigPhi == OldInduction) {
3149 // We know what the end value is.
3150 EndValue = VectorTripCount;
3151 } else {
3152 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3153 Type *StepType = II.getStep()->getType();
3154 Instruction::CastOps CastOp =
3155 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3156 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3157 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3158 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3159 EndValue->setName("ind.end");
3160 }
3161
3162 // The new PHI merges the original incoming value, in case of a bypass,
3163 // or the value at the end of the vectorized loop.
3164 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3165
3166 // Fix the scalar body counter (PHI node).
3167 // The old induction's phi node in the scalar body needs the truncated
3168 // value.
3169 for (BasicBlock *BB : LoopBypassBlocks)
3170 BCResumeVal->addIncoming(II.getStartValue(), BB);
3171 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3172 }
3173}
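// Illustrative shape of one resume phi created above (operand names are
// assumptions). For the primary induction the end value is the vector trip
// count, so the scalar preheader ends up with
//
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ 0, %vector.memcheck ], ...
//
// i.e. the vector trip count when arriving from the middle block and the
// original start value when arriving from any bypass block.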
3174
3175BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3176 MDNode *OrigLoopID) {
3177 assert(L && "Expected valid loop.");
3178
3179 // The trip counts should be cached by now.
3180 Value *Count = getOrCreateTripCount(L);
3181 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3182
3183 // We need the OrigLoop (scalar loop part) latch terminator to help
3184 // produce correct debug info for the middle block BB instructions.
3185 // The legality check stage guarantees that the loop will have a single
3186 // latch.
3187 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3188 "Scalar loop latch terminator isn't a branch");
3189 BranchInst *ScalarLatchBr =
3190 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3191
3192 // Add a check in the middle block to see if we have completed
3193 // all of the iterations in the first vector loop.
3194 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3195 // If tail is to be folded, we know we don't need to run the remainder.
3196 Value *CmpN = Builder.getTrue();
3197 if (!Cost->foldTailByMasking()) {
3198 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3199 VectorTripCount, "cmp.n",
3200 LoopMiddleBlock->getTerminator());
3201
3202 // Here we use the same DebugLoc as the scalar loop latch branch instead
3203 // of the corresponding compare because they may have ended up with
3204 // different line numbers and we want to avoid awkward line stepping while
3205 // debugging. Eg. if the compare has got a line number inside the loop.
3206 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3207 }
3208
3209 BranchInst *BrInst =
3210 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3211 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3212 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3213
3214 // Get ready to start creating new instructions into the vectorized body.
3215 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3216 "Inconsistent vector loop preheader");
3217 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3218
3219 Optional<MDNode *> VectorizedLoopID =
3220 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3221 LLVMLoopVectorizeFollowupVectorized});
3222 if (VectorizedLoopID.hasValue()) {
3223 L->setLoopID(VectorizedLoopID.getValue());
3224
3225 // Do not setAlreadyVectorized if loop attributes have been defined
3226 // explicitly.
3227 return LoopVectorPreHeader;
3228 }
3229
3230 // Keep all loop hints from the original loop on the vector loop (we'll
3231 // replace the vectorizer-specific hints below).
3232 if (MDNode *LID = OrigLoop->getLoopID())
3233 L->setLoopID(LID);
3234
3235 LoopVectorizeHints Hints(L, true, *ORE);
3236 Hints.setAlreadyVectorized();
3237
3238#ifdef EXPENSIVE_CHECKS
3239 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3240 LI->verify(*DT);
3241#endif
3242
3243 return LoopVectorPreHeader;
3244}
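// A minimal sketch of the middle-block branch built above (names assumed).
// Without tail folding:
//
//   %cmp.n = icmp eq i64 %trip.count, %n.vec   ; did the vector loop cover N?
//   br i1 %cmp.n, label %exit.block, label %scalar.ph
//
// With tail folding CmpN is simply "true", so the scalar remainder is never
// entered from the middle block.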
3245
3246BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3247 /*
3248 In this function we generate a new loop. The new loop will contain
3249 the vectorized instructions while the old loop will continue to run the
3250 scalar remainder.
3251
3252 [ ] <-- loop iteration number check.
3253 / |
3254 / v
3255 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3256 | / |
3257 | / v
3258 || [ ] <-- vector pre header.
3259 |/ |
3260 | v
3261 | [ ] \
3262 | [ ]_| <-- vector loop.
3263 | |
3264 | v
3265 | -[ ] <--- middle-block.
3266 | / |
3267 | / v
3268 -|- >[ ] <--- new preheader.
3269 | |
3270 | v
3271 | [ ] \
3272 | [ ]_| <-- old scalar loop to handle remainder.
3273 \ |
3274 \ v
3275 >[ ] <-- exit block.
3276 ...
3277 */
3278
3279 // Get the metadata of the original loop before it gets modified.
3280 MDNode *OrigLoopID = OrigLoop->getLoopID();
3281
3282 // Create an empty vector loop, and prepare basic blocks for the runtime
3283 // checks.
3284 Loop *Lp = createVectorLoopSkeleton("");
3285
3286 // Now, compare the new count to zero. If it is zero skip the vector loop and
3287 // jump to the scalar loop. This check also covers the case where the
3288 // backedge-taken count is uint##_max: adding one to it will overflow leading
3289 // to an incorrect trip count of zero. In this (rare) case we will also jump
3290 // to the scalar loop.
3291 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3292
3293 // Generate the code to check any assumptions that we've made for SCEV
3294 // expressions.
3295 emitSCEVChecks(Lp, LoopScalarPreHeader);
3296
3297 // Generate the code that checks in runtime if arrays overlap. We put the
3298 // checks into a separate block to make the more common case of few elements
3299 // faster.
3300 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3301
3302 // Some loops have a single integer induction variable, while other loops
3303 // don't. One example is c++ iterators that often have multiple pointer
3304 // induction variables. In the code below we also support a case where we
3305 // don't have a single induction variable.
3306 //
3307 // We try to obtain an induction variable from the original loop as hard
3308 // as possible. However if we don't find one that:
3309 // - is an integer
3310 // - counts from zero, stepping by one
3311 // - is the size of the widest induction variable type
3312 // then we create a new one.
3313 OldInduction = Legal->getPrimaryInduction();
3314 Type *IdxTy = Legal->getWidestInductionType();
3315 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3316 // The loop step is equal to the vectorization factor (num of SIMD elements)
3317 // times the unroll factor (num of SIMD instructions).
3318 assert(!VF.isScalable() && "scalable vectors not yet supported.");
3319 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3320 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3321 Induction =
3322 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3323 getDebugLocFromInstOrOperands(OldInduction));
3324
3325 // Emit phis for the new starting index of the scalar loop.
3326 createInductionResumeValues(Lp, CountRoundDown);
3327
3328 return completeLoopSkeleton(Lp, OrigLoopID);
3329}
3330
3331// Fix up external users of the induction variable. At this point, we are
3332// in LCSSA form, with all external PHIs that use the IV having one input value,
3333// coming from the remainder loop. We need those PHIs to also have a correct
3334// value for the IV when arriving directly from the middle block.
3335void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3336 const InductionDescriptor &II,
3337 Value *CountRoundDown, Value *EndValue,
3338 BasicBlock *MiddleBlock) {
3339 // There are two kinds of external IV usages - those that use the value
3340 // computed in the last iteration (the PHI) and those that use the penultimate
3341 // value (the value that feeds into the phi from the loop latch).
3342 // We allow both, but they, obviously, have different values.
3343
3344 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3345
3346 DenseMap<Value *, Value *> MissingVals;
3347
3348 // An external user of the last iteration's value should see the value that
3349 // the remainder loop uses to initialize its own IV.
3350 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3351 for (User *U : PostInc->users()) {
3352 Instruction *UI = cast<Instruction>(U);
3353 if (!OrigLoop->contains(UI)) {
3354 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3355 MissingVals[UI] = EndValue;
3356 }
3357 }
3358
3359 // An external user of the penultimate value needs to see EndValue - Step.
3360 // The simplest way to get this is to recompute it from the constituent SCEVs,
3361 // that is Start + (Step * (CRD - 1)).
3362 for (User *U : OrigPhi->users()) {
3363 auto *UI = cast<Instruction>(U);
3364 if (!OrigLoop->contains(UI)) {
3365 const DataLayout &DL =
3366 OrigLoop->getHeader()->getModule()->getDataLayout();
3367 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3368
3369 IRBuilder<> B(MiddleBlock->getTerminator());
3370 Value *CountMinusOne = B.CreateSub(
3371 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3372 Value *CMO =
3373 !II.getStep()->getType()->isIntegerTy()
3374 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3375 II.getStep()->getType())
3376 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3377 CMO->setName("cast.cmo");
3378 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3379 Escape->setName("ind.escape");
3380 MissingVals[UI] = Escape;
3381 }
3382 }
3383
3384 for (auto &I : MissingVals) {
3385 PHINode *PHI = cast<PHINode>(I.first);
3386 // One corner case we have to handle is two IVs "chasing" each-other,
3387 // that is %IV2 = phi [...], [ %IV1, %latch ]
3388 // In this case, if IV1 has an external use, we need to avoid adding both
3389 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3390 // don't already have an incoming value for the middle block.
3391 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3392 PHI->addIncoming(I.second, MiddleBlock);
3393 }
3394}
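// Worked example (values made up for illustration): for an IV starting at 0
// with step 1 and CountRoundDown = 8, an external user of the post-increment
// value sees EndValue = 8, while an external user of the phi itself sees the
// penultimate value Start + Step * (CRD - 1) = 7, materialized above as the
// "ind.escape" value in the middle block.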
3395
3396namespace {
3397
3398struct CSEDenseMapInfo {
3399 static bool canHandle(const Instruction *I) {
3400 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3401 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3402 }
3403
3404 static inline Instruction *getEmptyKey() {
3405 return DenseMapInfo<Instruction *>::getEmptyKey();
3406 }
3407
3408 static inline Instruction *getTombstoneKey() {
3409 return DenseMapInfo<Instruction *>::getTombstoneKey();
3410 }
3411
3412 static unsigned getHashValue(const Instruction *I) {
3413 assert(canHandle(I) && "Unknown instruction!");
3414 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3415 I->value_op_end()));
3416 }
3417
3418 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3419 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3420 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3421 return LHS == RHS;
3422 return LHS->isIdenticalTo(RHS);
3423 }
3424};
3425
3426} // end anonymous namespace
3427
3428 /// Perform CSE of induction variable instructions.
3429static void cse(BasicBlock *BB) {
3430 // Perform simple cse.
3431 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3432 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3433 Instruction *In = &*I++;
3434
3435 if (!CSEDenseMapInfo::canHandle(In))
3436 continue;
3437
3438 // Check if we can replace this instruction with any of the
3439 // visited instructions.
3440 if (Instruction *V = CSEMap.lookup(In)) {
3441 In->replaceAllUsesWith(V);
3442 In->eraseFromParent();
3443 continue;
3444 }
3445
3446 CSEMap[In] = In;
3447 }
3448}
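// Small example of what this removes (illustrative): if unrolling produced
// two identical "extractelement <4 x i64> %v, i32 0" instructions in the
// vector body, the second is replaced by the first and erased, since
// CSEDenseMapInfo hashes on opcode plus operands and compares with
// isIdenticalTo.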
3449
3450unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3451 ElementCount VF,
3452 bool &NeedToScalarize) {
3453 assert(!VF.isScalable() && "scalable vectors not yet supported.");
3454 Function *F = CI->getCalledFunction();
3455 Type *ScalarRetTy = CI->getType();
3456 SmallVector<Type *, 4> Tys, ScalarTys;
3457 for (auto &ArgOp : CI->arg_operands())
3458 ScalarTys.push_back(ArgOp->getType());
3459
3460 // Estimate cost of scalarized vector call. The source operands are assumed
3461 // to be vectors, so we need to extract individual elements from there,
3462 // execute VF scalar calls, and then gather the result into the vector return
3463 // value.
3464 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3465 TTI::TCK_RecipThroughput);
3466 if (VF.isScalar())
3467 return ScalarCallCost;
3468
3469 // Compute corresponding vector type for return value and arguments.
3470 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3471 for (Type *ScalarTy : ScalarTys)
3472 Tys.push_back(ToVectorTy(ScalarTy, VF));
3473
3474 // Compute costs of unpacking argument values for the scalar calls and
3475 // packing the return values to a vector.
3476 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3477
3478 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3479
3480 // If we can't emit a vector call for this function, then the currently found
3481 // cost is the cost we need to return.
3482 NeedToScalarize = true;
3483 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3484 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3485
3486 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3487 return Cost;
3488
3489 // If the corresponding vector cost is cheaper, return its cost.
3490 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3491 TTI::TCK_RecipThroughput);
3492 if (VectorCallCost < Cost) {
3493 NeedToScalarize = false;
3494 return VectorCallCost;
3495 }
3496 return Cost;
3497}
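// Worked cost example (numbers are made up for illustration): with VF = 4,
// ScalarCallCost = 10 and ScalarizationCost = 6, the scalarized estimate is
// 10 * 4 + 6 = 46. If a vector variant exists and VectorCallCost = 20, then
// NeedToScalarize is cleared and 20 is returned; otherwise the caller pays
// the scalarized 46.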
3498
3499unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3500 ElementCount VF) {
3501 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3502 assert(ID && "Expected intrinsic call!");
3503
3504 IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3505 return TTI.getIntrinsicInstrCost(CostAttrs,
3506 TargetTransformInfo::TCK_RecipThroughput);
3507}
3508
3509static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3510 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3511 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3512 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3513}
3514
3515static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3516 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3517 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3518 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3519}
3520
3521void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3522 // For every instruction `I` in MinBWs, truncate the operands, create a
3523 // truncated version of `I` and reextend its result. InstCombine runs
3524 // later and will remove any ext/trunc pairs.
3525 SmallPtrSet<Value *, 4> Erased;
3526 for (const auto &KV : Cost->getMinimalBitwidths()) {
3527 // If the value wasn't vectorized, we must maintain the original scalar
3528 // type. The absence of the value from VectorLoopValueMap indicates that it
3529 // wasn't vectorized.
3530 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3531 continue;
3532 for (unsigned Part = 0; Part < UF; ++Part) {
3533 Value *I = getOrCreateVectorValue(KV.first, Part);
3534 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3535 continue;
3536 Type *OriginalTy = I->getType();
3537 Type *ScalarTruncatedTy =
3538 IntegerType::get(OriginalTy->getContext(), KV.second);
3539 auto *TruncatedTy = FixedVectorType::get(
3540 ScalarTruncatedTy,
3541 cast<FixedVectorType>(OriginalTy)->getNumElements());
3542 if (TruncatedTy == OriginalTy)
3543 continue;
3544
3545 IRBuilder<> B(cast<Instruction>(I));
3546 auto ShrinkOperand = [&](Value *V) -> Value * {
3547 if (auto *ZI = dyn_cast<ZExtInst>(V))
3548 if (ZI->getSrcTy() == TruncatedTy)
3549 return ZI->getOperand(0);
3550 return B.CreateZExtOrTrunc(V, TruncatedTy);
3551 };
3552
3553 // The actual instruction modification depends on the instruction type,
3554 // unfortunately.
3555 Value *NewI = nullptr;
3556 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3557 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3558 ShrinkOperand(BO->getOperand(1)));
3559
3560 // Any wrapping introduced by shrinking this operation shouldn't be
3561 // considered undefined behavior. So, we can't unconditionally copy
3562 // arithmetic wrapping flags to NewI.
3563 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3564 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3565 NewI =
3566 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3567 ShrinkOperand(CI->getOperand(1)));
3568 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3569 NewI = B.CreateSelect(SI->getCondition(),
3570 ShrinkOperand(SI->getTrueValue()),
3571 ShrinkOperand(SI->getFalseValue()));
3572 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3573 switch (CI->getOpcode()) {
3574 default:
3575 llvm_unreachable("Unhandled cast!");
3576 case Instruction::Trunc:
3577 NewI = ShrinkOperand(CI->getOperand(0));
3578 break;
3579 case Instruction::SExt:
3580 NewI = B.CreateSExtOrTrunc(
3581 CI->getOperand(0),
3582 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3583 break;
3584 case Instruction::ZExt:
3585 NewI = B.CreateZExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 }
3590 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3591 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3592 ->getNumElements();
3593 auto *O0 = B.CreateZExtOrTrunc(
3594 SI->getOperand(0),
3595 FixedVectorType::get(ScalarTruncatedTy, Elements0));
3596 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3597 ->getNumElements();
3598 auto *O1 = B.CreateZExtOrTrunc(
3599 SI->getOperand(1),
3600 FixedVectorType::get(ScalarTruncatedTy, Elements1));
3601
3602 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3603 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3604 // Don't do anything with the operands, just extend the result.
3605 continue;
3606 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3607 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3608 ->getNumElements();
3609 auto *O0 = B.CreateZExtOrTrunc(
3610 IE->getOperand(0),
3611 FixedVectorType::get(ScalarTruncatedTy, Elements));
3612 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3613 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3614 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3615 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3616 ->getNumElements();
3617 auto *O0 = B.CreateZExtOrTrunc(
3618 EE->getOperand(0),
3619 FixedVectorType::get(ScalarTruncatedTy, Elements));
3620 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3621 } else {
3622 // If we don't know what to do, be conservative and don't do anything.
3623 continue;
3624 }
3625
3626 // Lastly, extend the result.
3627 NewI->takeName(cast<Instruction>(I));
3628 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3629 I->replaceAllUsesWith(Res);
3630 cast<Instruction>(I)->eraseFromParent();
3631 Erased.insert(I);
3632 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3633 }
3634 }
3635
3636 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3637 for (const auto &KV : Cost->getMinimalBitwidths()) {
3638 // If the value wasn't vectorized, we must maintain the original scalar
3639 // type. The absence of the value from VectorLoopValueMap indicates that it
3640 // wasn't vectorized.
3641 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3642 continue;
3643 for (unsigned Part = 0; Part < UF; ++Part) {
3644 Value *I = getOrCreateVectorValue(KV.first, Part);
3645 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3646 if (Inst && Inst->use_empty()) {
3647 Value *NewI = Inst->getOperand(0);
3648 Inst->eraseFromParent();
3649 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3650 }
3651 }
3652 }
3653}
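// Illustrative example: if MinBWs records that a vectorized i32 add only
// needs 8 bits, its <4 x i32> operands are truncated (or an existing zext is
// peeled off) to <4 x i8>, the add is redone on <4 x i8>, and the result is
// zero-extended back to <4 x i32>; InstCombine later removes any redundant
// trunc/zext pairs this leaves behind.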
3654
3655void InnerLoopVectorizer::fixVectorizedLoop() {
3656 // Insert truncates and extends for any truncated instructions as hints to
3657 // InstCombine.
3658 if (VF.isVector())
3659 truncateToMinimalBitwidths();
3660
3661 // Fix widened non-induction PHIs by setting up the PHI operands.
3662 if (OrigPHIsToFix.size()) {
3663 assert(EnableVPlanNativePath &&
3664 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3665 fixNonInductionPHIs();
3666 }
3667
3668 // At this point every instruction in the original loop is widened to a
3669 // vector form. Now we need to fix the recurrences in the loop. These PHI
3670 // nodes are currently empty because we did not want to introduce cycles.
3671 // This is the second stage of vectorizing recurrences.
3672 fixCrossIterationPHIs();
3673
3674 // Forget the original basic block.
3675 PSE.getSE()->forgetLoop(OrigLoop);
3676
3677 // Fix-up external users of the induction variables.
3678 for (auto &Entry : Legal->getInductionVars())
3679 fixupIVUsers(Entry.first, Entry.second,
3680 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3681 IVEndValues[Entry.first], LoopMiddleBlock);
3682
3683 fixLCSSAPHIs();
3684 for (Instruction *PI : PredicatedInstructions)
3685 sinkScalarOperands(&*PI);
3686
3687 // Remove redundant induction instructions.
3688 cse(LoopVectorBody);
3689
3690 // Set/update profile weights for the vector and remainder loops as original
3691 // loop iterations are now distributed among them. Note that original loop
3692 // represented by LoopScalarBody becomes remainder loop after vectorization.
3693 //
3694 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3695 // end up getting a slightly roughened result but that should be OK since
3696 // profile info is not inherently precise anyway. Note also that a possible bypass of
3697 // vector code caused by legality checks is ignored, assigning all the weight
3698 // to the vector loop, optimistically.
3699 assert(!VF.isScalable() &&
3700 "cannot use scalable ElementCount to determine unroll factor");
3701 setProfileInfoAfterUnrolling(
3702 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3703 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3704}
3705
3706void InnerLoopVectorizer::fixCrossIterationPHIs() {
3707 // In order to support recurrences we need to be able to vectorize Phi nodes.
3708 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3709 // stage #2: We now need to fix the recurrences by adding incoming edges to
3710 // the currently empty PHI nodes. At this point every instruction in the
3711 // original loop is widened to a vector form so we can use them to construct
3712 // the incoming edges.
3713 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3714 // Handle first-order recurrences and reductions that need to be fixed.
3715 if (Legal->isFirstOrderRecurrence(&Phi))
3716 fixFirstOrderRecurrence(&Phi);
3717 else if (Legal->isReductionVariable(&Phi))
3718 fixReduction(&Phi);
3719 }
3720}
3721
3722void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3723 // This is the second phase of vectorizing first-order recurrences. An
3724 // overview of the transformation is described below. Suppose we have the
3725 // following loop.
3726 //
3727 // for (int i = 0; i < n; ++i)
3728 // b[i] = a[i] - a[i - 1];
3729 //
3730 // There is a first-order recurrence on "a". For this loop, the shorthand
3731 // scalar IR looks like:
3732 //
3733 // scalar.ph:
3734 // s_init = a[-1]
3735 // br scalar.body
3736 //
3737 // scalar.body:
3738 // i = phi [0, scalar.ph], [i+1, scalar.body]
3739 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3740 // s2 = a[i]
3741 // b[i] = s2 - s1
3742 // br cond, scalar.body, ...
3743 //
3744 // In this example, s1 is a recurrence because its value depends on the
3745 // previous iteration. In the first phase of vectorization, we created a
3746 // temporary value for s1. We now complete the vectorization and produce the
3747 // shorthand vector IR shown below (for VF = 4, UF = 1).
3748 //
3749 // vector.ph:
3750 // v_init = vector(..., ..., ..., a[-1])
3751 // br vector.body
3752 //
3753 // vector.body
3754 // i = phi [0, vector.ph], [i+4, vector.body]
3755 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3756 // v2 = a[i, i+1, i+2, i+3];
3757 // v3 = vector(v1(3), v2(0, 1, 2))
3758 // b[i, i+1, i+2, i+3] = v2 - v3
3759 // br cond, vector.body, middle.block
3760 //
3761 // middle.block:
3762 // x = v2(3)
3763 // br scalar.ph
3764 //
3765 // scalar.ph:
3766 // s_init = phi [x, middle.block], [a[-1], otherwise]
3767 // br scalar.body
3768 //
3769 // After execution completes the vector loop, we extract the next value of
3770 // the recurrence (x) to use as the initial value in the scalar loop.
3771
3772 // Get the original loop preheader and single loop latch.
3773 auto *Preheader = OrigLoop->getLoopPreheader();
3774 auto *Latch = OrigLoop->getLoopLatch();
3775
3776 // Get the initial and previous values of the scalar recurrence.
3777 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3778 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3779
3780 // Create a vector from the initial value.
3781 auto *VectorInit = ScalarInit;
3782 if (VF.isVector()) {
3783 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3784 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3785 VectorInit = Builder.CreateInsertElement(
3786 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3787 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3788 }
3789
3790 // We constructed a temporary phi node in the first phase of vectorization.
3791 // This phi node will eventually be deleted.
3792 Builder.SetInsertPoint(
3793 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3794
3795 // Create a phi node for the new recurrence. The current value will either be
3796 // the initial value inserted into a vector or loop-varying vector value.
3797 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3798 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3799
3800 // Get the vectorized previous value of the last part UF - 1. It appears last
3801 // among all unrolled iterations, due to the order of their construction.
3802 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3803
3804 // Find and set the insertion point after the previous value if it is an
3805 // instruction.
3806 BasicBlock::iterator InsertPt;
3807 // Note that the previous value may have been constant-folded so it is not
3808 // guaranteed to be an instruction in the vector loop.
3809 // FIXME: Loop invariant values do not form recurrences. We should deal with
3810 // them earlier.
3811 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3812 InsertPt = LoopVectorBody->getFirstInsertionPt();
3813 else {
3814 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3815 if (isa<PHINode>(PreviousLastPart))
3816 // If the previous value is a phi node, we should insert after all the phi
3817 // nodes in the block containing the PHI to avoid breaking basic block
3818 // verification. Note that the basic block may be different to
3819 // LoopVectorBody, in case we predicate the loop.
3820 InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3821 else
3822 InsertPt = ++PreviousInst->getIterator();
3823 }
3824 Builder.SetInsertPoint(&*InsertPt);
3825
3826 // We will construct a vector for the recurrence by combining the values for
3827 // the current and previous iterations. This is the required shuffle mask.
3828 assert(!VF.isScalable());
3829 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3830 ShuffleMask[0] = VF.getKnownMinValue() - 1;
3831 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3832 ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
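// Worked example (derived from the mask construction above): with VF = 4 the
// mask is <3, 4, 5, 6>, so lane 0 of each shuffled part takes the last element
// of 'Incoming' and lanes 1-3 take the first three elements of the current
// part's 'PreviousPart' vector.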
3833
3834 // The vector from which to take the initial value for the current iteration
3835 // (actual or unrolled). Initially, this is the vector phi node.
3836 Value *Incoming = VecPhi;
3837
3838 // Shuffle the current and previous vector and update the vector parts.
3839 for (unsigned Part = 0; Part < UF; ++Part) {
3840 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3841 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3842 auto *Shuffle =
3843 VF.isVector()
3844 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3845 : Incoming;
3846 PhiPart->replaceAllUsesWith(Shuffle);
3847 cast<Instruction>(PhiPart)->eraseFromParent();
3848 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3849 Incoming = PreviousPart;
3850 }
3851
3852 // Fix the latch value of the new recurrence in the vector loop.
3853 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3854
3855 // Extract the last vector element in the middle block. This will be the
3856 // initial value for the recurrence when jumping to the scalar loop.
3857 auto *ExtractForScalar = Incoming;
3858 if (VF.isVector()) {
3859 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3860 ExtractForScalar = Builder.CreateExtractElement(
3861 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3862 "vector.recur.extract");
3863 }
3864 // Extract the second last element in the middle block if the
3865 // Phi is used outside the loop. We need to extract the phi itself
3866 // and not the last element (the phi update in the current iteration). This
3867 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3868 // when the scalar loop is not run at all.
3869 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3870 if (VF.isVector())
3871 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3872 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3873 "vector.recur.extract.for.phi");
3874 // When the loop is unrolled without vectorizing, initialize
3875 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3876 // `Incoming`. This is analogous to the vectorized case above: extracting the
3877 // second-last element when VF > 1.
3878 else if (UF > 1)
3879 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3880
3881 // Fix the initial value of the original recurrence in the scalar loop.
3882 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3883 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3884 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3885 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3886 Start->addIncoming(Incoming, BB);
3887 }
3888
3889 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3890 Phi->setName("scalar.recur");
3891
3892 // Finally, fix users of the recurrence outside the loop. The users will need
3893 // either the last value of the scalar recurrence or the last value of the
3894 // vector recurrence we extracted in the middle block. Since the loop is in
3895 // LCSSA form, we just need to find all the phi nodes for the original scalar
3896 // recurrence in the exit block, and then add an edge for the middle block.
3897 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3898 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3899 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3900 }
3901 }
3902}
3903
3904void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3905 Constant *Zero = Builder.getInt32(0);
3906
3907 // Get its reduction variable descriptor.
3908 assert(Legal->isReductionVariable(Phi) &&
3909 "Unable to find the reduction variable");
3910 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3911
3912 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3913 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3914 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3915 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3916 RdxDesc.getMinMaxRecurrenceKind();
3917 setDebugLocFromInst(Builder, ReductionStartValue);
3918 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3919
3920 // We need to generate a reduction vector from the incoming scalar.
3921 // To do so, we need to generate the 'identity' vector and override
3922 // one of the elements with the incoming scalar reduction. We need
3923 // to do it in the vector-loop preheader.
3924 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3925
3926 // This is the vector-clone of the value that leaves the loop.
3927 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3928
3929 // Find the reduction identity variable. Zero for addition, or, xor,
3930 // one for multiplication, -1 for And.
3931 Value *Identity;
3932 Value *VectorStart;
3933 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3934 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3935 // MinMax reductions have the start value as their identity.
3936 if (VF == 1 || IsInLoopReductionPhi) {
3937 VectorStart = Identity = ReductionStartValue;
3938 } else {
3939 VectorStart = Identity =
3940 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3941 }
3942 } else {
3943 // Handle other reduction kinds:
3944 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3945 RK, VecTy->getScalarType());
3946 if (VF == 1 || IsInLoopReductionPhi) {
3947 Identity = Iden;
3948 // This vector is the Identity vector where the first element is the
3949 // incoming scalar reduction.
3950 VectorStart = ReductionStartValue;
3951 } else {
3952 Identity = ConstantVector::getSplat(VF, Iden);
3953
3954 // This vector is the Identity vector where the first element is the
3955 // incoming scalar reduction.
3956 VectorStart =
3957 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3958 }
3959 }
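// Worked example (following the code above): an integer add reduction with
// VF = 4 and scalar start value %s yields Identity = <0, 0, 0, 0> and
// VectorStart = <%s, 0, 0, 0>, so only lane 0 carries the start value into
// the vector loop.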
3960
3961 // Wrap flags are in general invalid after vectorization, clear them.
3962 clearReductionWrapFlags(RdxDesc);
3963
3964 // Fix the vector-loop phi.
3965
3966 // Reductions do not have to start at zero. They can start with
3967 // any loop invariant values.
3968 BasicBlock *Latch = OrigLoop->getLoopLatch();
3969 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3970
3971 for (unsigned Part = 0; Part < UF; ++Part) {
3972 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3973 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3974 // Make sure to add the reduction start value only to the
3975 // first unroll part.
3976 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3977 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3978 cast<PHINode>(VecRdxPhi)
3979 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3980 }
3981
3982 // Before each round, move the insertion point right between
3983 // the PHIs and the values we are going to write.
3984 // This allows us to write both PHINodes and the extractelement
3985 // instructions.
3986 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3987
3988 setDebugLocFromInst(Builder, LoopExitInst);
3989
3990 // If tail is folded by masking, the vector value to leave the loop should be
3991 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3992 // instead of the former.
3993 if (Cost->foldTailByMasking()) {
3994 for (unsigned Part = 0; Part < UF; ++Part) {
3995 Value *VecLoopExitInst =
3996 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3997 Value *Sel = nullptr;
3998 for (User *U : VecLoopExitInst->users()) {
3999 if (isa<SelectInst>(U)) {
4000 assert(!Sel && "Reduction exit feeding two selects");
4001 Sel = U;
4002 } else
4003 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4004 }
4005 assert(Sel && "Reduction exit feeds no select");
4006 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4007
4008 // If the target can create a predicated operator for the reduction at no
4009 // extra cost in the loop (for example a predicated vadd), it can be
4010 // cheaper for the select to remain in the loop than be sunk out of it,
4011 // and so use the select value for the phi instead of the old
4012 // LoopExitValue.
4013 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4014 if (PreferPredicatedReductionSelect ||
4015 TTI->preferPredicatedReductionSelect(
4016 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4017 Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4018 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4019 VecRdxPhi->setIncomingValueForBlock(
4020 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4021 }
4022 }
4023 }
4024
4025 // If the vector reduction can be performed in a smaller type, we truncate
4026 // then extend the loop exit value to enable InstCombine to evaluate the
4027 // entire expression in the smaller type.
4028 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4029 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4030 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4031 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4032 Builder.SetInsertPoint(
4033 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4034 VectorParts RdxParts(UF);
4035 for (unsigned Part = 0; Part < UF; ++Part) {
4036 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4037 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4038 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4039 : Builder.CreateZExt(Trunc, VecTy);
4040 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4041 UI != RdxParts[Part]->user_end();)
4042 if (*UI != Trunc) {
4043 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4044 RdxParts[Part] = Extnd;
4045 } else {
4046 ++UI;
4047 }
4048 }
4049 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4050 for (unsigned Part = 0; Part < UF; ++Part) {
4051 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4052 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4053 }
4054 }
4055
4056 // Reduce all of the unrolled parts into a single vector.
4057 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4058 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4059
4060 // The middle block terminator has already been assigned a DebugLoc here (the
4061 // OrigLoop's single latch terminator). We want the whole middle block to
4062 // appear to execute on this line because: (a) it is all compiler generated,
4063 // (b) these instructions are always executed after evaluating the latch
4064 // conditional branch, and (c) other passes may add new predecessors which
4065 // terminate on this line. This is the easiest way to ensure we don't
4066 // accidentally cause an extra step back into the loop while debugging.
4067 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4068 for (unsigned Part = 1; Part < UF; ++Part) {
4069 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4070 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4071 // Floating point operations had to be 'fast' to enable the reduction.
4072 ReducedPartRdx = addFastMathFlag(
4073 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4074 ReducedPartRdx, "bin.rdx"),
4075 RdxDesc.getFastMathFlags());
4076 else
4077 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4078 RdxPart);
4079 }
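// Worked example (following the loop above): with UF = 2 the two unrolled
// part-vectors are combined element-wise with a single 'bin.rdx' operation
// (or a min/max for MinMax reductions); the cross-lane reduction of the
// combined vector is created below via createTargetReduction.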
4080
4081 // Create the reduction after the loop. Note that inloop reductions create the
4082 // target reduction in the loop using a Reduction recipe.
4083 if (VF.isVector() && !IsInLoopReductionPhi) {
4084 bool NoNaN = Legal->hasFunNoNaNAttr();
4085 ReducedPartRdx =
4086 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4087 // If the reduction can be performed in a smaller type, we need to extend
4088 // the reduction to the wider type before we branch to the original loop.
4089 if (Phi->getType() != RdxDesc.getRecurrenceType())
4090 ReducedPartRdx =
4091 RdxDesc.isSigned()
4092 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4093 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4094 }
4095
4096 // Create a phi node that merges control-flow from the backedge-taken check
4097 // block and the middle block.
4098 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4099 LoopScalarPreHeader->getTerminator());
4100 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4101 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4102 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4103
4104 // Now, we need to fix the users of the reduction variable
4105 // inside and outside of the scalar remainder loop.
4106 // We know that the loop is in LCSSA form. We need to update the
4107 // PHI nodes in the exit blocks.
4108 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4109 // All PHINodes need to have a single entry edge, or two if
4110 // we already fixed them.
4111 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4112
4113 // We found a reduction value exit-PHI. Update it with the
4114 // incoming bypass edge.
4115 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4116 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4117 } // end of the LCSSA phi scan.
4118
4119 // Fix the scalar loop reduction variable with the incoming reduction sum
4120 // from the vector body and from the backedge value.
4121 int IncomingEdgeBlockIdx =
4122 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4123 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4124 // Pick the other block.
4125 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4126 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4127 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4128}
4129
4130void InnerLoopVectorizer::clearReductionWrapFlags(
4131 RecurrenceDescriptor &RdxDesc) {
4132 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4133 if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4134 RK != RecurrenceDescriptor::RK_IntegerMult)
4135 return;
4136
4137 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4138 assert(LoopExitInstr && "null loop exit instruction");
4139 SmallVector<Instruction *, 8> Worklist;
4140 SmallPtrSet<Instruction *, 8> Visited;
4141 Worklist.push_back(LoopExitInstr);
4142 Visited.insert(LoopExitInstr);
4143
4144 while (!Worklist.empty()) {
4145 Instruction *Cur = Worklist.pop_back_val();
4146 if (isa<OverflowingBinaryOperator>(Cur))
4147 for (unsigned Part = 0; Part < UF; ++Part) {
4148 Value *V = getOrCreateVectorValue(Cur, Part);
4149 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4150 }
4151
4152 for (User *U : Cur->users()) {
4153 Instruction *UI = cast<Instruction>(U);
4154 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4155 Visited.insert(UI).second)
4156 Worklist.push_back(UI);
4157 }
4158 }
4159}
4160
4161void InnerLoopVectorizer::fixLCSSAPHIs() {
4162 assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4163 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4164 if (LCSSAPhi.getNumIncomingValues() == 1) {
4165 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4166 // Non-instruction incoming values will have only one value.
4167 unsigned LastLane = 0;
4168 if (isa<Instruction>(IncomingValue))
4169 LastLane = Cost->isUniformAfterVectorization(
4170 cast<Instruction>(IncomingValue), VF)
4171 ? 0
4172 : VF.getKnownMinValue() - 1;
4173 // Can be a loop invariant incoming value or the last scalar value to be
4174 // extracted from the vectorized loop.
4175 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4176 Value *lastIncomingValue =
4177 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4178 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4179 }
4180 }
4181}
4182
4183void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4184 // The basic block and loop containing the predicated instruction.
4185 auto *PredBB = PredInst->getParent();
4186 auto *VectorLoop = LI->getLoopFor(PredBB);
4187
4188 // Initialize a worklist with the operands of the predicated instruction.
4189 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4190
4191 // Holds instructions that we need to analyze again. An instruction may be
4192 // reanalyzed if we don't yet know if we can sink it or not.
4193 SmallVector<Instruction *, 8> InstsToReanalyze;
4194
4195 // Returns true if a given use occurs in the predicated block. Phi nodes use
4196 // their operands in their corresponding predecessor blocks.
4197 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4198 auto *I = cast<Instruction>(U.getUser());
4199 BasicBlock *BB = I->getParent();
4200 if (auto *Phi = dyn_cast<PHINode>(I))
4201 BB = Phi->getIncomingBlock(
4202 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4203 return BB == PredBB;
4204 };
4205
4206 // Iteratively sink the scalarized operands of the predicated instruction
4207 // into the block we created for it. When an instruction is sunk, its
4208 // operands are then added to the worklist. The algorithm ends once a full
4209 // pass through the worklist sinks no instructions.
4210 bool Changed;
4211 do {
4212 // Add the instructions that need to be reanalyzed to the worklist, and
4213 // reset the changed indicator.
4214 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4215 InstsToReanalyze.clear();
4216 Changed = false;
4217
4218 while (!Worklist.empty()) {
4219 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4220
4221 // We can't sink an instruction if it is a phi node, is already in the
4222 // predicated block, is not in the loop, or may have side effects.
4223 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4224 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4225 continue;
4226
4227 // It's legal to sink the instruction if all its uses occur in the
4228 // predicated block. Otherwise, there's nothing to do yet, and we may
4229 // need to reanalyze the instruction.
4230 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4231 InstsToReanalyze.push_back(I);
4232 continue;
4233 }
4234
4235 // Move the instruction to the beginning of the predicated block, and add
4236 // its operands to the worklist.
4237 I->moveBefore(&*PredBB->getFirstInsertionPt());
4238 Worklist.insert(I->op_begin(), I->op_end());
4239
4240 // The sinking may have enabled other instructions to be sunk, so we will
4241 // need to iterate.
4242 Changed = true;
4243 }
4244 } while (Changed);
4245}
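// Note on the iteration above: sinking one instruction can make its operands
// sinkable in turn. For example, an address computation used only by the
// predicated load is moved first; afterwards the instructions feeding that
// address may have all their uses in the predicated block as well, which is
// why the worklist is re-run until a pass makes no change.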
4246
4247void InnerLoopVectorizer::fixNonInductionPHIs() {
4248 for (PHINode *OrigPhi : OrigPHIsToFix) {
4249 PHINode *NewPhi =
4250 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4251 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4252
4253 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4254 predecessors(OrigPhi->getParent()));
4255 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4256 predecessors(NewPhi->getParent()));
4257 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4258 "Scalar and Vector BB should have the same number of predecessors");
4259
4260 // The insertion point in Builder may be invalidated by the time we get
4261 // here. Force the Builder insertion point to something valid so that we do
4262 // not run into issues during insertion point restore in
4263 // getOrCreateVectorValue calls below.
4264 Builder.SetInsertPoint(NewPhi);
4265
4266 // The predecessor order is preserved and we can rely on mapping between
4267 // scalar and vector block predecessors.
4268 for (unsigned i = 0; i < NumIncomingValues; ++i) {
4269 BasicBlock *NewPredBB = VectorBBPredecessors[i];
4270
4271 // When looking up the new scalar/vector values to fix up, use incoming
4272 // values from original phi.
4273 Value *ScIncV =
4274 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4275
4276 // Scalar incoming value may need a broadcast
4277 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4278 NewPhi->addIncoming(NewIncV, NewPredBB);
4279 }
4280 }
4281}
4282
4283void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4284 unsigned UF, ElementCount VF,
4285 bool IsPtrLoopInvariant,
4286 SmallBitVector &IsIndexLoopInvariant,
4287 VPTransformState &State) {
4288 // Construct a vector GEP by widening the operands of the scalar GEP as
4289 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4290 // results in a vector of pointers when at least one operand of the GEP
4291 // is vector-typed. Thus, to keep the representation compact, we only use
4292 // vector-typed operands for loop-varying values.
4293
4294 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4295 // If we are vectorizing, but the GEP has only loop-invariant operands,
4296 // the GEP we build (by only using vector-typed operands for
4297 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4298 // produce a vector of pointers, we need to either arbitrarily pick an
4299 // operand to broadcast, or broadcast a clone of the original GEP.
4300 // Here, we broadcast a clone of the original.
4301 //
4302 // TODO: If at some point we decide to scalarize instructions having
4303 // loop-invariant operands, this special case will no longer be
4304 // required. We would add the scalarization decision to
4305 // collectLoopScalars() and teach getVectorValue() to broadcast
4306 // the lane-zero scalar value.
4307 auto *Clone = Builder.Insert(GEP->clone());
4308 for (unsigned Part = 0; Part < UF; ++Part) {
4309 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4310 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4311 addMetadata(EntryPart, GEP);
4312 }
4313 } else {
4314 // If the GEP has at least one loop-varying operand, we are sure to
4315 // produce a vector of pointers. But if we are only unrolling, we want
4316 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4317 // produce with the code below will be scalar (if VF == 1) or vector
4318 // (otherwise). Note that for the unroll-only case, we still maintain
4319 // values in the vector mapping with initVector, as we do for other
4320 // instructions.
4321 for (unsigned Part = 0; Part < UF; ++Part) {
4322 // The pointer operand of the new GEP. If it's loop-invariant, we
4323 // won't broadcast it.
4324 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4325 : State.get(Operands.getOperand(0), Part);
4326
4327 // Collect all the indices for the new GEP. If any index is
4328 // loop-invariant, we won't broadcast it.
4329 SmallVector<Value *, 4> Indices;
4330 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4331 VPValue *Operand = Operands.getOperand(I);
4332 if (IsIndexLoopInvariant[I - 1])
4333 Indices.push_back(State.get(Operand, {0, 0}));
4334 else
4335 Indices.push_back(State.get(Operand, Part));
4336 }
4337
4338 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4339 // but it should be a vector, otherwise.
4340 auto *NewGEP =
4341 GEP->isInBounds()
4342 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4343 Indices)
4344 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4345 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4346 "NewGEP is not a pointer vector");
4347 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4348 addMetadata(NewGEP, GEP);
4349 }
4350 }
4351}
4352
4353void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4354 ElementCount VF) {
4355 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4356 PHINode *P = cast<PHINode>(PN);
4357 if (EnableVPlanNativePath) {
4358 // Currently we enter here in the VPlan-native path for non-induction
4359 // PHIs where all control flow is uniform. We simply widen these PHIs.
4360 // Create a vector phi with no operands - the vector phi operands will be
4361 // set at the end of vector code generation.
4362 Type *VecTy =
4363 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4364 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4365 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4366 OrigPHIsToFix.push_back(P);
4367
4368 return;
4369 }
4370
4371 assert(PN->getParent() == OrigLoop->getHeader() &&
4372 "Non-header phis should have been handled elsewhere");
4373
4374 // In order to support recurrences we need to be able to vectorize Phi nodes.
4375 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4376 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4377 // this value when we vectorize all of the instructions that use the PHI.
4378 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4379 for (unsigned Part = 0; Part < UF; ++Part) {
4380 // This is phase one of vectorizing PHIs.
4381 bool ScalarPHI =
4382 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4383 Type *VecTy =
4384 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4385 Value *EntryPart = PHINode::Create(
4386 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4387 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4388 }
4389 return;
4390 }
4391
4392 setDebugLocFromInst(Builder, P);
4393
4394 // This PHINode must be an induction variable.
4395 // Make sure that we know about it.
4396 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4397
4398 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4399 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4400
4401 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4402 // which can be found from the original scalar operations.
4403 switch (II.getKind()) {
4404 case InductionDescriptor::IK_NoInduction:
4405 llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4405)
;
4406 case InductionDescriptor::IK_IntInduction:
4407 case InductionDescriptor::IK_FpInduction:
4408 llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4408)
;
4409 case InductionDescriptor::IK_PtrInduction: {
4410 // Handle the pointer induction variable case.
4411 assert(P->getType()->isPointerTy() && "Unexpected type.");
4412
4413 if (Cost->isScalarAfterVectorization(P, VF)) {
4414 // This is the normalized GEP that starts counting at zero.
4415 Value *PtrInd =
4416 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4417 // Determine the number of scalars we need to generate for each unroll
4418 // iteration. If the instruction is uniform, we only need to generate the
4419 // first lane. Otherwise, we generate all VF values.
4420 unsigned Lanes =
4421 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4422 for (unsigned Part = 0; Part < UF; ++Part) {
4423 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4424 Constant *Idx = ConstantInt::get(PtrInd->getType(),
4425 Lane + Part * VF.getKnownMinValue());
4426 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4427 Value *SclrGep =
4428 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4429 SclrGep->setName("next.gep");
4430 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4431 }
4432 }
4433 return;
4434 }
4435 assert(isa<SCEVConstant>(II.getStep()) &&
4436 "Induction step not a SCEV constant!");
4437 Type *PhiType = II.getStep()->getType();
4438
4439 // Build a pointer phi
4440 Value *ScalarStartValue = II.getStartValue();
4441 Type *ScStValueType = ScalarStartValue->getType();
4442 PHINode *NewPointerPhi =
4443 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4444 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4445
4446 // A pointer induction, performed by using a gep
4447 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4448 Instruction *InductionLoc = LoopLatch->getTerminator();
4449 const SCEV *ScalarStep = II.getStep();
4450 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4451 Value *ScalarStepValue =
4452 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4453 Value *InductionGEP = GetElementPtrInst::Create(
4454 ScStValueType->getPointerElementType(), NewPointerPhi,
4455 Builder.CreateMul(
4456 ScalarStepValue,
4457 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4458 "ptr.ind", InductionLoc);
4459 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4460
4461 // Create UF many actual address geps that use the pointer
4462 // phi as base and a vectorized version of the step value
4463 // (<step*0, ..., step*N>) as offset.
4464 for (unsigned Part = 0; Part < UF; ++Part) {
4465 SmallVector<Constant *, 8> Indices;
4466 // Create a vector of consecutive numbers from zero to VF.
4467 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4468 Indices.push_back(
4469 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4470 Constant *StartOffset = ConstantVector::get(Indices);
4471
4472 Value *GEP = Builder.CreateGEP(
4473 ScStValueType->getPointerElementType(), NewPointerPhi,
4474 Builder.CreateMul(
4475 StartOffset,
4476 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4477 "vector.gep"));
4478 VectorLoopValueMap.setVectorValue(P, Part, GEP);
4479 }
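// Worked example (derived from the code above): with VF = 4, UF = 2 and a
// scalar step %step, Part 0 uses offsets <0, 1, 2, 3> * %step and Part 1 uses
// <4, 5, 6, 7> * %step, both added to the pointer phi by the GEP.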
4480 }
4481 }
4482}
4483
4484/// A helper function for checking whether an integer division-related
4485/// instruction may divide by zero (in which case it must be predicated if
4486/// executed conditionally in the scalar code).
4487/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4488/// Non-zero divisors that are non compile-time constants will not be
4489/// converted into multiplication, so we will still end up scalarizing
4490/// the division, but can do so w/o predication.
4491static bool mayDivideByZero(Instruction &I) {
4492 assert((I.getOpcode() == Instruction::UDiv ||
4493 I.getOpcode() == Instruction::SDiv ||
4494 I.getOpcode() == Instruction::URem ||
4495 I.getOpcode() == Instruction::SRem) &&
4496 "Unexpected instruction");
4497 Value *Divisor = I.getOperand(1);
4498 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4499 return !CInt || CInt->isZero();
4500}
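// Illustration of the check above: "udiv i32 %x, 7" has a non-zero constant
// divisor and returns false (no predication needed), while "udiv i32 %x, %n"
// has a non-constant divisor and returns true, so it must be predicated when
// executed conditionally in the scalar code.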
4501
4502void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4503 VPTransformState &State) {
4504 assert(!VF.isScalable() && "scalable vectors not yet supported.");
4505 switch (I.getOpcode()) {
4506 case Instruction::Call:
4507 case Instruction::Br:
4508 case Instruction::PHI:
4509 case Instruction::GetElementPtr:
4510 case Instruction::Select:
4511 llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4511)
;
4512 case Instruction::UDiv:
4513 case Instruction::SDiv:
4514 case Instruction::SRem:
4515 case Instruction::URem:
4516 case Instruction::Add:
4517 case Instruction::FAdd:
4518 case Instruction::Sub:
4519 case Instruction::FSub:
4520 case Instruction::FNeg:
4521 case Instruction::Mul:
4522 case Instruction::FMul:
4523 case Instruction::FDiv:
4524 case Instruction::FRem:
4525 case Instruction::Shl:
4526 case Instruction::LShr:
4527 case Instruction::AShr:
4528 case Instruction::And:
4529 case Instruction::Or:
4530 case Instruction::Xor: {
4531 // Just widen unops and binops.
4532 setDebugLocFromInst(Builder, &I);
4533
4534 for (unsigned Part = 0; Part < UF; ++Part) {
4535 SmallVector<Value *, 2> Ops;
4536 for (VPValue *VPOp : User.operands())
4537 Ops.push_back(State.get(VPOp, Part));
4538
4539 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4540
4541 if (auto *VecOp = dyn_cast<Instruction>(V))
4542 VecOp->copyIRFlags(&I);
4543
4544 // Use this vector value for all users of the original instruction.
4545 VectorLoopValueMap.setVectorValue(&I, Part, V);
4546 addMetadata(V, &I);
4547 }
4548
4549 break;
4550 }
4551 case Instruction::ICmp:
4552 case Instruction::FCmp: {
4553 // Widen compares. Generate vector compares.
4554 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4555 auto *Cmp = cast<CmpInst>(&I);
4556 setDebugLocFromInst(Builder, Cmp);
4557 for (unsigned Part = 0; Part < UF; ++Part) {
4558 Value *A = State.get(User.getOperand(0), Part);
4559 Value *B = State.get(User.getOperand(1), Part);
4560 Value *C = nullptr;
4561 if (FCmp) {
4562 // Propagate fast math flags.
4563 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4564 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4565 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4566 } else {
4567 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4568 }
4569 VectorLoopValueMap.setVectorValue(&I, Part, C);
4570 addMetadata(C, &I);
4571 }
4572
4573 break;
4574 }
4575
4576 case Instruction::ZExt:
4577 case Instruction::SExt:
4578 case Instruction::FPToUI:
4579 case Instruction::FPToSI:
4580 case Instruction::FPExt:
4581 case Instruction::PtrToInt:
4582 case Instruction::IntToPtr:
4583 case Instruction::SIToFP:
4584 case Instruction::UIToFP:
4585 case Instruction::Trunc:
4586 case Instruction::FPTrunc:
4587 case Instruction::BitCast: {
4588 auto *CI = cast<CastInst>(&I);
4589 setDebugLocFromInst(Builder, CI);
4590
4591 /// Vectorize casts.
4592 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4593 Type *DestTy =
4594 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4595
4596 for (unsigned Part = 0; Part < UF; ++Part) {
4597 Value *A = State.get(User.getOperand(0), Part);
4598 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4599 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4600 addMetadata(Cast, &I);
4601 }
4602 break;
4603 }
4604 default:
4605 // This instruction is not vectorized by simple widening.
4606 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4607 llvm_unreachable("Unhandled instruction!")::llvm::llvm_unreachable_internal("Unhandled instruction!", "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4607)
;
4608 } // end of switch.
4609}
4610
4611void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4612 VPTransformState &State) {
4613 assert(!isa<DbgInfoIntrinsic>(I) &&
4614 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4615 setDebugLocFromInst(Builder, &I);
4616
4617 Module *M = I.getParent()->getParent()->getParent();
4618 auto *CI = cast<CallInst>(&I);
4619
4620 SmallVector<Type *, 4> Tys;
4621 for (Value *ArgOperand : CI->arg_operands())
4622 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4623
4624 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4625
4626 // The flag shows whether we use an intrinsic or an ordinary call for the
4627 // vectorized version of the instruction, i.e. whether it is beneficial to
4628 // use the intrinsic call rather than a library call.
4629 bool NeedToScalarize = false;
4630 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4631 bool UseVectorIntrinsic =
4632 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4633 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4634 "Instruction should be scalarized elsewhere.");
4635
4636 for (unsigned Part = 0; Part < UF; ++Part) {
4637 SmallVector<Value *, 4> Args;
4638 for (auto &I : enumerate(ArgOperands.operands())) {
4639 // Some intrinsics have a scalar argument - don't replace it with a
4640 // vector.
4641 Value *Arg;
4642 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4643 Arg = State.get(I.value(), Part);
4644 else
4645 Arg = State.get(I.value(), {0, 0});
4646 Args.push_back(Arg);
4647 }
4648
4649 Function *VectorF;
4650 if (UseVectorIntrinsic) {
4651 // Use vector version of the intrinsic.
4652 Type *TysForDecl[] = {CI->getType()};
4653 if (VF.isVector()) {
4654 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4655 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4656 }
4657 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4658 assert(VectorF && "Can't retrieve vector intrinsic.");
4659 } else {
4660 // Use vector version of the function call.
4661 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4662#ifndef NDEBUG
4663 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4664 "Can't create vector function.");
4665#endif
4666 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4667 }
4668 SmallVector<OperandBundleDef, 1> OpBundles;
4669 CI->getOperandBundlesAsDefs(OpBundles);
4670 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4671
4672 if (isa<FPMathOperator>(V))
4673 V->copyFastMathFlags(CI);
4674
4675 VectorLoopValueMap.setVectorValue(&I, Part, V);
4676 addMetadata(V, &I);
4677 }
4678}
4679
4680void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4681 VPUser &Operands,
4682 bool InvariantCond,
4683 VPTransformState &State) {
4684 setDebugLocFromInst(Builder, &I);
4685
4686 // The condition can be loop invariant but still defined inside the
4687 // loop. This means that we can't just use the original 'cond' value.
4688 // We have to take the 'vectorized' value and pick the first lane.
4689 // Instcombine will make this a no-op.
4690 auto *InvarCond =
4691 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4692
4693 for (unsigned Part = 0; Part < UF; ++Part) {
4694 Value *Cond =
4695 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4696 Value *Op0 = State.get(Operands.getOperand(1), Part);
4697 Value *Op1 = State.get(Operands.getOperand(2), Part);
4698 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4699 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4700 addMetadata(Sel, &I);
4701 }
4702}
4703
4704void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4705 // We should not collect Scalars more than once per VF. Right now, this
4706 // function is called from collectUniformsAndScalars(), which already does
4707 // this check. Collecting Scalars for VF=1 does not make any sense.
4708 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4709 "This function should not be visited twice for the same VF");
4710
4711 SmallSetVector<Instruction *, 8> Worklist;
4712
4713 // These sets are used to seed the analysis with pointers used by memory
4714 // accesses that will remain scalar.
4715 SmallSetVector<Instruction *, 8> ScalarPtrs;
4716 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4717 auto *Latch = TheLoop->getLoopLatch();
4718
4719 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4720 // The pointer operands of loads and stores will be scalar as long as the
4721 // memory access is not a gather or scatter operation. The value operand of a
4722 // store will remain scalar if the store is scalarized.
4723 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4724 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4725 assert(WideningDecision != CM_Unknown &&
4726 "Widening decision should be ready at this moment");
4727 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4728 if (Ptr == Store->getValueOperand())
4729 return WideningDecision == CM_Scalarize;
4730 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4731 "Ptr is neither a value or pointer operand");
4732 return WideningDecision != CM_GatherScatter;
4733 };
4734
4735 // A helper that returns true if the given value is a bitcast or
4736 // getelementptr instruction contained in the loop.
4737 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4738 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4739 isa<GetElementPtrInst>(V)) &&
4740 !TheLoop->isLoopInvariant(V);
4741 };
4742
4743 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4744 if (!isa<PHINode>(Ptr) ||
4745 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4746 return false;
4747 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4748 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4749 return false;
4750 return isScalarUse(MemAccess, Ptr);
4751 };
4752
4753 // A helper that evaluates a memory access's use of a pointer. If the
4754   // pointer is actually the pointer induction of a loop, it is inserted
4755   // into Worklist. If the use will be a scalar use, and the
4756 // pointer is only used by memory accesses, we place the pointer in
4757 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4758 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4759 if (isScalarPtrInduction(MemAccess, Ptr)) {
4760 Worklist.insert(cast<Instruction>(Ptr));
4761 Instruction *Update = cast<Instruction>(
4762 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4763 Worklist.insert(Update);
4764       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4765                         << "\n");
4766       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4767                         << "\n");
4768 return;
4769 }
4770 // We only care about bitcast and getelementptr instructions contained in
4771 // the loop.
4772 if (!isLoopVaryingBitCastOrGEP(Ptr))
4773 return;
4774
4775 // If the pointer has already been identified as scalar (e.g., if it was
4776 // also identified as uniform), there's nothing to do.
4777 auto *I = cast<Instruction>(Ptr);
4778 if (Worklist.count(I))
4779 return;
4780
4781 // If the use of the pointer will be a scalar use, and all users of the
4782 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4783 // place the pointer in PossibleNonScalarPtrs.
4784 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4785 return isa<LoadInst>(U) || isa<StoreInst>(U);
4786 }))
4787 ScalarPtrs.insert(I);
4788 else
4789 PossibleNonScalarPtrs.insert(I);
4790 };
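  // For example (illustrative IR, not from any particular test):
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  //   %v   = load i32, i32* %gep            ; widened load, scalar use of %gep
  //   %cmp = icmp eq i32* %gep, %sentinel   ; non-memory user of %gep
  // Because %gep has a user that is neither a load nor a store, it is placed
  // in PossibleNonScalarPtrs rather than ScalarPtrs.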
4791
4792   // We seed the scalars analysis with three classes of instructions: (1)
4793   // instructions marked uniform-after-vectorization, (2) bitcast,
4794   // getelementptr and (pointer) phi instructions used by memory accesses
4795   // requiring a scalar use, and (3) instructions forced to remain scalar.
4796 //
4797 // (1) Add to the worklist all instructions that have been identified as
4798 // uniform-after-vectorization.
4799 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4800
4801 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4802 // memory accesses requiring a scalar use. The pointer operands of loads and
4803   // stores will be scalar as long as the memory access is not a gather or
4804 // scatter operation. The value operand of a store will remain scalar if the
4805 // store is scalarized.
4806 for (auto *BB : TheLoop->blocks())
4807 for (auto &I : *BB) {
4808 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4809 evaluatePtrUse(Load, Load->getPointerOperand());
4810 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4811 evaluatePtrUse(Store, Store->getPointerOperand());
4812 evaluatePtrUse(Store, Store->getValueOperand());
4813 }
4814 }
4815 for (auto *I : ScalarPtrs)
4816 if (!PossibleNonScalarPtrs.count(I)) {
4817       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4818 Worklist.insert(I);
4819 }
4820
4821 // Insert the forced scalars.
4822 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4823 // induction variable when the PHI user is scalarized.
4824 auto ForcedScalar = ForcedScalars.find(VF);
4825 if (ForcedScalar != ForcedScalars.end())
4826 for (auto *I : ForcedScalar->second)
4827 Worklist.insert(I);
4828
4829 // Expand the worklist by looking through any bitcasts and getelementptr
4830 // instructions we've already identified as scalar. This is similar to the
4831 // expansion step in collectLoopUniforms(); however, here we're only
4832 // expanding to include additional bitcasts and getelementptr instructions.
4833 unsigned Idx = 0;
4834 while (Idx != Worklist.size()) {
4835 Instruction *Dst = Worklist[Idx++];
4836 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4837 continue;
4838 auto *Src = cast<Instruction>(Dst->getOperand(0));
4839 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4840 auto *J = cast<Instruction>(U);
4841 return !TheLoop->contains(J) || Worklist.count(J) ||
4842 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4843 isScalarUse(J, Src));
4844 })) {
4845 Worklist.insert(Src);
4846       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4847 }
4848 }
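  // For example (illustrative): if the seeding step added a scalar bitcast
  //   %cast = bitcast i32* %gep to float*
  // whose only user is a widened load, this expansion step inspects the
  // bitcast's operand %gep; since %gep's only user (%cast) is already in the
  // Worklist, %gep is added as a scalar instruction as well.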
4849
4850 // An induction variable will remain scalar if all users of the induction
4851 // variable and induction variable update remain scalar.
4852 for (auto &Induction : Legal->getInductionVars()) {
4853 auto *Ind = Induction.first;
4854 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4855
4856 // If tail-folding is applied, the primary induction variable will be used
4857 // to feed a vector compare.
4858 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4859 continue;
4860
4861 // Determine if all users of the induction variable are scalar after
4862 // vectorization.
4863 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4864 auto *I = cast<Instruction>(U);
4865 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4866 });
4867 if (!ScalarInd)
4868 continue;
4869
4870 // Determine if all users of the induction variable update instruction are
4871 // scalar after vectorization.
4872 auto ScalarIndUpdate =
4873 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4874 auto *I = cast<Instruction>(U);
4875 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4876 });
4877 if (!ScalarIndUpdate)
4878 continue;
4879
4880 // The induction variable and its update instruction will remain scalar.
4881 Worklist.insert(Ind);
4882 Worklist.insert(IndUpdate);
4883     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4884     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4885                       << "\n");
4886 }
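  // For example (illustrative): for the canonical induction
  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add nuw nsw i64 %iv, 1
  // both instructions remain scalar if every other in-loop user of %iv and
  // %iv.next (e.g. an address computation already in the Worklist) is itself
  // scalar after vectorization, and %iv is not needed to feed a vector
  // compare for tail folding.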
4887
4888 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4889}
4890
4891bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4892 ElementCount VF) {
4893   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4894 if (!blockNeedsPredication(I->getParent()))
4895 return false;
4896 switch(I->getOpcode()) {
4897 default:
4898 break;
4899 case Instruction::Load:
4900 case Instruction::Store: {
4901 if (!Legal->isMaskRequired(I))
4902 return false;
4903 auto *Ptr = getLoadStorePointerOperand(I);
4904 auto *Ty = getMemInstValueType(I);
4905 // We have already decided how to vectorize this instruction, get that
4906 // result.
4907 if (VF.isVector()) {
4908 InstWidening WideningDecision = getWideningDecision(I, VF);
4909       assert(WideningDecision != CM_Unknown &&
4910              "Widening decision should be ready at this moment");
4911 return WideningDecision == CM_Scalarize;
4912 }
4913 const Align Alignment = getLoadStoreAlignment(I);
4914 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4915 isLegalMaskedGather(Ty, Alignment))
4916 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4917 isLegalMaskedScatter(Ty, Alignment));
4918 }
4919 case Instruction::UDiv:
4920 case Instruction::SDiv:
4921 case Instruction::SRem:
4922 case Instruction::URem:
4923 return mayDivideByZero(*I);
4924 }
4925 return false;
4926}
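// For example (illustrative): isScalarWithPredication() returns true for a
// udiv that sits in a block requiring predication and whose divisor may be
// zero, because executing it unconditionally for masked-off lanes could trap;
// such an instruction is scalarized and predicated rather than widened.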
4927
4928bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4929 Instruction *I, ElementCount VF) {
4930   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4931   assert(getWideningDecision(I, VF) == CM_Unknown &&
4932          "Decision should not be set yet.");
4933 auto *Group = getInterleavedAccessGroup(I);
4934   assert(Group && "Must have a group.");
4935
4936   // If the instruction's allocated size doesn't equal its type size, it
4937 // requires padding and will be scalarized.
4938 auto &DL = I->getModule()->getDataLayout();
4939 auto *ScalarTy = getMemInstValueType(I);
4940 if (hasIrregularType(ScalarTy, DL, VF))
4941 return false;
4942
4943 // Check if masking is required.
4944 // A Group may need masking for one of two reasons: it resides in a block that
4945 // needs predication, or it was decided to use masking to deal with gaps.
4946 bool PredicatedAccessRequiresMasking =
4947 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4948 bool AccessWithGapsRequiresMasking =
4949 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4950 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4951 return true;
4952
4953 // If masked interleaving is required, we expect that the user/target had
4954 // enabled it, because otherwise it either wouldn't have been created or
4955 // it should have been invalidated by the CostModel.
4956   assert(useMaskedInterleavedAccesses(TTI) &&
4957          "Masked interleave-groups for predicated accesses are not enabled.");
4958
4959 auto *Ty = getMemInstValueType(I);
4960 const Align Alignment = getLoadStoreAlignment(I);
4961 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4962 : TTI.isLegalMaskedStore(Ty, Alignment);
4963}
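// For example (illustrative): an interleave group of loads with factor 2 that
// only uses member 0 leaves a gap, so it requires either a scalar epilogue or
// a masked interleaved access; when a scalar epilogue is not allowed, the
// group can be widened only if the target reports the masked load as legal.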
4964
4965bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4966 Instruction *I, ElementCount VF) {
4967 // Get and ensure we have a valid memory instruction.
4968 LoadInst *LI = dyn_cast<LoadInst>(I);
4969 StoreInst *SI = dyn_cast<StoreInst>(I);
4970   assert((LI || SI) && "Invalid memory instruction");
4971
4972 auto *Ptr = getLoadStorePointerOperand(I);
4973
4974 // In order to be widened, the pointer should be consecutive, first of all.
4975 if (!Legal->isConsecutivePtr(Ptr))
4976 return false;
4977
4978 // If the instruction is a store located in a predicated block, it will be
4979 // scalarized.
4980 if (isScalarWithPredication(I))
4981 return false;
4982
4983   // If the instruction's allocated size doesn't equal its type size, it
4984 // requires padding and will be scalarized.
4985 auto &DL = I->getModule()->getDataLayout();
4986 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4987 if (hasIrregularType(ScalarTy, DL, VF))
4988 return false;
4989
4990 return true;
4991}
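// For example (illustrative): an i48 element typically has a store size of 6
// bytes but an alloc size of 8 bytes, so an array of i48 does not have the
// same layout as a <VF x i48> vector; hasIrregularType() flags this and the
// access is scalarized instead of being widened.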
4992
4993void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4994 // We should not collect Uniforms more than once per VF. Right now,
4995 // this function is called from collectUniformsAndScalars(), which
4996 // already does this check. Collecting Uniforms for VF=1 does not make any
4997 // sense.
4998
4999   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5000          "This function should not be visited twice for the same VF");
5001
5002   // Visit the list of Uniforms. Even if we find no uniform value, we will
5003   // not analyze again: Uniforms.count(VF) will return 1.
5004 Uniforms[VF].clear();
5005
5006 // We now know that the loop is vectorizable!
5007 // Collect instructions inside the loop that will remain uniform after
5008 // vectorization.
5009
5010 // Global values, params and instructions outside of current loop are out of
5011 // scope.
5012 auto isOutOfScope = [&](Value *V) -> bool {
5013 Instruction *I = dyn_cast<Instruction>(V);
5014 return (!I || !TheLoop->contains(I));
5015 };
5016
5017 SetVector<Instruction *> Worklist;
5018 BasicBlock *Latch = TheLoop->getLoopLatch();
5019
5020   // Instructions that are scalar with predication must not be considered
5021   // uniform after vectorization, because that would create an erroneous
5022   // replicating region in which only a single instance out of VF would be formed.
5023 // TODO: optimize such seldom cases if found important, see PR40816.
5024 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5025 if (isScalarWithPredication(I, VF)) {
5026       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5027                         << *I << "\n");
5028 return;
5029 }
5030     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5031 Worklist.insert(I);
5032 };
5033
5034 // Start with the conditional branch. If the branch condition is an
5035 // instruction contained in the loop that is only used by the branch, it is
5036 // uniform.
5037 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5038 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5039 addToWorklistIfAllowed(Cmp);
5040
5041 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5042 // are pointers that are treated like consecutive pointers during
5043 // vectorization. The pointer operands of interleaved accesses are an
5044 // example.
5045 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5046
5047 // Holds pointer operands of instructions that are possibly non-uniform.
5048 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5049
5050 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5051 InstWidening WideningDecision = getWideningDecision(I, VF);
5052     assert(WideningDecision != CM_Unknown &&
5053            "Widening decision should be ready at this moment");
5054
5055 return (WideningDecision == CM_Widen ||
5056 WideningDecision == CM_Widen_Reverse ||
5057 WideningDecision == CM_Interleave);
5058 };
5059 // Iterate over the instructions in the loop, and collect all
5060 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5061 // that a consecutive-like pointer operand will be scalarized, we collect it
5062 // in PossibleNonUniformPtrs instead. We use two sets here because a single
5063 // getelementptr instruction can be used by both vectorized and scalarized
5064 // memory instructions. For example, if a loop loads and stores from the same
5065 // location, but the store is conditional, the store will be scalarized, and
5066 // the getelementptr won't remain uniform.
5067 for (auto *BB : TheLoop->blocks())
5068 for (auto &I : *BB) {
5069 // If there's no pointer operand, there's nothing to do.
5070 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5071 if (!Ptr)
5072 continue;
5073
5074 // True if all users of Ptr are memory accesses that have Ptr as their
5075 // pointer operand.
5076 auto UsersAreMemAccesses =
5077 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5078 return getLoadStorePointerOperand(U) == Ptr;
5079 });
5080
5081 // Ensure the memory instruction will not be scalarized or used by
5082 // gather/scatter, making its pointer operand non-uniform. If the pointer
5083 // operand is used by any instruction other than a memory access, we
5084 // conservatively assume the pointer operand may be non-uniform.
5085 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5086 PossibleNonUniformPtrs.insert(Ptr);
5087
5088 // If the memory instruction will be vectorized and its pointer operand
5089       // is consecutive-like or part of an interleaved group, the pointer
5090       // operand should remain uniform.
5091 else
5092 ConsecutiveLikePtrs.insert(Ptr);
5093 }
5094
5095 // Add to the Worklist all consecutive and consecutive-like pointers that
5096 // aren't also identified as possibly non-uniform.
5097 for (auto *V : ConsecutiveLikePtrs)
5098 if (!PossibleNonUniformPtrs.count(V))
5099 addToWorklistIfAllowed(V);
5100
5101   // Expand Worklist in topological order: whenever a new instruction
5102   // is added, its users should already be inside Worklist. This ensures
5103   // that a uniform instruction will only be used by uniform instructions.
5104 unsigned idx = 0;
5105 while (idx != Worklist.size()) {
5106 Instruction *I = Worklist[idx++];
5107
5108 for (auto OV : I->operand_values()) {
5109 // isOutOfScope operands cannot be uniform instructions.
5110 if (isOutOfScope(OV))
5111 continue;
5112 // First order recurrence Phi's should typically be considered
5113 // non-uniform.
5114 auto *OP = dyn_cast<PHINode>(OV);
5115 if (OP && Legal->isFirstOrderRecurrence(OP))
5116 continue;
5117 // If all the users of the operand are uniform, then add the
5118 // operand into the uniform worklist.
5119 auto *OI = cast<Instruction>(OV);
5120 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5121 auto *J = cast<Instruction>(U);
5122 return Worklist.count(J) ||
5123 (OI == getLoadStorePointerOperand(J) &&
5124 isUniformDecision(J, VF));
5125 }))
5126 addToWorklistIfAllowed(OI);
5127 }
5128 }
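  // For example (illustrative): when a uniform getelementptr is taken from the
  // Worklist and its index operand is an in-loop add used only by that
  // getelementptr, the add is added to the Worklist too, since all of its
  // users are now known to be uniform.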
5129
5130 // Returns true if Ptr is the pointer operand of a memory access instruction
5131 // I, and I is known to not require scalarization.
5132 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5133 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5134 };
5135
5136 // For an instruction to be added into Worklist above, all its users inside
5137 // the loop should also be in Worklist. However, this condition cannot be
5138 // true for phi nodes that form a cyclic dependence. We must process phi
5139 // nodes separately. An induction variable will remain uniform if all users
5140 // of the induction variable and induction variable update remain uniform.
5141 // The code below handles both pointer and non-pointer induction variables.
5142 for (auto &Induction : Legal->getInductionVars()) {
5143 auto *Ind = Induction.first;
5144 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5145
5146 // Determine if all users of the induction variable are uniform after
5147 // vectorization.
5148 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5149 auto *I = cast<Instruction>(U);
5150 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5151 isVectorizedMemAccessUse(I, Ind);
5152 });
5153 if (!UniformInd)
5154 continue;
5155
5156 // Determine if all users of the induction variable update instruction are
5157 // uniform after vectorization.
5158 auto UniformIndUpdate =
5159 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5160 auto *I = cast<Instruction>(U);
5161 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5162 isVectorizedMemAccessUse(I, IndUpdate);
5163 });
5164 if (!UniformIndUpdate)
5165 continue;
5166
5167 // The induction variable and its update instruction will remain uniform.
5168 addToWorklistIfAllowed(Ind);
5169 addToWorklistIfAllowed(IndUpdate);
5170 }
5171
5172 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5173}
5174
5175bool LoopVectorizationCostModel::runtimeChecksRequired() {
5176   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5177
5178 if (Legal->getRuntimePointerChecking()->Need) {
5179 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5180 "runtime pointer checks needed. Enable vectorization of this "
5181 "loop with '#pragma clang loop vectorize(enable)' when "
5182 "compiling with -Os/-Oz",
5183 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5184 return true;
5185 }
5186
5187 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5188 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5189 "runtime SCEV checks needed. Enable vectorization of this "
5190 "loop with '#pragma clang loop vectorize(enable)' when "
5191 "compiling with -Os/-Oz",
5192 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5193 return true;
5194 }
5195
5196 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5197 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5198 reportVectorizationFailure("Runtime stride check for small trip count",
5199 "runtime stride == 1 checks needed. Enable vectorization of "
5200 "this loop without such check by compiling with -Os/-Oz",
5201 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5202 return true;
5203 }
5204
5205 return false;
5206}
5207
5208Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5209 unsigned UserIC) {
5210 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5211     // TODO: It may be useful to do this, since it's still likely to be
5212     // dynamically uniform if the target can skip.
5213 reportVectorizationFailure(
5214 "Not inserting runtime ptr check for divergent target",
5215 "runtime pointer checks needed. Not enabled for divergent target",
5216 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5217 return None;
5218 }
5219
5220 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5221   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5222 if (TC == 1) {
5223 reportVectorizationFailure("Single iteration (non) loop",
5224 "loop trip count is one, irrelevant for vectorization",
5225 "SingleIterationLoop", ORE, TheLoop);
5226 return None;
5227 }
5228
5229 switch (ScalarEpilogueStatus) {
5230 case CM_ScalarEpilogueAllowed:
5231 return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5232 case CM_ScalarEpilogueNotNeededUsePredicate:
5233     LLVM_DEBUG(
5234         dbgs() << "LV: vector predicate hint/switch found.\n"
5235                << "LV: Not allowing scalar epilogue, creating predicated "
5236                << "vector loop.\n");
5237 break;
5238 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5239 // fallthrough as a special case of OptForSize
5240 case CM_ScalarEpilogueNotAllowedOptSize:
5241 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5242       LLVM_DEBUG(
5243           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5244 else
5245       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5246                         << "count.\n");
5247
5248 // Bail if runtime checks are required, which are not good when optimising
5249 // for size.
5250 if (runtimeChecksRequired())
5251 return None;
5252 break;
5253 }
5254
5255 // Now try the tail folding
5256
5257 // Invalidate interleave groups that require an epilogue if we can't mask
5258 // the interleave-group.
5259 if (!useMaskedInterleavedAccesses(TTI)) {
5260     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5261            "No decisions should have been taken at this point");
5262     // Note: There is no need to invalidate any cost modeling decisions here,
5263     // as none were taken so far.
5264 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5265 }
5266
5267 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5268   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5269 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5270 if (TC > 0 && TC % MaxVFtimesIC == 0) {
5271 // Accept MaxVF if we do not have a tail.
5272     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5273 return MaxVF;
5274 }
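  // For example (illustrative): with TC = 128, MaxVF = 8 and UserIC = 4,
  // MaxVFtimesIC = 32 and 128 % 32 == 0, so no tail remains and MaxVF is
  // returned above; with TC = 100 the remainder is 4, and we instead try to
  // fold the tail by masking below.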
5275
5276 // If we don't know the precise trip count, or if the trip count that we
5277 // found modulo the vectorization factor is not zero, try to fold the tail
5278 // by masking.
5279 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5280 if (Legal->prepareToFoldTailByMasking()) {
5281 FoldTailByMasking = true;
5282 return MaxVF;
5283 }
5284
5285 // If there was a tail-folding hint/switch, but we can't fold the tail by
5286 // masking, fallback to a vectorization with a scalar epilogue.
5287 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5288 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5289       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5290 return None;
5291 }
5292     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5293                          "scalar epilogue instead.\n");
5294 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5295 return MaxVF;
5296 }
5297
5298 if (TC == 0) {
5299 reportVectorizationFailure(
5300 "Unable to calculate the loop count due to complex control flow",
5301 "unable to calculate the loop count due to complex control flow",
5302 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5303 return None;
5304 }
5305
5306 reportVectorizationFailure(
5307 "Cannot optimize for size and vectorize at the same time.",
5308 "cannot optimize for size and vectorize at the same time. "
5309 "Enable vectorization of this loop with '#pragma clang loop "
5310 "vectorize(enable)' when compiling with -Os/-Oz",
5311 "NoTailLoopWithOptForSize", ORE, TheLoop);
5312 return None;
5313}
5314
5315unsigned
5316LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5317 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5318 unsigned SmallestType, WidestType;
5319 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5320 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5321
5322 // Get the maximum safe dependence distance in bits computed by LAA.
5323 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5324   // the memory access that is most restrictive (involved in the smallest
5325 // dependence distance).
5326 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5327
5328 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5329
5330 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5331   // Note that both WidestRegister and WidestType may not be powers of 2.
5332 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
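  // For example (illustrative): with 256-bit vector registers and a widest
  // element type of 32 bits, MaxVectorSize = PowerOf2Floor(256 / 32) = 8; if
  // LAA limited MaxSafeRegisterWidth to 96 bits, this would instead be
  // PowerOf2Floor(96 / 32) = 2.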
5333
5334   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5335                     << " / " << WidestType << " bits.\n");
5336   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5337                     << WidestRegister << " bits.\n");
5338
5339   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5340                                  " into one vector!");
5341 if (MaxVectorSize == 0) {
5342     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5343 MaxVectorSize = 1;
5344 return MaxVectorSize;
5345 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5346 isPowerOf2_32(ConstTripCount)) {
5347 // We need to clamp the VF to be the ConstTripCount. There is no point in
5348 // choosing a higher viable VF as done in the loop below.
5349     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5350                       << ConstTripCount << "\n");
5351 MaxVectorSize = ConstTripCount;
5352 return MaxVectorSize;
5353 }
5354
5355 unsigned MaxVF = MaxVectorSize;
5356 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5357 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5358 // Collect all viable vectorization factors larger than the default MaxVF
5359 // (i.e. MaxVectorSize).
5360 SmallVector<ElementCount, 8> VFs;
5361 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5362 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5363 VFs.push_back(ElementCount::getFixed(VS));
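    // For example (illustrative): with WidestRegister = 256, SmallestType = 8
    // and MaxVectorSize = 8, NewMaxVectorSize = 32 and the candidate VFs are
    // 16 and 32; their register usage is computed next and the largest VF not
    // exceeding the available registers is selected.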
5364
5365 // For each VF calculate its register usage.
5366 auto RUs = calculateRegisterUsage(VFs);
5367
5368 // Select the largest VF which doesn't require more registers than existing
5369 // ones.
5370 for<